From 5ca03c299dc78f9fb08fc96c01ac0fa0dd800a15 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 10 Feb 2026 01:27:15 -0500 Subject: [PATCH 1/3] Major documentation rewrite and reorg. --- docs/api/index.rst | 12 - docs/conf.py | 3 +- .../aggregation-operators.rst | 226 ++++----- .../distance-operators.rst | 278 ++++++----- docs/{operators => dialect}/index.rst | 17 +- docs/{operators => dialect}/quantifiers.rst | 158 +++---- .../spatial-operators.rst | 102 ++-- .../syntax-reference.rst | 60 +-- docs/guides/engine.rst | 195 ++++++++ docs/guides/index.rst | 18 +- docs/guides/multi-backend.rst | 367 --------------- docs/guides/performance.rst | 297 ++++-------- docs/guides/quickstart.rst | 175 +++++++ docs/guides/schema-mapping.rst | 436 +++++------------- docs/guides/transpilation.rst | 417 ----------------- docs/index.rst | 151 ++---- docs/quickstart.rst | 228 --------- docs/recipes/advanced-queries.rst | 431 ++++++++--------- docs/recipes/bedtools-migration.rst | 398 +++++++--------- docs/recipes/clustering-queries.rst | 424 ++++++++--------- docs/recipes/distance-queries.rst | 396 ++++++++-------- docs/recipes/index.rst | 37 +- docs/recipes/intersect-queries.rst | 306 ++++++------ docs/transpilation/api-reference.rst | 13 + docs/transpilation/execution.rst | 152 ++++++ docs/transpilation/index.rst | 210 +++++++++ src/giql/__init__.py | 7 - 27 files changed, 2204 insertions(+), 3310 deletions(-) delete mode 100644 docs/api/index.rst rename docs/{operators => dialect}/aggregation-operators.rst (63%) rename docs/{operators => dialect}/distance-operators.rst (55%) rename docs/{operators => dialect}/index.rst (89%) rename docs/{operators => dialect}/quantifiers.rst (61%) rename docs/{operators => dialect}/spatial-operators.rst (75%) rename docs/{reference => dialect}/syntax-reference.rst (78%) create mode 100644 docs/guides/engine.rst delete mode 100644 docs/guides/multi-backend.rst create mode 100644 docs/guides/quickstart.rst delete mode 100644 
docs/guides/transpilation.rst delete mode 100644 docs/quickstart.rst create mode 100644 docs/transpilation/api-reference.rst create mode 100644 docs/transpilation/execution.rst create mode 100644 docs/transpilation/index.rst diff --git a/docs/api/index.rst b/docs/api/index.rst deleted file mode 100644 index a17dc9e..0000000 --- a/docs/api/index.rst +++ /dev/null @@ -1,12 +0,0 @@ -API Reference -============= - -This section documents the GIQL Python API. - -.. toctree:: - :maxdepth: 2 - -.. automodule:: giql - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/conf.py b/docs/conf.py index 1d38676..9a28ad8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -25,6 +25,7 @@ "sphinx.ext.viewcode", "sphinx.ext.intersphinx", "sphinx.ext.autosummary", + "sphinx_design", ] # Napoleon settings @@ -69,5 +70,5 @@ # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = "sphinx_rtd_theme" +html_theme = "sphinx_book_theme" # html_static_path = ['_static'] # Uncomment when you have custom static files diff --git a/docs/operators/aggregation-operators.rst b/docs/dialect/aggregation-operators.rst similarity index 63% rename from docs/operators/aggregation-operators.rst rename to docs/dialect/aggregation-operators.rst index 50d10da..cc3d5ec 100644 --- a/docs/operators/aggregation-operators.rst +++ b/docs/dialect/aggregation-operators.rst @@ -1,5 +1,5 @@ -Aggregation Operators -===================== +Aggregation +=========== Aggregation operators combine and cluster genomic intervals. These operators are essential for reducing complex interval data into summarized regions, such as @@ -7,7 +7,7 @@ merging overlapping peaks or identifying clusters of related features. .. contents:: :local: - :depth: 2 + :depth: 1 .. _cluster-operator: @@ -51,7 +51,7 @@ Parameters ~~~~~~~~~~ **interval** - A genomic column registered with the engine. 
+ A genomic column. **distance** *(optional)* Maximum gap between intervals to consider them part of the same cluster. @@ -73,91 +73,81 @@ Examples Assign cluster IDs to overlapping intervals: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval) AS cluster_id - FROM features - ORDER BY chromosome, start_pos - """) + SELECT + *, + CLUSTER(interval) AS cluster_id + FROM features + ORDER BY chrom, start **Distance-Based Clustering:** Cluster intervals within 1000bp of each other: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval, 1000) AS cluster_id - FROM features - ORDER BY chromosome, start_pos - """) + SELECT + *, + CLUSTER(interval, 1000) AS cluster_id + FROM features + ORDER BY chrom, start **Strand-Specific Clustering:** Cluster intervals separately by strand: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval, stranded=true) AS cluster_id - FROM features - ORDER BY chromosome, strand, start_pos - """) + SELECT + *, + CLUSTER(interval, stranded=true) AS cluster_id + FROM features + ORDER BY chrom, strand, start **Analyze Cluster Statistics:** Count features per cluster: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - WITH clustered AS ( - SELECT - *, - CLUSTER(interval) AS cluster_id - FROM features - ) + WITH clustered AS ( SELECT - chromosome, - cluster_id, - COUNT(*) AS feature_count, - MIN(start_pos) AS cluster_start, - MAX(end_pos) AS cluster_end - FROM clustered - GROUP BY chromosome, cluster_id - ORDER BY chromosome, cluster_start - """) + *, + CLUSTER(interval) AS cluster_id + FROM features + ) + SELECT + chrom, + cluster_id, + COUNT(*) AS feature_count, + MIN(start) AS cluster_start, + MAX(end) AS cluster_end + FROM clustered + GROUP BY chrom, cluster_id + ORDER BY chrom, cluster_start **Filter by Cluster Size:** Find regions with multiple overlapping features: -.. code-block:: python - - cursor = engine.execute(""" - WITH clustered AS ( - SELECT - *, - CLUSTER(interval) AS cluster_id - FROM features - ), - cluster_sizes AS ( - SELECT cluster_id, COUNT(*) AS size - FROM clustered - GROUP BY cluster_id - ) - SELECT c.* - FROM clustered c - INNER JOIN cluster_sizes s ON c.cluster_id = s.cluster_id - WHERE s.size >= 3 - """) +.. code-block:: sql + + WITH clustered AS ( + SELECT + *, + CLUSTER(interval) AS cluster_id + FROM features + ), + cluster_sizes AS ( + SELECT cluster_id, COUNT(*) AS size + FROM clustered + GROUP BY cluster_id + ) + SELECT c.* + FROM clustered c + INNER JOIN cluster_sizes s ON c.cluster_id = s.cluster_id + WHERE s.size >= 3 Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ @@ -239,7 +229,7 @@ Parameters ~~~~~~~~~~ **interval** - A genomic column registered with the engine. + A genomic column. **distance** *(optional)* Maximum gap between intervals to merge. 
Default: ``0`` (only overlapping @@ -253,9 +243,9 @@ Return Value Returns merged interval coordinates: -- ``chromosome`` - Chromosome of the merged region -- ``start_pos`` - Start position of the merged region -- ``end_pos`` - End position of the merged region +- ``chrom`` - Chromosome of the merged region +- ``start`` - Start position of the merged region +- ``end`` - End position of the merged region - ``strand`` - Strand (if ``stranded=true``) Examples @@ -265,108 +255,92 @@ Examples Merge all overlapping intervals: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval) - FROM features - """) + SELECT MERGE(interval) + FROM features - # Returns: chromosome, start_pos, end_pos for each merged region + -- Returns: chrom, start, end for each merged region **Distance-Based Merge:** Merge intervals within 1000bp of each other: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval, 1000) - FROM features - """) + SELECT MERGE(interval, 1000) + FROM features **Strand-Specific Merge:** Merge intervals separately by strand: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval, stranded=true) - FROM features - """) + SELECT MERGE(interval, stranded=true) + FROM features **Merge with Feature Count:** Count how many features were merged into each region: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - COUNT(*) AS feature_count - FROM features - """) + SELECT + MERGE(interval), + COUNT(*) AS feature_count + FROM features **Merge with Aggregations:** Calculate statistics for merged regions: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - COUNT(*) AS feature_count, - AVG(score) AS avg_score, - MAX(score) AS max_score - FROM features - """) + SELECT + MERGE(interval), + COUNT(*) AS feature_count, + AVG(score) AS avg_score, + MAX(score) AS max_score + FROM features **Collect Merged Feature Names:** List the names of features that were merged: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - STRING_AGG(name, ',') AS feature_names - FROM features - """) + SELECT + MERGE(interval), + STRING_AGG(name, ',') AS feature_names + FROM features **Merge by Chromosome:** Process each chromosome separately (explicit grouping): -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - chromosome, - MERGE(interval), - COUNT(*) AS feature_count - FROM features - GROUP BY chromosome - ORDER BY chromosome - """) + SELECT + chrom, + MERGE(interval), + COUNT(*) AS feature_count + FROM features + GROUP BY chrom + ORDER BY chrom **Calculate Total Coverage:** Calculate the total base pairs covered after merging: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - WITH merged AS ( - SELECT MERGE(interval) AS merged_pos - FROM features - ) - SELECT SUM(end_pos - start_pos) AS total_coverage - FROM merged - """) + WITH merged AS ( + SELECT MERGE(interval) AS merged_pos + FROM features + ) + SELECT SUM(end - start) AS total_coverage + FROM merged Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/operators/distance-operators.rst b/docs/dialect/distance-operators.rst similarity index 55% rename from docs/operators/distance-operators.rst rename to docs/dialect/distance-operators.rst index 7ceccf3..216bdcb 100644 --- a/docs/operators/distance-operators.rst +++ b/docs/dialect/distance-operators.rst @@ -1,5 +1,5 @@ -Distance and Proximity Operators -================================ +Distance and Proximity +====================== Distance and proximity operators calculate genomic distances and find nearest features. These operators are essential for proximity analysis, such as finding genes near @@ -7,7 +7,7 @@ regulatory elements or variants near transcription start sites. .. contents:: :local: - :depth: 2 + :depth: 1 .. _distance-operator: @@ -37,7 +37,7 @@ Parameters ~~~~~~~~~~ **interval_a** - A genomic column registered with the engine. + A genomic column. **interval_b** Another genomic column to measure distance to. @@ -56,52 +56,46 @@ Examples Calculate distance between peaks and genes: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - p.name AS peak, - g.name AS gene, - DISTANCE(p.interval, g.interval) AS distance - FROM peaks p - CROSS JOIN genes g - WHERE p.chromosome = g.chromosome - ORDER BY p.name, distance - """) + SELECT + p.name AS peak, + g.name AS gene, + DISTANCE(p.interval, g.interval) AS distance + FROM peaks p + CROSS JOIN genes g + WHERE p.chrom = g.chrom + ORDER BY p.name, distance **Filter by Distance:** Find features within 10kb of each other: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT a.name, b.name, DISTANCE(a.interval, b.interval) AS dist - FROM features_a a - CROSS JOIN features_b b - WHERE a.chromosome = b.chromosome - AND DISTANCE(a.interval, b.interval) <= 10000 - """) + SELECT a.name, b.name, DISTANCE(a.interval, b.interval) AS dist + FROM features_a a + CROSS JOIN features_b b + WHERE a.chrom = b.chrom + AND DISTANCE(a.interval, b.interval) <= 10000 **Identify Overlapping vs. Proximal:** Distinguish between overlapping and nearby features: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - p.name, - g.name, - CASE - WHEN DISTANCE(p.interval, g.interval) = 0 THEN 'overlapping' - WHEN DISTANCE(p.interval, g.interval) <= 1000 THEN 'proximal' - ELSE 'distant' - END AS relationship - FROM peaks p - CROSS JOIN genes g - WHERE p.chromosome = g.chromosome - """) +.. code-block:: sql + + SELECT + p.name, + g.name, + CASE + WHEN DISTANCE(p.interval, g.interval) = 0 THEN 'overlapping' + WHEN DISTANCE(p.interval, g.interval) <= 1000 THEN 'proximal' + ELSE 'distant' + END AS relationship + FROM peaks p + CROSS JOIN genes g + WHERE p.chrom = g.chrom Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ @@ -126,7 +120,7 @@ Backend Compatibility Performance Notes ~~~~~~~~~~~~~~~~~ -- Always include ``WHERE a.chromosome = b.chromosome`` to avoid unnecessary +- Always include ``WHERE a.chrom = b.chrom`` to avoid unnecessary cross-chromosome comparisons - For large datasets, consider pre-filtering by region before calculating distances - Create indexes on chromosome and position columns for better performance @@ -219,136 +213,124 @@ Examples Find the 3 nearest genes for each peak: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest - ORDER BY peaks.name, nearest.distance - """) + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest + ORDER BY peaks.name, nearest.distance **Standalone Query:** Find 5 nearest genes to a specific genomic location: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT gene_name, distance - FROM NEAREST(genes, reference='chr1:1000000-1001000', k=5) - ORDER BY distance - """) + SELECT gene_name, distance + FROM NEAREST(genes, reference='chr1:1000000-1001000', k=5) + ORDER BY distance **Distance-Constrained Search:** Find nearest features within 100kb: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=5, - max_distance=100000 - ) AS nearest - ORDER BY peaks.name, nearest.distance - """) +.. code-block:: sql + + SELECT + peaks.name, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=100000 + ) AS nearest + ORDER BY peaks.name, nearest.distance **Strand-Specific Nearest Neighbors:** Find nearest same-strand features: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name, - nearest.name AS gene, - nearest.strand, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=3, - stranded=true - ) AS nearest - ORDER BY peaks.name, nearest.distance - """) +.. 
code-block:: sql + + SELECT + peaks.name, + nearest.name AS gene, + nearest.strand, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=3, + stranded=true + ) AS nearest + ORDER BY peaks.name, nearest.distance **Directional (Upstream/Downstream) Queries:** Find upstream features using signed distances: -.. code-block:: python - - # Upstream features have negative distances - cursor = engine.execute(""" - SELECT - peaks.name, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=10, - signed=true - ) AS nearest - WHERE nearest.distance < 0 - ORDER BY peaks.name, nearest.distance DESC - """) - - # Downstream features have positive distances - cursor = engine.execute(""" - SELECT - peaks.name, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=10, - signed=true - ) AS nearest - WHERE nearest.distance > 0 - ORDER BY peaks.name, nearest.distance - """) +.. code-block:: sql + + -- Upstream features have negative distances + SELECT + peaks.name, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=10, + signed=true + ) AS nearest + WHERE nearest.distance < 0 + ORDER BY peaks.name, nearest.distance DESC + +.. code-block:: sql + + -- Downstream features have positive distances + SELECT + peaks.name, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=10, + signed=true + ) AS nearest + WHERE nearest.distance > 0 + ORDER BY peaks.name, nearest.distance **Combined Parameters:** Find nearby same-strand features within distance constraints: -.. 
code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=5, - max_distance=50000, - stranded=true, - signed=true - ) AS nearest - WHERE nearest.distance BETWEEN -10000 AND 10000 - ORDER BY peaks.name, ABS(nearest.distance) - """) +.. code-block:: sql + + SELECT + peaks.name, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=50000, + stranded=true, + signed=true + ) AS nearest + WHERE nearest.distance BETWEEN -10000 AND 10000 + ORDER BY peaks.name, ABS(nearest.distance) Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ @@ -376,15 +358,13 @@ Performance Notes - **Chromosome pre-filtering**: NEAREST automatically filters by chromosome for efficiency - **Use max_distance**: Specifying a maximum distance reduces the search space significantly - **Limit k**: Only request as many neighbors as you actually need -- **Create indexes**: Add indexes on ``(chromosome, start_pos, end_pos)`` for better performance +- **Create indexes**: Add indexes on ``(chrom, start, "end")`` for better performance -.. code-block:: python +.. code-block:: sql - # Create indexes for better NEAREST performance - engine.conn.execute(""" - CREATE INDEX idx_genes_position - ON genes (chromosome, start_pos, end_pos) - """) + -- Create indexes for better NEAREST performance + CREATE INDEX idx_genes_position + ON genes (chrom, start, "end") Related Operators ~~~~~~~~~~~~~~~~~ diff --git a/docs/operators/index.rst b/docs/dialect/index.rst similarity index 89% rename from docs/operators/index.rst rename to docs/dialect/index.rst index ce24f17..48e7bb2 100644 --- a/docs/operators/index.rst +++ b/docs/dialect/index.rst @@ -1,15 +1,13 @@ -GIQL Operators -============== +Operators +========= GIQL extends SQL with operators specifically designed for genomic interval queries. 
These operators enable powerful spatial reasoning over genomic coordinates without requiring complex SQL expressions. -Operators are organized by functionality: - -.. contents:: - :local: - :depth: 1 +Operators are organized by functionality. All operators work across supported +database backends (DuckDB, SQLite, with PostgreSQL planned). Each operator page +includes a compatibility table showing backend support status. Spatial Relationship Operators ------------------------------ @@ -98,11 +96,6 @@ Apply operators to multiple ranges simultaneously. See :doc:`quantifiers` for detailed documentation. -Operator Compatibility ----------------------- - -All operators work across supported database backends (DuckDB, SQLite, with PostgreSQL planned). -Each operator page includes a compatibility table showing backend support status. .. toctree:: :maxdepth: 2 diff --git a/docs/operators/quantifiers.rst b/docs/dialect/quantifiers.rst similarity index 61% rename from docs/operators/quantifiers.rst rename to docs/dialect/quantifiers.rst index cffb71d..b10a38b 100644 --- a/docs/operators/quantifiers.rst +++ b/docs/dialect/quantifiers.rst @@ -7,7 +7,7 @@ specified ranges in a single query. .. contents:: :local: - :depth: 2 + :depth: 1 .. _any-quantifier: @@ -47,7 +47,7 @@ Parameters ~~~~~~~~~~ **interval** - A genomic column registered with the engine. + A genomic column. **ranges** A comma-separated list of genomic range literals. @@ -65,60 +65,52 @@ Examples Find variants in any of several regions of interest: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS ANY( - 'chr1:1000-2000', - 'chr1:5000-6000', - 'chr2:1000-3000' - ) - """) + SELECT * FROM variants + WHERE interval INTERSECTS ANY( + 'chr1:1000-2000', + 'chr1:5000-6000', + 'chr2:1000-3000' + ) **Check Against Gene Promoters:** Find features overlapping any of a set of promoter regions: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT * FROM peaks - WHERE interval INTERSECTS ANY( - 'chr1:11869-12869', -- Gene A promoter - 'chr1:29554-30554', -- Gene B promoter - 'chr1:69091-70091' -- Gene C promoter - ) - """) + SELECT * FROM peaks + WHERE interval INTERSECTS ANY( + 'chr1:11869-12869', -- Gene A promoter + 'chr1:29554-30554', -- Gene B promoter + 'chr1:69091-70091' -- Gene C promoter + ) **Combine with Other Filters:** Filter by multiple regions and additional criteria: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000') - AND quality >= 30 - AND filter = 'PASS' - """) + SELECT * FROM variants + WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000') + AND quality >= 30 + AND filter = 'PASS' **Multi-Chromosome Query:** Query across different chromosomes efficiently: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM features - WHERE interval INTERSECTS ANY( - 'chr1:100000-200000', - 'chr2:100000-200000', - 'chr3:100000-200000', - 'chrX:100000-200000' - ) - """) + SELECT * FROM features + WHERE interval INTERSECTS ANY( + 'chr1:100000-200000', + 'chr2:100000-200000', + 'chr3:100000-200000', + 'chrX:100000-200000' + ) Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ @@ -190,7 +182,7 @@ Parameters ~~~~~~~~~~ **interval** - A genomic column registered with the engine. + A genomic column. **ranges** A comma-separated list of genomic range literals. @@ -208,49 +200,43 @@ Examples Find genes that contain all specified SNP positions: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT * FROM genes - WHERE interval CONTAINS ALL( - 'chr1:1500', - 'chr1:1600', - 'chr1:1700' - ) - """) + SELECT * FROM genes + WHERE interval CONTAINS ALL( + 'chr1:1500', + 'chr1:1600', + 'chr1:1700' + ) **Ensure Complete Coverage:** Find intervals that span a set of required positions: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM features - WHERE interval CONTAINS ALL( - 'chr1:10000', - 'chr1:15000', - 'chr1:20000' - ) - """) + SELECT * FROM features + WHERE interval CONTAINS ALL( + 'chr1:10000', + 'chr1:15000', + 'chr1:20000' + ) **Find Overlapping Regions:** Find features that overlap with all specified windows (useful for finding features in the intersection of multiple regions): -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM features - WHERE interval INTERSECTS ALL( - 'chr1:1000-2000', - 'chr1:1500-2500' - ) - """) + SELECT * FROM features + WHERE interval INTERSECTS ALL( + 'chr1:1000-2000', + 'chr1:1500-2500' + ) - # This finds features that overlap BOTH ranges - # (i.e., features in the intersection: chr1:1500-2000) + -- This finds features that overlap BOTH ranges + -- (i.e., features in the intersection: chr1:1500-2000) Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ @@ -285,22 +271,6 @@ Related - :ref:`ANY ` - Match any range (logical OR) - :ref:`CONTAINS ` - Base containment operator -Choosing Between ANY and ALL ----------------------------- - -Use **ANY** when you want to find features that match at least one of several criteria: - -.. code-block:: python - - # Find variants in gene A OR gene B OR gene C - WHERE interval INTERSECTS ANY('gene_a_region', 'gene_b_region', 'gene_c_region') - -Use **ALL** when you want to find features that satisfy all criteria simultaneously: - -.. 
code-block:: python - - # Find features that contain ALL of these positions - WHERE interval CONTAINS ALL('pos1', 'pos2', 'pos3') Common Patterns --------------- @@ -309,24 +279,20 @@ Common Patterns Find features that don't overlap any blacklisted region: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM peaks - WHERE NOT interval INTERSECTS ANY( - 'chr1:1000000-2000000', -- Centromere - 'chr1:5000000-5500000' -- Known artifact region - ) - """) + SELECT * FROM peaks + WHERE NOT interval INTERSECTS ANY( + 'chr1:1000000-2000000', -- Centromere + 'chr1:5000000-5500000' -- Known artifact region + ) **Combining ANY and ALL:** Complex queries can combine both quantifiers: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM features - WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') - AND interval CONTAINS ALL('chr1:1100', 'chr1:1200') - """) + SELECT * FROM features + WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') + AND interval CONTAINS ALL('chr1:1100', 'chr1:1200') diff --git a/docs/operators/spatial-operators.rst b/docs/dialect/spatial-operators.rst similarity index 75% rename from docs/operators/spatial-operators.rst rename to docs/dialect/spatial-operators.rst index 6b48001..fa1c7be 100644 --- a/docs/operators/spatial-operators.rst +++ b/docs/dialect/spatial-operators.rst @@ -1,5 +1,5 @@ -Spatial Relationship Operators -============================== +Spatial Relationships +===================== Spatial relationship operators test positional relationships between genomic ranges. These are the core operators for determining whether genomic intervals overlap, @@ -7,7 +7,7 @@ contain, or are contained within other intervals. .. contents:: :local: - :depth: 2 + :depth: 1 .. _intersects-operator: @@ -46,7 +46,7 @@ Parameters ~~~~~~~~~~ **interval** - A genomic column registered with the engine via ``register_table_schema()``. 
+ A genomic column from a registered table. **literal_range** A string literal specifying a genomic range in the format ``'chromosome:start-end'``. @@ -66,50 +66,42 @@ Examples Find all variants that overlap a specific genomic region: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' **Column-to-Column Joins:** Find variants that overlap with any gene: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT v.*, g.name AS gene_name - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - """) + SELECT v.*, g.name AS gene_name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval **With WHERE Clause:** Find overlapping features with additional filtering: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT v.*, g.name - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - WHERE v.quality >= 30 - AND g.biotype = 'protein_coding' - """) + SELECT v.*, g.name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE v.quality >= 30 + AND g.biotype = 'protein_coding' **Left Outer Join:** Find all variants, with gene information where available: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT v.*, g.name AS gene_name - FROM variants v - LEFT JOIN genes g ON v.interval INTERSECTS g.interval - """) + SELECT v.*, g.name AS gene_name + FROM variants v + LEFT JOIN genes g ON v.interval INTERSECTS g.interval Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ @@ -134,7 +126,7 @@ Backend Compatibility Performance Notes ~~~~~~~~~~~~~~~~~ -- Create indexes on ``(chromosome, start_pos, end_pos)`` for better join performance +- Create indexes on ``(chrom, start, "end")`` for better join performance - When joining large tables, consider filtering by chromosome first - The generated SQL uses efficient range comparison predicates @@ -183,7 +175,7 @@ Parameters ~~~~~~~~~~ **interval** - A genomic column registered with the engine. + A genomic column. **literal_range** A string literal specifying a genomic point or range. @@ -203,36 +195,30 @@ Examples Find genes that contain a specific position: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM genes - WHERE interval CONTAINS 'chr1:1500' - """) + SELECT * FROM genes + WHERE interval CONTAINS 'chr1:1500' **Range Containment:** Find large features that fully contain smaller features: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT g.name AS gene_name, e.name AS exon_name - FROM genes g - INNER JOIN exons e ON g.interval CONTAINS e.interval - """) + SELECT g.name AS gene_name, e.name AS exon_name + FROM genes g + INNER JOIN exons e ON g.interval CONTAINS e.interval **Filtering Fully Contained Variants:** Find variants that are completely within gene boundaries: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT v.* - FROM variants v - INNER JOIN genes g ON g.interval CONTAINS v.interval - """) + SELECT v.* + FROM variants v + INNER JOIN genes g ON g.interval CONTAINS v.interval Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ @@ -295,7 +281,7 @@ Parameters ~~~~~~~~~~ **interval** - A genomic column registered with the engine. + A genomic column. **literal_range** A string literal specifying the containing range. @@ -315,24 +301,20 @@ Examples Find all features within a specific genomic window: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM features - WHERE interval WITHIN 'chr1:1000000-2000000' - """) + SELECT * FROM features + WHERE interval WITHIN 'chr1:1000000-2000000' **Find Nested Features:** Find exons that are completely within their parent gene: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT e.*, g.name AS gene_name - FROM exons e - INNER JOIN genes g ON e.interval WITHIN g.interval - """) + SELECT e.*, g.name AS gene_name + FROM exons e + INNER JOIN genes g ON e.interval WITHIN g.interval Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/reference/syntax-reference.rst b/docs/dialect/syntax-reference.rst similarity index 78% rename from docs/reference/syntax-reference.rst rename to docs/dialect/syntax-reference.rst index 48cfb14..0082e26 100644 --- a/docs/reference/syntax-reference.rst +++ b/docs/dialect/syntax-reference.rst @@ -5,7 +5,7 @@ Quick reference for GIQL syntax and operators. .. 
contents:: :local: - :depth: 2 + :depth: 1 Genomic Range Literals ---------------------- @@ -238,7 +238,7 @@ Exclusion (NOT IN) SELECT a.* FROM table_a a LEFT JOIN table_b b ON a.interval INTERSECTS b.interval - WHERE b.chromosome IS NULL + WHERE b.chrom IS NULL Count Overlaps ~~~~~~~~~~~~~~ @@ -248,7 +248,7 @@ Count Overlaps SELECT a.*, COUNT(b.name) AS overlap_count FROM table_a a LEFT JOIN table_b b ON a.interval INTERSECTS b.interval - GROUP BY a.chromosome, a.start_pos, a.end_pos, ... + GROUP BY a.chrom, a.start, a."end", ... K-Nearest Neighbors ~~~~~~~~~~~~~~~~~~~ @@ -266,7 +266,7 @@ Clustering SELECT *, CLUSTER(interval) AS cluster_id FROM table - ORDER BY chromosome, start_pos + ORDER BY chrom, start Merging ~~~~~~~ @@ -275,55 +275,3 @@ Merging SELECT MERGE(interval), COUNT(*) AS count FROM table - -Engine Methods --------------- - -execute() -~~~~~~~~~ - -Execute a GIQL query and return a cursor. - -.. code-block:: python - - cursor = engine.execute("SELECT * FROM table WHERE interval INTERSECTS 'chr1:1000-2000'") - -transpile() -~~~~~~~~~~~ - -Convert GIQL to SQL without executing. - -.. code-block:: python - - sql = engine.transpile("SELECT * FROM table WHERE interval INTERSECTS 'chr1:1000-2000'") - -register_table_schema() -~~~~~~~~~~~~~~~~~~~~~~~ - -Register a table's schema for genomic operations. - -.. code-block:: python - - engine.register_table_schema( - "table_name", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - }, - genomic_column="interval", - chromosome_column="chromosome", # optional, default: "chromosome" - start_column="start_pos", # optional, default: "start_pos" - end_column="end_pos", # optional, default: "end_pos" - ) - -load_csv() -~~~~~~~~~~ - -Load a CSV file into a table. - -.. 
code-block:: python - - engine.load_csv("table_name", "file.csv") - engine.load_csv("table_name", "file.tsv", delimiter="\t") diff --git a/docs/guides/engine.rst b/docs/guides/engine.rst new file mode 100644 index 0000000..71269be --- /dev/null +++ b/docs/guides/engine.rst @@ -0,0 +1,195 @@ +Execution engines +================= + +GIQL transpiles genomic queries to SQL that can be executed on any database +backend. This guide covers backend-specific considerations and tips. + +.. contents:: + :local: + :depth: 1 + +Supported Backends +------------------ + +GIQL generates SQL that works across database systems: + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Backend + - Status + - Best For + * - DuckDB + - Full Support + - Analytics, large datasets, in-memory processing + * - SQLite + - Full Support + - Lightweight, embedded, portable databases + * - PostgreSQL + - Planned + - Production deployments, shared databases + +Using with DuckDB +----------------- + +DuckDB is recommended for most genomic analysis use cases. It provides excellent +performance for analytical queries and handles large genomic datasets efficiently. + +.. code-block:: python + + import duckdb + from giql import transpile + + sql = transpile( + """ + SELECT * FROM features + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["features"], + ) + + conn = duckdb.connect() + conn.execute("CREATE TABLE features AS SELECT * FROM read_csv('features.bed', delim='\t')") + result = conn.execute(sql).fetchdf() + +**Advantages:** + +- Fast analytical query performance +- Efficient columnar storage +- Good support for large datasets +- Rich SQL feature set +- In-memory and persistent options + +Using with SQLite +----------------- + +SQLite is a lightweight, embedded database suitable for smaller datasets or +when portability is important. + +.. 
code-block:: python + + import sqlite3 + from giql import transpile + + sql = transpile( + """ + SELECT * FROM features + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["features"], + ) + + conn = sqlite3.connect("data.db") + cursor = conn.execute(sql) + for row in cursor: + print(row) + +**Advantages:** + +- Zero configuration +- Single-file database +- Widely compatible +- Small memory footprint + +Writing Portable Queries +------------------------ + +Query Compatibility +~~~~~~~~~~~~~~~~~~~ + +GIQL queries are portable across backends. The same GIQL query produces SQL +that works on any supported database: + +.. code-block:: python + + from giql import transpile + + query = """ + SELECT a.*, b.name AS gene + FROM variants a + JOIN genes b ON a.interval INTERSECTS b.interval + WHERE a.quality >= 30 + """ + + # Same GIQL query works for any backend + sql = transpile(query, tables=["variants", "genes"]) + +Backend-Specific Features +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some SQL features may only be available on certain backends: + +.. list-table:: + :header-rows: 1 + :widths: 40 20 20 20 + + * - Feature + - DuckDB + - SQLite + - Notes + * - Window functions + - Yes + - Yes + - Full support + * - CTEs (WITH clause) + - Yes + - Yes + - Full support + * - LATERAL joins + - Yes + - Limited + - Used by NEAREST + * - STRING_AGG + - Yes + - GROUP_CONCAT + - Different function names + +Performance Comparison +---------------------- + +Backend Performance Characteristics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
list-table:: + :header-rows: 1 + :widths: 30 35 35 + + * - Operation + - DuckDB + - SQLite + * - Large table scans + - Excellent (columnar) + - Good + * - Complex joins + - Excellent + - Good + * - Aggregations + - Excellent + - Good + * - Small queries + - Good + - Excellent + * - Memory usage + - Higher + - Lower + * - Startup time + - Faster + - Fast + +Choosing the Right Backend +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Choose DuckDB when:** + +- Working with large datasets (millions of features) +- Running complex analytical queries +- Performing heavy aggregations +- Memory is not constrained + +**Choose SQLite when:** + +- Working with smaller datasets +- Need maximum portability +- Memory is constrained +- Simple query patterns diff --git a/docs/guides/index.rst b/docs/guides/index.rst index c3265be..b7644d1 100644 --- a/docs/guides/index.rst +++ b/docs/guides/index.rst @@ -6,27 +6,21 @@ and best practices for using GIQL effectively. .. toctree:: :maxdepth: 2 + :hidden: schema-mapping - multi-backend + engine performance - transpilation - -Guide Overview --------------- :doc:`schema-mapping` Learn how to configure GIQL to work with your genomic data, including - registering table schemas and mapping logical genomic columns. + table configuration and mapping logical genomic columns. -:doc:`multi-backend` - Understand GIQL's multi-database support and how to work with different - backends like DuckDB, SQLite, and PostgreSQL. +:doc:`engine` + Understand how to use GIQL's transpiled SQL with different + execution engines like DuckDB, SQLite, and PostgreSQL. :doc:`performance` Optimize your GIQL queries for better performance with indexing strategies, query patterns, and backend-specific tips. -:doc:`transpilation` - Understand how GIQL translates queries to SQL, debug query generation, - and integrate transpiled SQL with external tools. 
diff --git a/docs/guides/multi-backend.rst b/docs/guides/multi-backend.rst deleted file mode 100644 index ecc3799..0000000 --- a/docs/guides/multi-backend.rst +++ /dev/null @@ -1,367 +0,0 @@ -Multi-Backend Guide -=================== - -GIQL supports multiple database backends, allowing you to run the same genomic -queries against different database systems. This guide covers backend selection, -configuration, and backend-specific considerations. - -.. contents:: - :local: - :depth: 2 - -Supported Backends ------------------- - -GIQL currently supports the following database backends: - -.. list-table:: - :header-rows: 1 - :widths: 20 20 60 - - * - Backend - - Status - - Best For - * - DuckDB - - Full Support - - Analytics, large datasets, in-memory processing - * - SQLite - - Full Support - - Lightweight, embedded, portable databases - * - PostgreSQL - - Planned - - Production deployments, shared databases - -Selecting a Backend -------------------- - -DuckDB (Recommended) -~~~~~~~~~~~~~~~~~~~~ - -DuckDB is the recommended backend for most use cases. It provides excellent -performance for analytical queries and handles large genomic datasets efficiently. - -.. code-block:: python - - from giql import GIQLEngine - - # In-memory DuckDB (default) - with GIQLEngine(target_dialect="duckdb") as engine: - engine.load_csv("features", "features.bed") - # ... register schemas and query - - # Persistent DuckDB database - with GIQLEngine(target_dialect="duckdb", db_path="my_data.duckdb") as engine: - # Data persists between sessions - pass - -**Advantages:** - -- Fast analytical query performance -- Efficient columnar storage -- Good support for large datasets -- Rich SQL feature set -- In-memory and persistent options - -**Best for:** - -- Interactive analysis -- Large BED/VCF files -- Complex aggregations -- One-time analysis pipelines - -SQLite -~~~~~~ - -SQLite is a lightweight, embedded database suitable for smaller datasets or -when portability is important. - -.. 
code-block:: python - - # In-memory SQLite - with GIQLEngine(target_dialect="sqlite") as engine: - pass - - # Persistent SQLite database - with GIQLEngine(target_dialect="sqlite", db_path="my_data.db") as engine: - pass - -**Advantages:** - -- Zero configuration -- Single-file database -- Widely compatible -- Small memory footprint - -**Best for:** - -- Small to medium datasets -- Portable analysis -- Embedded applications -- Simple workflows - -Backend Configuration ---------------------- - -In-Memory vs Persistent -~~~~~~~~~~~~~~~~~~~~~~~ - -Both DuckDB and SQLite support in-memory and persistent modes: - -.. code-block:: python - - # In-memory (data lost when engine closes) - with GIQLEngine(target_dialect="duckdb") as engine: - engine.load_csv("features", "features.bed") - # Data exists only during this session - - # Persistent (data saved to disk) - with GIQLEngine(target_dialect="duckdb", db_path="analysis.duckdb") as engine: - engine.load_csv("features", "features.bed") - # Data persists after engine closes - - # Reopen persistent database - with GIQLEngine(target_dialect="duckdb", db_path="analysis.duckdb") as engine: - # Previous data is available - cursor = engine.execute("SELECT * FROM features LIMIT 5") - -Connection Options -~~~~~~~~~~~~~~~~~~ - -Pass additional connection options to the underlying database: - -.. code-block:: python - - # DuckDB with custom settings - with GIQLEngine( - target_dialect="duckdb", - db_path="analysis.duckdb", - read_only=False, - ) as engine: - pass - -Writing Portable Queries ------------------------- - -Query Compatibility -~~~~~~~~~~~~~~~~~~~ - -GIQL queries are portable across backends. The same query works on any -supported database: - -.. code-block:: python - - query = """ - SELECT a.*, b.name AS gene - FROM variants a - JOIN genes b ON a.interval INTERSECTS b.interval - WHERE a.quality >= 30 - """ - - # Works on DuckDB - with GIQLEngine(target_dialect="duckdb") as engine: - # ... setup ... 
- cursor = engine.execute(query) - - # Same query works on SQLite - with GIQLEngine(target_dialect="sqlite") as engine: - # ... setup ... - cursor = engine.execute(query) - -SQL Dialect Differences -~~~~~~~~~~~~~~~~~~~~~~~ - -While GIQL queries are portable, the generated SQL differs between backends. -Use ``transpile()`` to see the backend-specific SQL: - -.. code-block:: python - - query = "SELECT * FROM features WHERE interval INTERSECTS 'chr1:1000-2000'" - - # DuckDB SQL - with GIQLEngine(target_dialect="duckdb") as engine: - engine.register_table_schema("features", {...}, genomic_column="interval") - print(engine.transpile(query)) - - # SQLite SQL (may differ slightly) - with GIQLEngine(target_dialect="sqlite") as engine: - engine.register_table_schema("features", {...}, genomic_column="interval") - print(engine.transpile(query)) - -Backend-Specific Features -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Some SQL features may only be available on certain backends: - -.. list-table:: - :header-rows: 1 - :widths: 40 20 20 20 - - * - Feature - - DuckDB - - SQLite - - Notes - * - Window functions - - Yes - - Yes - - Full support - * - CTEs (WITH clause) - - Yes - - Yes - - Full support - * - LATERAL joins - - Yes - - Limited - - Used by NEAREST - * - STRING_AGG - - Yes - - GROUP_CONCAT - - Different function names - -Migrating Between Backends --------------------------- - -Exporting Data -~~~~~~~~~~~~~~ - -Export data from one backend for import into another: - -.. 
code-block:: python - - # Export from DuckDB - with GIQLEngine(target_dialect="duckdb", db_path="source.duckdb") as engine: - cursor = engine.execute("SELECT * FROM features") - import pandas as pd - df = pd.DataFrame(cursor.fetchall(), - columns=[desc[0] for desc in cursor.description]) - df.to_csv("features_export.csv", index=False) - - # Import to SQLite - with GIQLEngine(target_dialect="sqlite", db_path="target.db") as engine: - engine.load_csv("features", "features_export.csv") - engine.register_table_schema("features", {...}, genomic_column="interval") - -Schema Compatibility -~~~~~~~~~~~~~~~~~~~~ - -Ensure schema definitions work across backends: - -.. code-block:: python - - # Use portable type names - schema = { - "chromosome": "VARCHAR", # Works on all backends - "start_pos": "BIGINT", # Maps to appropriate integer type - "end_pos": "BIGINT", - "name": "VARCHAR", - "score": "FLOAT", # Maps to appropriate float type - } - - # Same schema works on both backends - for dialect in ["duckdb", "sqlite"]: - with GIQLEngine(target_dialect=dialect) as engine: - engine.load_csv("features", "features.csv") - engine.register_table_schema("features", schema, genomic_column="interval") - -Performance Comparison ----------------------- - -Backend Performance Characteristics -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
list-table:: - :header-rows: 1 - :widths: 30 35 35 - - * - Operation - - DuckDB - - SQLite - * - Large table scans - - Excellent (columnar) - - Good - * - Complex joins - - Excellent - - Good - * - Aggregations - - Excellent - - Good - * - Small queries - - Good - - Excellent - * - Memory usage - - Higher - - Lower - * - Startup time - - Faster - - Fast - -Choosing the Right Backend -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -**Choose DuckDB when:** - -- Working with large datasets (millions of features) -- Running complex analytical queries -- Performing heavy aggregations -- Memory is not constrained - -**Choose SQLite when:** - -- Working with smaller datasets -- Need maximum portability -- Memory is constrained -- Simple query patterns - -Using External Connections --------------------------- - -Connecting to Existing Databases -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Connect to databases created outside of GIQL: - -.. code-block:: python - - # Connect to existing DuckDB database - with GIQLEngine(target_dialect="duckdb", db_path="existing.duckdb") as engine: - # Register schemas for existing tables - engine.register_table_schema( - "my_existing_table", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - }, - genomic_column="interval", - ) - - # Query existing data with GIQL operators - cursor = engine.execute(""" - SELECT * FROM my_existing_table - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - -Using Transpiled SQL Externally -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Generate SQL for use with external database connections: - -.. 
code-block:: python - - import duckdb - - # Get transpiled SQL from GIQL - with GIQLEngine(target_dialect="duckdb") as engine: - engine.register_table_schema("features", {...}, genomic_column="interval") - sql = engine.transpile(""" - SELECT * FROM features - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - # Execute with external connection - conn = duckdb.connect("my_database.duckdb") - result = conn.execute(sql).fetchall() - conn.close() - -This is useful when integrating GIQL with existing database workflows or -when you need more control over the database connection. diff --git a/docs/guides/performance.rst b/docs/guides/performance.rst index c0c4e51..019416e 100644 --- a/docs/guides/performance.rst +++ b/docs/guides/performance.rst @@ -6,7 +6,7 @@ indexing, query patterns, and backend-specific optimizations. .. contents:: :local: - :depth: 2 + :depth: 1 Understanding Query Performance ------------------------------- @@ -14,11 +14,11 @@ Understanding Query Performance How GIQL Queries Execute ~~~~~~~~~~~~~~~~~~~~~~~~ -When you execute a GIQL query: +When you use GIQL: 1. GIQL parses the query and identifies genomic operators -2. Operators are expanded into standard SQL predicates -3. The SQL is sent to the database backend +2. Operators are expanded into SQL predicates +3. You execute the SQL on your database backend 4. The database executes the query using its optimizer Performance depends on both the generated SQL and how the database executes it. @@ -39,19 +39,11 @@ Creating Indexes Create indexes on genomic columns for faster queries: -.. code-block:: python - - # DuckDB - engine.conn.execute(""" - CREATE INDEX idx_features_position - ON features (chromosome, start_pos, end_pos) - """) +.. 
code-block:: sql - # SQLite - engine.conn.execute(""" - CREATE INDEX idx_features_position - ON features (chromosome, start_pos, end_pos) - """) + -- DuckDB or SQLite + CREATE INDEX idx_features_position + ON features (chrom, start, "end") Recommended Index Patterns ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -60,21 +52,21 @@ Recommended Index Patterns .. code-block:: sql - CREATE INDEX idx_table_position ON table_name (chromosome, start_pos, end_pos) + CREATE INDEX idx_table_position ON table_name (chrom, start, "end") **For join queries:** .. code-block:: sql -- Index both tables involved in joins - CREATE INDEX idx_variants_position ON variants (chromosome, start_pos, end_pos) - CREATE INDEX idx_genes_position ON genes (chromosome, start_pos, end_pos) + CREATE INDEX idx_variants_position ON variants (chrom, start, "end") + CREATE INDEX idx_genes_position ON genes (chrom, start, "end") **For strand-specific queries:** .. code-block:: sql - CREATE INDEX idx_features_strand ON features (chromosome, strand, start_pos, end_pos) + CREATE INDEX idx_features_strand ON features (chrom, strand, start, "end") When to Create Indexes ~~~~~~~~~~~~~~~~~~~~~~ @@ -100,88 +92,55 @@ Pre-filter by Chromosome Always include chromosome filtering when joining tables: -.. code-block:: python +.. 
code-block:: sql - # Good: Explicit chromosome filter - cursor = engine.execute(""" - SELECT a.*, b.name - FROM features_a a - JOIN features_b b ON a.interval INTERSECTS b.interval - WHERE a.chromosome = 'chr1' - """) - - # Also good: Cross-chromosome join with implicit filtering - # GIQL handles this, but explicit is clearer - cursor = engine.execute(""" - SELECT a.*, b.name - FROM features_a a - JOIN features_b b ON a.interval INTERSECTS b.interval - AND a.chromosome = b.chromosome - """) + -- Good: Explicit chromosome filter + SELECT a.*, b.name + FROM features_a a + JOIN features_b b ON a.interval INTERSECTS b.interval + WHERE a.chrom = 'chr1' Use Selective Filters Early ~~~~~~~~~~~~~~~~~~~~~~~~~~~ Apply selective filters before joins: -.. code-block:: python +.. code-block:: sql - # Good: Filter before joining - cursor = engine.execute(""" - WITH filtered_variants AS ( - SELECT * FROM variants - WHERE quality >= 30 AND filter = 'PASS' - ) - SELECT f.*, g.name - FROM filtered_variants f - JOIN genes g ON f.interval INTERSECTS g.interval - """) - - # Less efficient: Filter after joining - cursor = engine.execute(""" - SELECT v.*, g.name - FROM variants v - JOIN genes g ON v.interval INTERSECTS g.interval - WHERE v.quality >= 30 AND v.filter = 'PASS' - """) + -- Good: Filter before joining + WITH filtered_variants AS ( + SELECT * FROM variants + WHERE quality >= 30 AND filter = 'PASS' + ) + SELECT f.*, g.name + FROM filtered_variants f + JOIN genes g ON f.interval INTERSECTS g.interval Limit Result Sets ~~~~~~~~~~~~~~~~~ Use LIMIT for exploratory queries: -.. code-block:: python +.. code-block:: sql - # Good: Limit results during exploration - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000000-2000000' - LIMIT 100 - """) + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000000-2000000' + LIMIT 100 Use DISTINCT Wisely ~~~~~~~~~~~~~~~~~~~ DISTINCT can be expensive. Only use when necessary: -.. 
code-block:: python +.. code-block:: sql - # Only use DISTINCT when you actually need unique rows - cursor = engine.execute(""" - SELECT DISTINCT a.* - FROM features_a a - JOIN features_b b ON a.interval INTERSECTS b.interval - """) - - # If you just need to check existence, use EXISTS instead - cursor = engine.execute(""" - SELECT a.* - FROM features_a a - WHERE EXISTS ( - SELECT 1 FROM features_b b - WHERE a.interval INTERSECTS b.interval - ) - """) + -- If you just need to check existence, use EXISTS instead + SELECT a.* + FROM features_a a + WHERE EXISTS ( + SELECT 1 FROM features_b b + WHERE a.interval INTERSECTS b.interval + ) NEAREST Query Optimization -------------------------- @@ -193,35 +152,32 @@ The NEAREST operator can be expensive for large datasets. Optimize with: **1. Use max_distance to limit search space:** -.. code-block:: python +.. code-block:: sql - # Good: Constrained search - cursor = engine.execute(""" - SELECT peaks.name, nearest.name, nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=5, - max_distance=100000 -- Only search within 100kb - ) AS nearest - """) + SELECT peaks.name, nearest.name, nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=100000 -- Only search within 100kb + ) AS nearest **2. Request only the k you need:** -.. code-block:: python +.. code-block:: sql - # Good: Request exactly what you need + -- Good: Request exactly what you need NEAREST(genes, reference=peaks.interval, k=3) - # Wasteful: Request more than needed + -- Wasteful: Request more than needed NEAREST(genes, reference=peaks.interval, k=100) **3. Index the target table:** .. 
code-block:: sql - CREATE INDEX idx_genes_position ON genes (chromosome, start_pos, end_pos) + CREATE INDEX idx_genes_position ON genes (chrom, start, "end") Merge and Cluster Optimization ------------------------------ @@ -231,34 +187,28 @@ Efficient Clustering For large datasets, consider pre-sorting: -.. code-block:: python +.. code-block:: sql - # Pre-sort data for clustering - cursor = engine.execute(""" - WITH sorted AS ( - SELECT * FROM features - ORDER BY chromosome, start_pos - ) - SELECT *, CLUSTER(interval) AS cluster_id - FROM sorted - """) + WITH sorted AS ( + SELECT * FROM features + ORDER BY chrom, start + ) + SELECT *, CLUSTER(interval) AS cluster_id + FROM sorted Efficient Merging ~~~~~~~~~~~~~~~~~ Filter before merging to reduce data volume: -.. code-block:: python +.. code-block:: sql - # Good: Filter first, then merge - cursor = engine.execute(""" - WITH filtered AS ( - SELECT * FROM features - WHERE score >= 10 - ) - SELECT MERGE(interval), COUNT(*) AS count - FROM filtered - """) + WITH filtered AS ( + SELECT * FROM features + WHERE score >= 10 + ) + SELECT MERGE(interval), COUNT(*) AS count + FROM filtered Analyzing Query Performance --------------------------- @@ -266,43 +216,24 @@ Analyzing Query Performance Using EXPLAIN ~~~~~~~~~~~~~ -Analyze query execution plans: +Analyze query execution plans by running EXPLAIN on the transpiled SQL: .. 
code-block:: python - # Get the transpiled SQL - sql = engine.transpile(""" + from giql import transpile + + sql = transpile( + """ SELECT a.*, b.name FROM variants a JOIN genes b ON a.interval INTERSECTS b.interval - """) - - # Analyze the execution plan - cursor = engine.execute(f"EXPLAIN {sql}") - for row in cursor: - print(row) + """, + tables=["variants", "genes"], + ) + # Run EXPLAIN on your database connection + # conn.execute(f"EXPLAIN {sql}") # DuckDB also supports EXPLAIN ANALYZE for actual timing - cursor = engine.execute(f"EXPLAIN ANALYZE {sql}") - -Timing Queries -~~~~~~~~~~~~~~ - -Measure query execution time: - -.. code-block:: python - - import time - - start = time.time() - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000000-2000000' - """) - results = cursor.fetchall() - elapsed = time.time() - start - - print(f"Query returned {len(results)} rows in {elapsed:.2f} seconds") Backend-Specific Tips --------------------- @@ -314,21 +245,12 @@ DuckDB Optimizations DuckDB is columnar, so queries that select few columns are faster: -.. code-block:: python - - # Faster: Select only needed columns - cursor = engine.execute(""" - SELECT chromosome, start_pos, end_pos, name - FROM features - WHERE interval INTERSECTS 'chr1:1000-2000' - """) +.. code-block:: sql - # Slower: Select all columns - cursor = engine.execute(""" - SELECT * - FROM features - WHERE interval INTERSECTS 'chr1:1000-2000' - """) + -- Faster: Select only needed columns + SELECT chrom, start, "end", name + FROM features + WHERE interval INTERSECTS 'chr1:1000-2000' **Parallel execution:** @@ -344,50 +266,13 @@ SQLite Optimizations -- Include commonly selected columns in the index CREATE INDEX idx_features_covering - ON features (chromosome, start_pos, end_pos, name, score) + ON features (chrom, start, "end", name, score) **Analyze tables:** -.. 
code-block:: python - - # Help SQLite's query planner - engine.conn.execute("ANALYZE features") - -Memory Management ------------------ - -Streaming Results -~~~~~~~~~~~~~~~~~ - -For large result sets, iterate instead of fetching all: - -.. code-block:: python - - # Good: Stream results - cursor = engine.execute("SELECT * FROM large_table") - for row in cursor: - process(row) - - # Memory-intensive: Fetch all at once - cursor = engine.execute("SELECT * FROM large_table") - all_rows = cursor.fetchall() # Loads everything into memory - -Batch Processing -~~~~~~~~~~~~~~~~ - -Process large datasets in batches: - -.. code-block:: python - - chromosomes = ['chr1', 'chr2', 'chr3', ...] # All chromosomes +.. code-block:: sql - for chrom in chromosomes: - cursor = engine.execute(f""" - SELECT * FROM features - WHERE chromosome = '{chrom}' - AND interval INTERSECTS '{chrom}:1-1000000' - """) - process_chromosome(cursor) + ANALYZE features Performance Checklist --------------------- @@ -396,13 +281,13 @@ Before running large queries, check: .. 
code-block:: text - □ Indexes created on genomic columns - □ Chromosome filtering included in joins - □ Selective filters applied early - □ LIMIT used for exploration - □ Only necessary columns selected - □ NEAREST queries use max_distance - □ Results streamed instead of fetched all at once + - Indexes created on genomic columns + - Chromosome filtering included in joins + - Selective filters applied early + - LIMIT used for exploration + - Only necessary columns selected + - NEAREST queries use max_distance + - Results streamed instead of fetched all at once Quick Wins ~~~~~~~~~~ diff --git a/docs/guides/quickstart.rst b/docs/guides/quickstart.rst new file mode 100644 index 0000000..ef7c3ae --- /dev/null +++ b/docs/guides/quickstart.rst @@ -0,0 +1,175 @@ +Quick Start +=========== + +GIQL provides a familiar SQL syntax for bioinformatics workflows, allowing +you to express complex genomic range operations without writing intricate +SQL expressions. GIQL queries read naturally, making your analysis code +easier to review and share. GIQL operators follow established conventions +around genomic spatial relationships, so the semantics are familiar and +predictable. + +- **Spatial operators**: INTERSECTS, CONTAINS, WITHIN for range relationships +- **Distance operators**: DISTANCE, NEAREST for proximity queries +- **Aggregation operators**: CLUSTER, MERGE for combining intervals +- **Set quantifiers**: ANY, ALL for multi-range queries +- **Range parsing**: Understands genomic range strings and coordinate systems +- **Transpilation**: Converts GIQL to standard SQL-92 compatible output for execution on any backend + +Installation +------------ + +Install GIQL using pip: + +.. code-block:: bash + + pip install giql + +Basic Usage +----------- + +Table Configuration +~~~~~~~~~~~~~~~~~~~ + +GIQL works with genomic data stored in tables with separate columns for chromosome, +start position, and end position. 
The default column names are: + +* **chrom**: Chromosome identifier (e.g., 'chr1', 'chr2', 'chrX') +* **start**: Start position of the genomic interval (0-based, inclusive) +* **end**: End position of the genomic interval (0-based, exclusive, half-open) +* **strand** (optional): Strand orientation ('+', '-', or '.') + +If your table uses the default column names, you can pass just the table name +as a string. For custom column names, use a ``Table`` object: + +.. code-block:: python + + from giql import Table, transpile + + # Default column names (chrom, start, end, strand) + sql = transpile(query, tables=["peaks"]) + + # Custom column names + sql = transpile( + query, + tables=[ + Table( + "variants", + genomic_col="interval", + chrom_col="chromosome", + start_col="start_pos", + end_col="end_pos", + ) + ], + ) + +After configuration, you can use the genomic pseudo-column (default: ``interval``) +in your GIQL queries, and the transpiler will automatically expand it to the +physical column comparisons. + +Query with DuckDB +~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import duckdb + from giql import transpile + + sql = transpile( + """ + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["variants"], + ) + + conn = duckdb.connect() + conn.execute("CREATE TABLE variants AS SELECT * FROM read_csv('variants.csv')") + df = conn.execute(sql).fetchdf() + +Query with SQLite +~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import sqlite3 + from giql import transpile + + sql = transpile( + """ + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["variants"], + ) + + conn = sqlite3.connect("data.db") + cursor = conn.execute(sql) + for row in cursor: + print(row) + +Spatial Operators +----------------- + +INTERSECTS +~~~~~~~~~~ + +Check if genomic ranges overlap: + +.. 
code-block:: sql + + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + +CONTAINS +~~~~~~~~ + +Check if a range contains a point or another range: + +.. code-block:: sql + + SELECT * FROM variants + WHERE interval CONTAINS 'chr1:1500' + +WITHIN +~~~~~~ + +Check if a range is within another range: + +.. code-block:: sql + + SELECT * FROM variants + WHERE interval WITHIN 'chr1:1000-5000' + +Set Quantifiers +--------------- + +ANY +~~~ + +Match any of the specified ranges: + +.. code-block:: sql + + SELECT * FROM variants + WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') + +ALL +~~~ + +Match all of the specified ranges: + +.. code-block:: sql + + SELECT * FROM variants + WHERE interval CONTAINS ALL('chr1:1500', 'chr1:1600') + +Column-to-Column Joins +---------------------- + +Join tables on genomic position: + +.. code-block:: sql + + SELECT v.*, g.name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval diff --git a/docs/guides/schema-mapping.rst b/docs/guides/schema-mapping.rst index f515695..43c580c 100644 --- a/docs/guides/schema-mapping.rst +++ b/docs/guides/schema-mapping.rst @@ -1,20 +1,20 @@ -Schema Mapping Guide -==================== +Schema Mapping +============== This guide explains how to configure GIQL to work with your genomic data by -registering table schemas and mapping logical genomic columns. +defining table configurations that map logical genomic columns to physical columns. .. contents:: :local: - :depth: 2 + :depth: 1 Understanding Schema Mapping ---------------------------- GIQL needs to know how your genomic data is structured in order to translate -genomic operators into SQL. This is done through schema registration, which -maps a logical "genomic column" (used in your queries) to the physical columns -in your database tables. +genomic operators into SQL. 
This is done through ``Table`` objects, which +map a logical "genomic column" (used in your queries) to the physical columns +in your files, data frames, or database tables. The Core Concept ~~~~~~~~~~~~~~~~ @@ -30,188 +30,126 @@ Behind the scenes, GIQL expands this to actual column comparisons: .. code-block:: sql SELECT * FROM variants - WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000 + WHERE "chrom" = 'chr1' AND "start" < 2000 AND "end" > 1000 -Schema registration tells GIQL which physical columns (``chromosome``, -``start_pos``, ``end_pos``) correspond to the logical ``interval`` column. +The ``Table`` configuration tells GIQL which physical columns (``chrom``, +``start``, ``end``) correspond to the logical ``interval`` column. -Registering Table Schemas -------------------------- +Configuring Tables +------------------ -Basic Registration -~~~~~~~~~~~~~~~~~~ +Basic Configuration +~~~~~~~~~~~~~~~~~~~ -Register a table schema using ``register_table_schema()``: +For tables that use the default column names (``chrom``, ``start``, ``end``, +``strand``), pass the table name as a string: .. 
code-block:: python - from giql import GIQLEngine - - with GIQLEngine(target_dialect="duckdb") as engine: - # Load data - engine.load_csv("variants", "variants.csv") - - # Register schema - engine.register_table_schema( - "variants", # Table name - { - "id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - "quality": "FLOAT", - }, - genomic_column="interval", # Logical column name for queries - ) - - # Now you can use 'interval' in queries - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - -Required Columns -~~~~~~~~~~~~~~~~ - -For schema registration, your table must have columns that map to: + from giql import transpile -- **chromosome**: The chromosome/contig identifier (e.g., 'chr1', 'chrX') -- **start_pos**: The start position of the genomic interval (0-based, inclusive) -- **end_pos**: The end position of the genomic interval (0-based, exclusive) - -GIQL looks for these column names by default. If your columns have different -names, see :ref:`custom-column-names`. - -Optional Strand Column -~~~~~~~~~~~~~~~~~~~~~~ + sql = transpile( + """ + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["variants"], + ) -If your data includes strand information, include it in the schema: +Default Columns +~~~~~~~~~~~~~~~ -.. code-block:: python +GIQL uses these default column names: - engine.register_table_schema( - "features", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "strand": "VARCHAR", # '+', '-', or '.' 
- "name": "VARCHAR", - }, - genomic_column="interval", - ) +- **chrom**: The chromosome/contig identifier (e.g., 'chr1', 'chrX') +- **start**: The start position of the genomic interval (0-based, inclusive) +- **end**: The end position of the genomic interval (0-based, exclusive) +- **strand**: Strand orientation ('+', '-', or '.'), optional -The strand column enables strand-specific operations in operators like -CLUSTER and NEAREST. +The default genomic pseudo-column name is ``interval``. .. _custom-column-names: Custom Column Names ~~~~~~~~~~~~~~~~~~~ -If your table uses different column names for genomic coordinates, specify -the mapping explicitly: +If your table uses different column names, create a ``Table`` object with +the mapping: .. code-block:: python - engine.register_table_schema( - "my_table", - { - "chrom": "VARCHAR", # Your chromosome column - "chromStart": "BIGINT", # Your start column (UCSC-style) - "chromEnd": "BIGINT", # Your end column - "name": "VARCHAR", - }, - genomic_column="interval", - chromosome_column="chrom", # Map to your column name - start_column="chromStart", # Map to your column name - end_column="chromEnd", # Map to your column name + from giql import Table, transpile + + sql = transpile( + """ + SELECT * FROM my_table + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=[ + Table( + "my_table", + chrom_col="chrom", # Your chromosome column + start_col="chromStart", # Your start column (UCSC-style) + end_col="chromEnd", # Your end column + ) + ], ) Multiple Tables --------------- -Register Multiple Tables -~~~~~~~~~~~~~~~~~~~~~~~~ +Configuring Multiple Tables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Register all tables that will participate in genomic queries: +Pass all tables that participate in genomic queries: .. 
code-block:: python - with GIQLEngine(target_dialect="duckdb") as engine: - # Load data files - engine.load_csv("variants", "variants.bed") - engine.load_csv("genes", "genes.bed") - engine.load_csv("regulatory", "regulatory.bed") - - # Define common schema - bed_schema = { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - "score": "FLOAT", - "strand": "VARCHAR", - } - - # Register each table - for table in ["variants", "genes", "regulatory"]: - engine.register_table_schema( - table, - bed_schema, - genomic_column="interval", - ) + from giql import transpile - # Now you can join tables using genomic operators - cursor = engine.execute(""" - SELECT v.*, g.name AS gene_name - FROM variants v - JOIN genes g ON v.interval INTERSECTS g.interval - """) + # Tables with default column names + sql = transpile( + """ + SELECT v.*, g.name AS gene_name + FROM variants v + JOIN genes g ON v.interval INTERSECTS g.interval + """, + tables=["variants", "genes"], + ) Different Schemas Per Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Tables can have different schemas and even different genomic column names: +Tables can have different column names and even different genomic column names. +Mix strings (for default columns) with ``Table`` objects (for custom columns): .. 
code-block:: python - # Variants table with VCF-style columns - engine.register_table_schema( - "variants", - { - "CHROM": "VARCHAR", - "POS": "BIGINT", - "END": "BIGINT", - "ID": "VARCHAR", - "QUAL": "FLOAT", - }, - genomic_column="var_interval", - chromosome_column="CHROM", - start_column="POS", - end_column="END", - ) - - # Genes table with BED-style columns - engine.register_table_schema( - "genes", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "gene_name": "VARCHAR", - "strand": "VARCHAR", - }, - genomic_column="gene_interval", - ) + from giql import Table, transpile - # Query using different genomic column names - cursor = engine.execute(""" + sql = transpile( + """ SELECT v.ID, g.gene_name FROM variants v JOIN genes g ON v.var_interval INTERSECTS g.gene_interval - """) + """, + tables=[ + # VCF-style columns + Table( + "variants", + genomic_col="var_interval", + chrom_col="CHROM", + start_col="POS", + end_col="END", + strand_col=None, + ), + # BED-style columns (defaults) + Table( + "genes", + genomic_col="gene_interval", + ), + ], + ) Coordinate Systems ------------------ @@ -219,7 +157,7 @@ Coordinate Systems Understanding BED Coordinates ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -GIQL uses the BED coordinate convention: +GIQL uses the BED coordinate convention by default: - **0-based start**: The first base of a chromosome is position 0 - **Half-open intervals**: Start is inclusive, end is exclusive @@ -227,35 +165,25 @@ GIQL uses the BED coordinate convention: Example: An interval ``chr1:100-200`` covers bases 100 through 199 (100 bases total). -Converting from 1-Based Coordinates -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Working with 1-Based Coordinates +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -If your data uses 1-based coordinates (like VCF or GFF), convert when loading: +If your data uses 1-based coordinates (like VCF or GFF), configure the +``Table`` accordingly: .. 
code-block:: python - import pandas as pd - - # Load 1-based data - df = pd.read_csv("variants.vcf", sep="\t") - - # Convert to 0-based - df['start_pos'] = df['POS'] - 1 # Convert 1-based to 0-based - df['end_pos'] = df['POS'] # For SNPs, end = start + 1 + from giql import Table, transpile - # Load into engine - engine.conn.execute("CREATE TABLE variants AS SELECT * FROM df") - - # Register schema - engine.register_table_schema( - "variants", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - # ... other columns - }, - genomic_column="interval", + sql = transpile( + query, + tables=[ + Table( + "variants", + coordinate_system="1based", + interval_type="closed", + ) + ], ) Working with Point Features @@ -267,144 +195,9 @@ For point features (like SNPs), create an interval of length 1: # For a SNP at position 1000 (1-based) # 0-based interval: [999, 1000) - start_pos = 999 - end_pos = 1000 - -Data Types ----------- - -Recommended Column Types -~~~~~~~~~~~~~~~~~~~~~~~~ - -For optimal performance, use appropriate data types: - -.. list-table:: - :header-rows: 1 - :widths: 25 25 50 - - * - Column - - Recommended Type - - Notes - * - chromosome - - VARCHAR - - String type for chromosome names - * - start_pos - - BIGINT - - 64-bit integer for large genomes - * - end_pos - - BIGINT - - 64-bit integer for large genomes - * - strand - - VARCHAR(1) or CHAR(1) - - Single character: '+', '-', '.' - * - score - - FLOAT or DOUBLE - - Numeric scores - * - name - - VARCHAR - - Feature identifiers - -Type Compatibility -~~~~~~~~~~~~~~~~~~ - -GIQL schemas use SQL type names. Common mappings: - -.. 
list-table:: - :header-rows: 1 - :widths: 30 35 35 - - * - GIQL Schema Type - - DuckDB Type - - SQLite Type - * - INTEGER - - INTEGER - - INTEGER - * - BIGINT - - BIGINT - - INTEGER - * - VARCHAR - - VARCHAR - - TEXT - * - FLOAT - - FLOAT - - REAL - * - DOUBLE - - DOUBLE - - REAL - -Loading Data ------------- - -From CSV Files -~~~~~~~~~~~~~~ - -Load CSV files directly: - -.. code-block:: python - - engine.load_csv("features", "features.csv") - - # With custom options - engine.load_csv( - "features", - "features.tsv", - delimiter="\t", - header=True, - ) - -From Pandas DataFrames -~~~~~~~~~~~~~~~~~~~~~~ - -Load data from pandas: - -.. code-block:: python - - import pandas as pd - - df = pd.read_csv("features.bed", sep="\t", header=None, - names=["chromosome", "start_pos", "end_pos", "name"]) - - # Register the DataFrame as a table - engine.conn.execute("CREATE TABLE features AS SELECT * FROM df") - - # Then register the schema - engine.register_table_schema( - "features", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - }, - genomic_column="interval", - ) + start = 999 + end = 1000 -From Existing Database Tables -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If tables already exist in your database, just register their schemas: - -.. 
code-block:: python - - # Connect to existing database - with GIQLEngine(target_dialect="duckdb", db_path="my_database.duckdb") as engine: - # Register schemas for existing tables - engine.register_table_schema( - "existing_table", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - }, - genomic_column="interval", - ) - - # Query existing data - cursor = engine.execute(""" - SELECT * FROM existing_table - WHERE interval INTERSECTS 'chr1:1000-2000' - """) Troubleshooting --------------- @@ -414,32 +207,17 @@ Common Issues **"Unknown column" errors:** -- Ensure the table schema is registered before querying -- Check that the genomic column name in your query matches the registered name -- Verify column names in the schema match actual table columns +- Ensure the table is included in the ``tables`` parameter +- Check that the genomic column name in your query matches the configured name +- Verify column names in the ``Table`` object match actual table columns **Incorrect results:** - Verify your coordinate system (0-based vs 1-based) -- Check that start_pos < end_pos for all intervals +- Check that start < end for all intervals - Ensure chromosome names match between tables (e.g., 'chr1' vs '1') **Performance issues:** - See the :doc:`performance` guide for optimization tips -- Consider adding indexes on genomic columns - -Verifying Schema Registration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Check that schemas are registered correctly: - -.. 
code-block:: python - - # After registration, test with a simple query - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - print(sql) - # Should show expanded SQL with chromosome, start_pos, end_pos comparisons +- Consider adding indexes on genomic columns \ No newline at end of file diff --git a/docs/guides/transpilation.rst b/docs/guides/transpilation.rst deleted file mode 100644 index bd4c24a..0000000 --- a/docs/guides/transpilation.rst +++ /dev/null @@ -1,417 +0,0 @@ -Transpilation Guide -=================== - -GIQL works by transpiling genomic queries into standard SQL. This guide explains -how transpilation works, how to debug query generation, and how to use transpiled -SQL with external tools. - -.. contents:: - :local: - :depth: 2 - -How Transpilation Works ------------------------ - -The Transpilation Process -~~~~~~~~~~~~~~~~~~~~~~~~~ - -When you write a GIQL query: - -.. code-block:: sql - - SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000' - -GIQL performs these steps: - -1. **Parse**: Parse the SQL to identify GIQL-specific operators -2. **Expand**: Replace genomic operators with standard SQL predicates -3. **Generate**: Produce SQL for the target database dialect - -The result is standard SQL: - -.. code-block:: sql - - SELECT * FROM variants - WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000 - -Operator Expansion -~~~~~~~~~~~~~~~~~~ - -Each GIQL operator expands to specific SQL patterns: - -**INTERSECTS** expands to range overlap checks: - -.. code-block:: sql - - -- GIQL - a.interval INTERSECTS b.interval - - -- SQL (same chromosome, overlapping ranges) - a.chromosome = b.chromosome - AND a.start_pos < b.end_pos - AND a.end_pos > b.start_pos - -**CONTAINS** expands to containment checks: - -.. 
code-block:: sql - - -- GIQL - a.interval CONTAINS b.interval - - -- SQL - a.chromosome = b.chromosome - AND a.start_pos <= b.start_pos - AND a.end_pos >= b.end_pos - -**DISTANCE** expands to gap calculations: - -.. code-block:: sql - - -- GIQL - DISTANCE(a.interval, b.interval) - - -- SQL (simplified) - CASE - WHEN a.chromosome != b.chromosome THEN NULL - WHEN a.end_pos <= b.start_pos THEN b.start_pos - a.end_pos - WHEN b.end_pos <= a.start_pos THEN a.start_pos - b.end_pos - ELSE 0 - END - -Using the Transpile Method --------------------------- - -Basic Transpilation -~~~~~~~~~~~~~~~~~~~ - -Use ``transpile()`` to see generated SQL without executing: - -.. code-block:: python - - from giql import GIQLEngine - - with GIQLEngine(target_dialect="duckdb") as engine: - engine.register_table_schema( - "variants", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - }, - genomic_column="interval", - ) - - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - print(sql) - # Output: SELECT * FROM variants - # WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000 - -Transpiling Complex Queries -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Transpilation works with all GIQL features: - -.. 
code-block:: python - - # Join query - sql = engine.transpile(""" - SELECT v.*, g.name AS gene_name - FROM variants v - JOIN genes g ON v.interval INTERSECTS g.interval - WHERE v.quality >= 30 - """) - print(sql) - - # NEAREST query - sql = engine.transpile(""" - SELECT peaks.name, nearest.name, nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=5) AS nearest - """) - print(sql) - - # Aggregation query - sql = engine.transpile(""" - SELECT MERGE(interval), COUNT(*) AS count - FROM features - """) - print(sql) - -Debugging with Transpilation ----------------------------- - -Understanding Query Expansion -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Use transpilation to understand what GIQL does: - -.. code-block:: python - - # See how ANY quantifier expands - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000') - """) - print(sql) - # Shows the OR conditions for each range - - # See how join conditions expand - sql = engine.transpile(""" - SELECT a.*, b.name - FROM features_a a - JOIN features_b b ON a.interval INTERSECTS b.interval - """) - print(sql) - # Shows the full range comparison predicates - -Verbose Mode -~~~~~~~~~~~~ - -Enable verbose mode for detailed transpilation information: - -.. code-block:: python - - with GIQLEngine(target_dialect="duckdb", verbose=True) as engine: - engine.register_table_schema("variants", {...}, genomic_column="interval") - - # Transpilation will print detailed information - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - # Execution also shows transpilation details - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - -Troubleshooting Transpilation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -**Query not expanding correctly:** - -.. 
code-block:: python - - # Check that schema is registered - sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'") - if "interval INTERSECTS" in sql: - print("Schema not registered for 'variants' table") - -**Wrong column names in output:** - -.. code-block:: python - - # Verify column mapping - engine.register_table_schema( - "variants", - {...}, - genomic_column="interval", - chromosome_column="chrom", # Check these match your table - start_column="start", - end_column="end", - ) - -Comparing Dialects ------------------- - -Same Query, Different SQL -~~~~~~~~~~~~~~~~~~~~~~~~~ - -See how the same query translates for different backends: - -.. code-block:: python - - query = """ - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """ - - schema = { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - } - - # DuckDB - with GIQLEngine(target_dialect="duckdb") as engine: - engine.register_table_schema("variants", schema, genomic_column="interval") - print("DuckDB SQL:") - print(engine.transpile(query)) - print() - - # SQLite - with GIQLEngine(target_dialect="sqlite") as engine: - engine.register_table_schema("variants", schema, genomic_column="interval") - print("SQLite SQL:") - print(engine.transpile(query)) - -Dialect-Specific Differences -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Some queries may generate different SQL for different dialects: - -- String functions may use different names -- Type casting syntax may vary -- Window function support may differ - -GIQL handles these differences automatically, but understanding them helps -when debugging or integrating with external tools. - -Using Transpiled SQL Externally -------------------------------- - -With External Database Connections -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Use transpiled SQL with your own database connections: - -.. 
code-block:: python - - import duckdb - - # Generate SQL using GIQL - with GIQLEngine(target_dialect="duckdb") as giql_engine: - giql_engine.register_table_schema("variants", {...}, genomic_column="interval") - sql = giql_engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - # Execute with external connection - conn = duckdb.connect("my_database.duckdb") - result = conn.execute(sql).fetchall() - conn.close() - -With ORMs and Query Builders -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Integrate transpiled SQL with SQLAlchemy or other ORMs: - -.. code-block:: python - - from sqlalchemy import create_engine, text - - # Generate SQL - with GIQLEngine(target_dialect="duckdb") as giql_engine: - giql_engine.register_table_schema("variants", {...}, genomic_column="interval") - sql = giql_engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - # Execute with SQLAlchemy - sa_engine = create_engine("duckdb:///my_database.duckdb") - with sa_engine.connect() as conn: - result = conn.execute(text(sql)) - for row in result: - print(row) - -Building SQL Pipelines -~~~~~~~~~~~~~~~~~~~~~~ - -Use transpilation in data pipelines: - -.. code-block:: python - - def build_intersection_query(table_a, table_b, region): - """Generate SQL for intersection query.""" - with GIQLEngine(target_dialect="duckdb") as engine: - engine.register_table_schema(table_a, {...}, genomic_column="interval") - engine.register_table_schema(table_b, {...}, genomic_column="interval") - - return engine.transpile(f""" - SELECT a.*, b.name - FROM {table_a} a - JOIN {table_b} b ON a.interval INTERSECTS b.interval - WHERE a.interval INTERSECTS '{region}' - """) - - # Use in pipeline - sql = build_intersection_query("variants", "genes", "chr1:1000000-2000000") - # Execute sql with your preferred method - -Saving Queries -~~~~~~~~~~~~~~ - -Save transpiled SQL for documentation or reuse: - -.. 
code-block:: python - - # Generate and save SQL - with GIQLEngine(target_dialect="duckdb") as engine: - engine.register_table_schema("variants", {...}, genomic_column="interval") - - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - with open("query.sql", "w") as f: - f.write(sql) - - # Later, execute saved SQL - with open("query.sql") as f: - sql = f.read() - - conn = duckdb.connect("database.duckdb") - result = conn.execute(sql).fetchall() - -Advanced Transpilation ----------------------- - -Parameterized Queries -~~~~~~~~~~~~~~~~~~~~~ - -Build queries with parameters: - -.. code-block:: python - - def query_region(engine, chrom, start, end): - """Query a parameterized region.""" - region = f"{chrom}:{start}-{end}" - return engine.execute(f""" - SELECT * FROM variants - WHERE interval INTERSECTS '{region}' - """) - - # Use with different regions - cursor = query_region(engine, "chr1", 1000000, 2000000) - cursor = query_region(engine, "chr2", 5000000, 6000000) - -Dynamic Query Building -~~~~~~~~~~~~~~~~~~~~~~ - -Build queries programmatically: - -.. code-block:: python - - def build_multi_table_query(tables, target_region): - """Build a query that unions results from multiple tables.""" - union_parts = [] - for table in tables: - union_parts.append(f""" - SELECT *, '{table}' AS source FROM {table} - WHERE interval INTERSECTS '{target_region}' - """) - - query = " UNION ALL ".join(union_parts) - return engine.transpile(query) - -Inspecting the AST -~~~~~~~~~~~~~~~~~~ - -For advanced debugging, you can inspect the parsed query: - -.. 
code-block:: python - - # GIQL uses sqlglot internally - # The transpiled SQL shows the final result - sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'") - - # For deep debugging, examine the generated SQL structure - print(sql) diff --git a/docs/index.rst b/docs/index.rst index 9918a00..b595529 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,140 +1,49 @@ -GIQL - Genomic Interval Query Language +Genomic Interval Query Language (GIQL) ====================================== -**GIQL** is a SQL dialect for genomic range queries with multi-database support. +**GIQL** is an extended SQL dialect that allows you to declaratively express genomic interval operations. -Genomic analysis often requires repetitive, complex SQL patterns to express simple operations like finding overlapping intervals or merging features. GIQL extends SQL with dedicated operators for these common tasks, so you can declaratively express *what* you want to compute without getting lost in SQL boilerplate. GIQL queries read naturally, even without SQL expertise - this clarity makes your analysis code easier to review and share. Best of all, GIQL queries work across DuckDB, SQLite, PostgreSQL, and other databases, so you're never locked into a specific engine and can choose the tool that fits your use case. Finally, GIQL operators follow established conventions from tools like bedtools, so the semantics are familiar and predictable. +Dialect +------- +GIQL extends the SQL query language with dedicated constructs for common +genomic interval tasks, allowing you to declare *what* you want to compute +rather than how. Whether you're filtering variants by genomic region, finding +overlapping features, or calculating distances between intervals, GIQL +makes these operations intuitive and portable. .. toctree:: - :maxdepth: 2 - :caption: Getting Started + :maxdepth: 1 + :caption: Dialect - quickstart + dialect/index + dialect/syntax-reference -..
toctree:: - :maxdepth: 2 - :caption: Operator Reference - - operators/index +Transpilation +------------- +The ``giql`` package *transpiles* queries written in GIQL to regular SQL +for use in existing database systems and analytics engines. .. toctree:: - :maxdepth: 2 - :caption: Guides + :maxdepth: 1 + :caption: Transpilation - guides/index + transpilation/index + transpilation/execution + transpilation/api-reference -.. toctree:: - :maxdepth: 2 - :caption: Recipes - recipes/index +Learn more +---------- +See the following guides to learn how to use GIQL effectively: .. toctree:: - :maxdepth: 2 - :caption: Reference - - reference/syntax-reference - api/index - -Quick Start ------------ - -Install GIQL: - -.. code-block:: bash - - pip install giql - -Basic usage: - -.. code-block:: python - - from giql import GIQLEngine - - # Create engine with DuckDB backend - with GIQLEngine(target_dialect="duckdb") as engine: - # Load genomic data - engine.load_csv("variants", "variants.csv") - engine.register_table_schema( - "variants", - { - "id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - }, - genomic_column="interval", - ) - - # Query with genomic operators (returns cursor for streaming) - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - # Process results - for row in cursor: - print(row) - - # Or just transpile to SQL without executing - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - print(sql) # See the generated SQL - -Features --------- + :maxdepth: 1 + :caption: Guides and Recipes -* **SQL-based**: Familiar SQL syntax with genomic extensions -* **Multi-backend**: Works with DuckDB, SQLite, and more -* **Spatial operators**: INTERSECTS, CONTAINS, WITHIN, DISTANCE, NEAREST -* **Aggregation operators**: CLUSTER, MERGE for combining intervals -* **Set quantifiers**: ANY, ALL for multi-range queries -* 
**Column-to-column joins**: Join tables on genomic position -* **Transpilation**: Convert GIQL to standard SQL for debugging or external use - -Operators at a Glance ---------------------- - -**Spatial Relationships:** - -.. code-block:: sql - - -- Find overlapping features - WHERE interval INTERSECTS 'chr1:1000-2000' - - -- Find containing/contained features - WHERE gene.interval CONTAINS variant.interval - -**Distance and Proximity:** - -.. code-block:: sql - - -- Calculate distance between intervals - SELECT DISTANCE(a.interval, b.interval) AS dist - - -- Find k-nearest neighbors - FROM peaks CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=5) - -**Aggregation:** - -.. code-block:: sql - - -- Cluster overlapping intervals - SELECT *, CLUSTER(interval) AS cluster_id FROM features - - -- Merge overlapping intervals - SELECT MERGE(interval) FROM features - -**Set Quantifiers:** - -.. code-block:: sql - - -- Match any of multiple regions - WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000') + guides/quickstart + guides/index + recipes/index -See :doc:`operators/index` for complete operator documentation. Indices and tables ================== diff --git a/docs/quickstart.rst b/docs/quickstart.rst deleted file mode 100644 index 9560c34..0000000 --- a/docs/quickstart.rst +++ /dev/null @@ -1,228 +0,0 @@ -Quick Start -=========== - -Installation ------------- - -Install GIQL using pip: - -.. code-block:: bash - - pip install giql - -Or with optional dependencies: - -.. code-block:: bash - - pip install giql[duckdb] # For DuckDB support - -Basic Usage ------------ - -Expected Schema -~~~~~~~~~~~~~~~ - -GIQL works with genomic data stored in tables with separate columns for chromosome, -start position, and end position. 
The typical schema includes: - -* **chromosome**: Chromosome identifier (e.g., 'chr1', 'chr2', 'chrX') -* **start_pos**: Start position of the genomic interval (0-based, inclusive) -* **end_pos**: End position of the genomic interval (0-based, exclusive, half-open) -* **strand** (optional): Strand orientation ('+', '-', or '.') - -You must register the table schema with GIQL, mapping the logical genomic column -(used in queries) to the physical columns in your table: - -.. code-block:: python - - engine.register_table_schema( - "table_name", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "strand": "VARCHAR", # Optional - # ... other columns ... - }, - genomic_column="interval", # Logical name used in queries - ) - -After registration, you can use ``interval`` in your GIQL queries, and the engine -will automatically map it to the ``chromosome``, ``start_pos``, and ``end_pos`` -columns. - -Query with DuckDB -~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - from giql import GIQLEngine - - with GIQLEngine(target_dialect="duckdb") as engine: - # Load CSV file into database - engine.load_csv("variants", "variants.csv") - - # Register schema mapping - engine.register_table_schema( - "variants", - { - "id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - }, - genomic_column="interval", - ) - - # Query using the logical 'interval' column (returns cursor for streaming) - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - # Process results lazily - for row in cursor: - print(row) - - # Or materialize to pandas DataFrame - import pandas as pd - cursor = engine.execute("SELECT ...") - df = pd.DataFrame(cursor.fetchall(), columns=[desc[0] for desc in cursor.description]) - -Query with SQLite -~~~~~~~~~~~~~~~~~ - -.. 
code-block:: python - - from giql import GIQLEngine - - with GIQLEngine(target_dialect="sqlite", db_path="data.db") as engine: - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - # Iterate results - for row in cursor: - print(row) - -Spatial Operators ------------------ - -INTERSECTS -~~~~~~~~~~ - -Check if genomic ranges overlap: - -.. code-block:: sql - - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - -CONTAINS -~~~~~~~~ - -Check if a range contains a point or another range: - -.. code-block:: sql - - SELECT * FROM variants - WHERE interval CONTAINS 'chr1:1500' - -WITHIN -~~~~~~ - -Check if a range is within another range: - -.. code-block:: sql - - SELECT * FROM variants - WHERE interval WITHIN 'chr1:1000-5000' - -Set Quantifiers ---------------- - -ANY -~~~ - -Match any of the specified ranges: - -.. code-block:: sql - - SELECT * FROM variants - WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') - -ALL -~~~ - -Match all of the specified ranges: - -.. code-block:: sql - - SELECT * FROM variants - WHERE interval CONTAINS ALL('chr1:1500', 'chr1:1600') - -Column-to-Column Joins ----------------------- - -Join tables on genomic position: - -.. code-block:: sql - - SELECT v.*, g.name - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - -Transpiling to SQL ------------------- - -The ``transpile()`` method converts GIQL queries to standard SQL without executing them. -This is useful for debugging, understanding the generated SQL, or integrating with external tools: - -.. 
code-block:: python - - from giql import GIQLEngine - - with GIQLEngine(target_dialect="duckdb") as engine: - # Register table schema - engine.register_table_schema( - "variants", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - }, - genomic_column="interval", - ) - - # Transpile GIQL to SQL - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - print(sql) - # Output: SELECT * FROM variants WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000 - -Different target dialects generate different SQL: - -.. code-block:: python - - # DuckDB dialect - with GIQLEngine(target_dialect="duckdb") as engine: - sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'") - # Generates DuckDB-optimized SQL - - # SQLite dialect - with GIQLEngine(target_dialect="sqlite") as engine: - sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'") - # Generates SQLite-compatible SQL - -The transpiled SQL can be executed directly on your database or used with other tools. -Use ``verbose=True`` when creating the engine to see detailed transpilation information: - -.. code-block:: python - - with GIQLEngine(target_dialect="duckdb", verbose=True) as engine: - sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'") - # Prints detailed information about the transpilation process diff --git a/docs/recipes/advanced-queries.rst b/docs/recipes/advanced-queries.rst index 2aaf944..62147f6 100644 --- a/docs/recipes/advanced-queries.rst +++ b/docs/recipes/advanced-queries.rst @@ -16,16 +16,14 @@ Match Any of Multiple Regions Find features overlapping any of several regions of interest: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS ANY( - 'chr1:1000000-2000000', - 'chr1:5000000-6000000', - 'chr2:1000000-3000000' - ) - """) + SELECT * FROM variants + WHERE interval INTERSECTS ANY( + 'chr1:1000000-2000000', + 'chr1:5000000-6000000', + 'chr2:1000000-3000000' + ) **Use case:** Query multiple regions of interest in a single statement. @@ -34,16 +32,14 @@ Match All of Multiple Points Find features containing all specified positions: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM genes - WHERE interval CONTAINS ALL( - 'chr1:1500', - 'chr1:1600', - 'chr1:1700' - ) - """) + SELECT * FROM genes + WHERE interval CONTAINS ALL( + 'chr1:1500', + 'chr1:1600', + 'chr1:1700' + ) **Use case:** Find genes spanning a set of SNP positions. @@ -52,16 +48,14 @@ Exclude Multiple Regions Find features that don't overlap any blacklisted region: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM peaks - WHERE NOT interval INTERSECTS ANY( - 'chr1:120000000-125000000', -- Centromere region - 'chr1:140000000-142000000', -- Known artifact - 'chrM:1-16569' -- Mitochondrial - ) - """) + SELECT * FROM peaks + WHERE NOT interval INTERSECTS ANY( + 'chr1:120000000-125000000', -- Centromere region + 'chr1:140000000-142000000', -- Known artifact + 'chrM:1-16569' -- Mitochondrial + ) **Use case:** Filter out features in problematic genomic regions. @@ -70,13 +64,11 @@ Combine ANY and ALL Complex multi-range logic: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM features - WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') - AND interval CONTAINS ALL('chr1:1100', 'chr1:1200') - """) + SELECT * FROM features + WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') + AND interval CONTAINS ALL('chr1:1100', 'chr1:1200') **Use case:** Find features matching complex spatial criteria. 
@@ -88,18 +80,16 @@ Multi-Attribute Filtering Combine spatial and attribute filters: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT v.*, g.name AS gene_name, g.biotype - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - WHERE v.quality >= 30 - AND v.filter = 'PASS' - AND v.allele_frequency > 0.01 - AND g.biotype = 'protein_coding' - ORDER BY v.chromosome, v.start_pos - """) + SELECT v.*, g.name AS gene_name, g.biotype + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE v.quality >= 30 + AND v.filter = 'PASS' + AND v.allele_frequency > 0.01 + AND g.biotype = 'protein_coding' + ORDER BY v.chrom, v.start **Use case:** Extract high-quality variants in protein-coding genes. @@ -108,18 +98,16 @@ Target Gene Lists Filter to specific genes of interest: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT v.*, g.name AS gene_name - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - WHERE g.name IN ( - 'BRCA1', 'BRCA2', 'TP53', 'EGFR', 'KRAS', - 'BRAF', 'PIK3CA', 'PTEN', 'APC', 'ATM' - ) - ORDER BY g.name, v.start_pos - """) + SELECT v.*, g.name AS gene_name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE g.name IN ( + 'BRCA1', 'BRCA2', 'TP53', 'EGFR', 'KRAS', + 'BRAF', 'PIK3CA', 'PTEN', 'APC', 'ATM' + ) + ORDER BY g.name, v.start **Use case:** Extract variants in clinically actionable genes. @@ -128,22 +116,20 @@ Conditional Logic Apply different criteria based on feature type: -.. code-block:: python - - cursor = engine.execute(""" - SELECT v.*, g.name, g.biotype, - CASE - WHEN g.biotype = 'protein_coding' THEN 'coding' - WHEN g.biotype LIKE '%RNA%' THEN 'noncoding_RNA' - ELSE 'other' - END AS gene_category - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - WHERE CASE - WHEN g.biotype = 'protein_coding' THEN v.quality >= 30 - ELSE v.quality >= 20 - END - """) +.. 
code-block:: sql + + SELECT v.*, g.name, g.biotype, + CASE + WHEN g.biotype = 'protein_coding' THEN 'coding' + WHEN g.biotype LIKE '%RNA%' THEN 'noncoding_RNA' + ELSE 'other' + END AS gene_category + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE CASE + WHEN g.biotype = 'protein_coding' THEN v.quality >= 30 + ELSE v.quality >= 20 + END **Use case:** Apply different quality thresholds based on genomic context. @@ -155,19 +141,17 @@ Per-Chromosome Statistics Calculate summary statistics by chromosome: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - a.chromosome, - COUNT(DISTINCT a.name) AS total_features, - COUNT(b.name) AS total_overlaps, - COUNT(DISTINCT CASE WHEN b.name IS NOT NULL THEN a.name END) AS features_with_overlap - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - GROUP BY a.chromosome - ORDER BY a.chromosome - """) + SELECT + a.chrom, + COUNT(DISTINCT a.name) AS total_features, + COUNT(b.name) AS total_overlaps, + COUNT(DISTINCT CASE WHEN b.name IS NOT NULL THEN a.name END) AS features_with_overlap + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chrom + ORDER BY a.chrom **Use case:** Compare feature distribution across chromosomes. @@ -176,19 +160,17 @@ Overlap Statistics Calculate overlap metrics: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT - a.chromosome, - COUNT(*) AS overlap_count, - AVG(LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS avg_overlap_bp, - SUM(LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS total_overlap_bp - FROM features_a a - INNER JOIN features_b b ON a.interval INTERSECTS b.interval - GROUP BY a.chromosome - ORDER BY a.chromosome - """) + SELECT + a.chrom, + COUNT(*) AS overlap_count, + AVG(LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS avg_overlap_bp, + SUM(LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS total_overlap_bp + FROM features_a a + INNER JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chrom + ORDER BY a.chrom **Use case:** Quantify overlap patterns across the genome. @@ -197,19 +179,17 @@ Feature Size Distribution Analyze feature sizes by category: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - biotype, - COUNT(*) AS count, - AVG(end_pos - start_pos) AS avg_length, - MIN(end_pos - start_pos) AS min_length, - MAX(end_pos - start_pos) AS max_length - FROM genes - GROUP BY biotype - ORDER BY count DESC - """) + SELECT + biotype, + COUNT(*) AS count, + AVG(end - start) AS avg_length, + MIN(end - start) AS min_length, + MAX(end - start) AS max_length + FROM genes + GROUP BY biotype + ORDER BY count DESC **Use case:** Compare size distributions across feature types. @@ -221,14 +201,12 @@ Three-Way Intersection Find features overlapping in all three tables: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT DISTINCT a.* - FROM features_a a - INNER JOIN features_b b ON a.interval INTERSECTS b.interval - INNER JOIN features_c c ON a.interval INTERSECTS c.interval - """) + SELECT DISTINCT a.* + FROM features_a a + INNER JOIN features_b b ON a.interval INTERSECTS b.interval + INNER JOIN features_c c ON a.interval INTERSECTS c.interval **Use case:** Find consensus regions across multiple datasets. @@ -237,19 +215,17 @@ Hierarchical Annotations Join multiple annotation levels: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - v.name AS variant, - e.name AS exon, - t.name AS transcript, - g.name AS gene - FROM variants v - INNER JOIN exons e ON v.interval INTERSECTS e.interval - INNER JOIN transcripts t ON e.interval WITHIN t.interval - INNER JOIN genes g ON t.interval WITHIN g.interval - """) + SELECT + v.name AS variant, + e.name AS exon, + t.name AS transcript, + g.name AS gene + FROM variants v + INNER JOIN exons e ON v.interval INTERSECTS e.interval + INNER JOIN transcripts t ON e.interval WITHIN t.interval + INNER JOIN genes g ON t.interval WITHIN g.interval **Use case:** Build hierarchical annotations for variants. @@ -258,26 +234,24 @@ Union with Deduplication Combine features from multiple sources: -.. code-block:: python - - cursor = engine.execute(""" - WITH all_peaks AS ( - SELECT *, 'chip_seq' AS source FROM chip_peaks - UNION ALL - SELECT *, 'atac_seq' AS source FROM atac_peaks - UNION ALL - SELECT *, 'dnase_seq' AS source FROM dnase_peaks - ) - SELECT - chromosome, - start_pos, - end_pos, - STRING_AGG(DISTINCT source, ',') AS sources, - COUNT(DISTINCT source) AS source_count - FROM all_peaks - GROUP BY chromosome, start_pos, end_pos - HAVING COUNT(DISTINCT source) >= 2 - """) +.. 
code-block:: sql + + WITH all_peaks AS ( + SELECT *, 'chip_seq' AS source FROM chip_peaks + UNION ALL + SELECT *, 'atac_seq' AS source FROM atac_peaks + UNION ALL + SELECT *, 'dnase_seq' AS source FROM dnase_peaks + ) + SELECT + chrom, + start, + end, + STRING_AGG(DISTINCT source, ',') AS sources, + COUNT(DISTINCT source) AS source_count + FROM all_peaks + GROUP BY chrom, start, end + HAVING COUNT(DISTINCT source) >= 2 **Use case:** Find regulatory regions supported by multiple assays. @@ -289,15 +263,13 @@ Filtered Subquery Use subqueries to pre-filter data: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT v.* - FROM variants v - WHERE v.interval INTERSECTS ANY( - SELECT position FROM genes WHERE biotype = 'protein_coding' - ) - """) + SELECT v.* + FROM variants v + WHERE v.interval INTERSECTS ANY( + SELECT position FROM genes WHERE biotype = 'protein_coding' + ) **Use case:** Intersect with dynamically filtered reference data. @@ -310,35 +282,33 @@ Chained CTEs Build complex analyses with Common Table Expressions: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - WITH - -- Step 1: Find high-quality variants - hq_variants AS ( - SELECT * FROM variants - WHERE quality >= 30 AND filter = 'PASS' - ), - -- Step 2: Annotate with genes - annotated AS ( - SELECT v.*, g.name AS gene_name, g.biotype - FROM hq_variants v - LEFT JOIN genes g ON v.interval INTERSECTS g.interval - ), - -- Step 3: Summarize by gene - gene_summary AS ( - SELECT - gene_name, - biotype, - COUNT(*) AS variant_count - FROM annotated - WHERE gene_name IS NOT NULL - GROUP BY gene_name, biotype - ) - SELECT * FROM gene_summary - ORDER BY variant_count DESC - LIMIT 20 - """) + WITH + -- Step 1: Find high-quality variants + hq_variants AS ( + SELECT * FROM variants + WHERE quality >= 30 AND filter = 'PASS' + ), + -- Step 2: Annotate with genes + annotated AS ( + SELECT v.*, g.name AS gene_name, g.biotype + FROM hq_variants v + LEFT JOIN genes g ON v.interval INTERSECTS g.interval + ), + -- Step 3: Summarize by gene + gene_summary AS ( + SELECT + gene_name, + biotype, + COUNT(*) AS variant_count + FROM annotated + WHERE gene_name IS NOT NULL + GROUP BY gene_name, biotype + ) + SELECT * FROM gene_summary + ORDER BY variant_count DESC + LIMIT 20 **Use case:** Build multi-step analysis pipelines in a single query. @@ -350,22 +320,20 @@ Rank Overlaps Rank features by their overlap characteristics: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT - a.name, - a.chromosome, - a.start_pos, - overlap_count, - RANK() OVER (ORDER BY overlap_count DESC) AS rank - FROM ( - SELECT a.*, COUNT(b.name) AS overlap_count - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand - ) a - """) + SELECT + a.name, + a.chrom, + a.start, + overlap_count, + RANK() OVER (ORDER BY overlap_count DESC) AS rank + FROM ( + SELECT a.*, COUNT(b.name) AS overlap_count + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chrom, a.start, a.end, a.name, a.score, a.strand + ) a **Use case:** Identify features with the most overlaps. @@ -374,21 +342,19 @@ Running Totals Calculate cumulative coverage: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - chromosome, - start_pos, - end_pos, - end_pos - start_pos AS length, - SUM(end_pos - start_pos) OVER ( - PARTITION BY chromosome - ORDER BY start_pos - ) AS cumulative_bp - FROM features - ORDER BY chromosome, start_pos - """) + SELECT + chrom, + start, + end, + end - start AS length, + SUM(end - start) OVER ( + PARTITION BY chrom + ORDER BY start + ) AS cumulative_bp + FROM features + ORDER BY chrom, start **Use case:** Track cumulative coverage along each chromosome. @@ -398,35 +364,20 @@ Debugging and Optimization View Generated SQL ~~~~~~~~~~~~~~~~~~ -Use transpile() to see the SQL GIQL generates: +Use ``transpile()`` to see the SQL GIQL generates: .. code-block:: python - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) + from giql import transpile + + sql = transpile( + "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'", + tables=["variants"], + ) print(sql) - # See the actual SQL that will be executed **Use case:** Debug queries or understand GIQL's translation. 
-Verbose Mode -~~~~~~~~~~~~ - -Enable detailed logging: - -.. code-block:: python - - with GIQLEngine(target_dialect="duckdb", verbose=True) as engine: - # All queries will print transpilation details - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - -**Use case:** Diagnose query translation issues. - Explain Query Plan ~~~~~~~~~~~~~~~~~~ @@ -434,16 +385,18 @@ Analyze query execution: .. code-block:: python - # First transpile to get the SQL - sql = engine.transpile(""" + from giql import transpile + + sql = transpile( + """ SELECT v.*, g.name FROM variants v JOIN genes g ON v.interval INTERSECTS g.interval - """) + """, + tables=["variants", "genes"], + ) # Then use database-native EXPLAIN - cursor = engine.execute(f"EXPLAIN {sql}") - for row in cursor: - print(row) + # e.g., conn.execute(f"EXPLAIN {sql}") **Use case:** Optimize slow queries by examining execution plans. diff --git a/docs/recipes/bedtools-migration.rst b/docs/recipes/bedtools-migration.rst index 74c27bd..4a00011 100644 --- a/docs/recipes/bedtools-migration.rst +++ b/docs/recipes/bedtools-migration.rst @@ -20,19 +20,19 @@ Quick Reference Table - GIQL Equivalent - Recipe * - ``intersect -a A -b B`` - - ``SELECT DISTINCT a.* FROM a, b WHERE a.pos INTERSECTS b.pos`` + - ``SELECT DISTINCT a.* FROM a, b WHERE a.interval INTERSECTS b.interval`` - :ref:`intersect-basic` * - ``intersect -a A -b B -wa`` - - ``SELECT a.* FROM a, b WHERE a.pos INTERSECTS b.pos`` + - ``SELECT a.* FROM a, b WHERE a.interval INTERSECTS b.interval`` - :ref:`intersect-wa` * - ``intersect -a A -b B -wb`` - - ``SELECT b.* FROM a, b WHERE a.pos INTERSECTS b.pos`` + - ``SELECT b.* FROM a, b WHERE a.interval INTERSECTS b.interval`` - :ref:`intersect-wb` * - ``intersect -a A -b B -wa -wb`` - - ``SELECT a.*, b.* FROM a, b WHERE a.pos INTERSECTS b.pos`` + - ``SELECT a.*, b.* FROM a, b WHERE a.interval INTERSECTS b.interval`` - :ref:`intersect-wawb` * - ``intersect -a A -b B -v`` - 
- ``SELECT a.* FROM a LEFT JOIN b ... WHERE b.chr IS NULL`` + - ``SELECT a.* FROM a LEFT JOIN b ... WHERE b.chrom IS NULL`` - :ref:`intersect-v` * - ``intersect -a A -b B -u`` - ``SELECT DISTINCT a.* FROM a JOIN b ...`` @@ -47,10 +47,10 @@ Quick Reference Table - ``SELECT a.*, b.* FROM a LEFT JOIN b ...`` - :ref:`intersect-loj` * - ``closest -a A -b B -k N`` - - ``CROSS JOIN LATERAL NEAREST(b, reference=a.pos, k=N)`` + - ``CROSS JOIN LATERAL NEAREST(b, reference=a.interval, k=N)`` - :ref:`closest-k` * - ``closest -a A -b B -d`` - - ``SELECT ..., DISTANCE(a.pos, b.pos) ...`` + - ``SELECT ..., DISTANCE(a.interval, b.interval) ...`` - :ref:`closest-d` * - ``cluster -i A`` - ``SELECT *, CLUSTER(interval) AS cluster_id FROM a`` @@ -84,13 +84,11 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT DISTINCT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT DISTINCT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval .. _intersect-wa: @@ -105,13 +103,11 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval .. _intersect-wb: @@ -126,13 +122,11 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT b.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT b.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval .. _intersect-wawb: @@ -147,13 +141,11 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT a.*, b.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT a.*, b.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval .. _intersect-v: @@ -168,14 +160,12 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - WHERE b.chromosome IS NULL - """) + SELECT a.* + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + WHERE b.chrom IS NULL .. _intersect-u: @@ -190,13 +180,11 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT DISTINCT a.* - FROM features_a a - INNER JOIN features_b b ON a.interval INTERSECTS b.interval - """) + SELECT DISTINCT a.* + FROM features_a a + INNER JOIN features_b b ON a.interval INTERSECTS b.interval .. _intersect-c: @@ -211,14 +199,12 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.*, COUNT(b.name) AS overlap_count - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand - """) + SELECT a.*, COUNT(b.name) AS overlap_count + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chrom, a.start, a.end, a.name, a.score, a.strand .. _intersect-wo: @@ -233,16 +219,14 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT - a.*, - b.*, - (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT + a.*, + b.*, + (LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS overlap_bp + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval .. _intersect-wao: @@ -257,19 +241,17 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - a.*, - b.*, - CASE - WHEN b.chromosome IS NULL THEN 0 - ELSE LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos) - END AS overlap_bp - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - """) + SELECT + a.*, + b.*, + CASE + WHEN b.chrom IS NULL THEN 0 + ELSE LEAST(a.end, b.end) - GREATEST(a.start, b.start) + END AS overlap_bp + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval .. _intersect-loj: @@ -284,13 +266,11 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.*, b.* - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - """) + SELECT a.*, b.* + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval ``-s``: Same strand overlaps only ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -303,14 +283,12 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - AND a.strand = b.strand - """) + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand = b.strand ``-S``: Opposite strand overlaps only ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -323,16 +301,14 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - AND a.strand != b.strand - AND a.strand IN ('+', '-') - AND b.strand IN ('+', '-') - """) + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand != b.strand + AND a.strand IN ('+', '-') + AND b.strand IN ('+', '-') ``-f``: Minimum overlap fraction of A ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -345,16 +321,14 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - AND ( - LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos) - ) >= 0.5 * (a.end_pos - a.start_pos) - """) + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND ( + LEAST(a.end, b.end) - GREATEST(a.start, b.start) + ) >= 0.5 * (a.end - a.start) ``-r``: Reciprocal overlap ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -367,23 +341,21 @@ Default: Report overlaps between A and B **GIQL:** -.. 
code-block:: python - - cursor = engine.execute(""" - WITH overlap_calcs AS ( - SELECT - a.*, - (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp, - (a.end_pos - a.start_pos) AS a_length, - (b.end_pos - b.start_pos) AS b_length - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - ) - SELECT chromosome, start_pos, end_pos, name, score, strand - FROM overlap_calcs - WHERE overlap_bp >= 0.5 * a_length - AND overlap_bp >= 0.5 * b_length - """) +.. code-block:: sql + + WITH overlap_calcs AS ( + SELECT + a.*, + (LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS overlap_bp, + (a.end - a.start) AS a_length, + (b.end - b.start) AS b_length + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + ) + SELECT chrom, start, end, name, score, strand + FROM overlap_calcs + WHERE overlap_bp >= 0.5 * a_length + AND overlap_bp >= 0.5 * b_length bedtools closest ---------------- @@ -401,17 +373,15 @@ bedtools closest **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest - ORDER BY peaks.name, nearest.distance - """) + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest + ORDER BY peaks.name, nearest.distance .. _closest-d: @@ -426,31 +396,27 @@ bedtools closest **GIQL:** -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT - a.name AS peak, - b.name AS gene, - DISTANCE(a.interval, b.interval) AS distance - FROM peaks a - CROSS JOIN genes b - WHERE a.chromosome = b.chromosome - ORDER BY a.name, distance - """) + SELECT + a.name AS peak, + b.name AS gene, + DISTANCE(a.interval, b.interval) AS distance + FROM peaks a + CROSS JOIN genes b + WHERE a.chrom = b.chrom + ORDER BY a.name, distance Or using NEAREST for just the closest: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest - """) + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest ``-s``: Same strand only ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -463,22 +429,20 @@ Or using NEAREST for just the closest: **GIQL:** -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=3, - stranded=true - ) AS nearest - ORDER BY peaks.name, nearest.distance - """) +.. code-block:: sql + + SELECT + peaks.name, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=3, + stranded=true + ) AS nearest + ORDER BY peaks.name, nearest.distance bedtools cluster ---------------- @@ -496,15 +460,13 @@ Basic clustering **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval) AS cluster_id - FROM features - ORDER BY chromosome, start_pos - """) + SELECT + *, + CLUSTER(interval) AS cluster_id + FROM features + ORDER BY chrom, start .. _cluster-d: @@ -519,15 +481,13 @@ Basic clustering **GIQL:** -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval, 1000) AS cluster_id - FROM features - ORDER BY chromosome, start_pos - """) + SELECT + *, + CLUSTER(interval, 1000) AS cluster_id + FROM features + ORDER BY chrom, start ``-s``: Strand-specific clustering ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -540,15 +500,13 @@ Basic clustering **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval, stranded=true) AS cluster_id - FROM features - ORDER BY chromosome, strand, start_pos - """) + SELECT + *, + CLUSTER(interval, stranded=true) AS cluster_id + FROM features + ORDER BY chrom, strand, start bedtools merge -------------- @@ -566,12 +524,10 @@ Basic merge **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval) - FROM features - """) + SELECT MERGE(interval) + FROM features .. _merge-d: @@ -586,12 +542,10 @@ Basic merge **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval, 1000) - FROM features - """) + SELECT MERGE(interval, 1000) + FROM features ``-s``: Strand-specific merge ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -604,12 +558,10 @@ Basic merge **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval, stranded=true) - FROM features - """) + SELECT MERGE(interval, stranded=true) + FROM features .. _merge-count: @@ -624,14 +576,12 @@ Basic merge **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - COUNT(*) AS feature_count - FROM features - """) + SELECT + MERGE(interval), + COUNT(*) AS feature_count + FROM features ``-c -o mean``: Average score ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -644,14 +594,12 @@ Basic merge **GIQL:** -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - AVG(score) AS avg_score - FROM features - """) + SELECT + MERGE(interval), + AVG(score) AS avg_score + FROM features ``-c -o collapse``: Collect names ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -664,14 +612,12 @@ Basic merge **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - STRING_AGG(name, ',') AS feature_names - FROM features - """) + SELECT + MERGE(interval), + STRING_AGG(name, ',') AS feature_names + FROM features Key Differences from Bedtools ----------------------------- diff --git a/docs/recipes/clustering-queries.rst b/docs/recipes/clustering-queries.rst index 6ff1487..3dbd682 100644 --- a/docs/recipes/clustering-queries.rst +++ b/docs/recipes/clustering-queries.rst @@ -16,15 +16,13 @@ Assign Cluster IDs Assign unique cluster IDs to groups of overlapping intervals: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval) AS cluster_id - FROM features - ORDER BY chromosome, start_pos - """) + SELECT + *, + CLUSTER(interval) AS cluster_id + FROM features + ORDER BY chrom, start **Use case:** Group overlapping peaks or annotations for downstream analysis. @@ -33,21 +31,19 @@ View Cluster Assignments See which features belong to which cluster: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - cluster_id, - chromosome, - name, - start_pos, - end_pos - FROM ( - SELECT *, CLUSTER(interval) AS cluster_id - FROM features - ) - ORDER BY cluster_id, start_pos - """) + SELECT + cluster_id, + chrom, + name, + start, + end + FROM ( + SELECT *, CLUSTER(interval) AS cluster_id + FROM features + ) + ORDER BY cluster_id, start **Use case:** Inspect clustering results to understand feature groupings. @@ -59,15 +55,13 @@ Cluster with Gap Tolerance Cluster intervals that are within a specified distance of each other: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval, 1000) AS cluster_id - FROM features - ORDER BY chromosome, start_pos - """) + SELECT + *, + CLUSTER(interval, 1000) AS cluster_id + FROM features + ORDER BY chrom, start **Use case:** Group nearby features even if they don't directly overlap (e.g., cluster peaks within 1kb of each other). @@ -77,22 +71,16 @@ Variable Distance Thresholds Experiment with different clustering distances: -.. code-block:: python +.. code-block:: sql - # Tight clustering (overlapping only) - cursor = engine.execute(""" - SELECT *, CLUSTER(interval, 0) AS tight_cluster FROM features - """) + -- Tight clustering (overlapping only) + SELECT *, CLUSTER(interval, 0) AS tight_cluster FROM features - # Medium clustering (within 500bp) - cursor = engine.execute(""" - SELECT *, CLUSTER(interval, 500) AS medium_cluster FROM features - """) + -- Medium clustering (within 500bp) + SELECT *, CLUSTER(interval, 500) AS medium_cluster FROM features - # Loose clustering (within 5kb) - cursor = engine.execute(""" - SELECT *, CLUSTER(interval, 5000) AS loose_cluster FROM features - """) + -- Loose clustering (within 5kb) + SELECT *, CLUSTER(interval, 5000) AS loose_cluster FROM features **Use case:** Compare clustering at different resolutions for sensitivity analysis. @@ -104,15 +92,13 @@ Cluster by Strand Cluster intervals separately for each strand: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval, stranded=true) AS cluster_id - FROM features - ORDER BY chromosome, strand, start_pos - """) + SELECT + *, + CLUSTER(interval, stranded=true) AS cluster_id + FROM features + ORDER BY chrom, strand, start **Use case:** Maintain strand separation when clustering transcripts or strand-specific regulatory elements. @@ -122,15 +108,13 @@ Strand-Specific with Distance Combine strand awareness with distance tolerance: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval, 1000, stranded=true) AS cluster_id - FROM features - ORDER BY chromosome, strand, start_pos - """) + SELECT + *, + CLUSTER(interval, 1000, stranded=true) AS cluster_id + FROM features + ORDER BY chrom, strand, start **Use case:** Cluster nearby same-strand features while keeping opposite strands separate. @@ -143,23 +127,21 @@ Count Features per Cluster Calculate how many features are in each cluster: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - WITH clustered AS ( - SELECT *, CLUSTER(interval) AS cluster_id - FROM features - ) - SELECT - cluster_id, - chromosome, - COUNT(*) AS feature_count, - MIN(start_pos) AS cluster_start, - MAX(end_pos) AS cluster_end - FROM clustered - GROUP BY cluster_id, chromosome - ORDER BY chromosome, cluster_start - """) + WITH clustered AS ( + SELECT *, CLUSTER(interval) AS cluster_id + FROM features + ) + SELECT + cluster_id, + chrom, + COUNT(*) AS feature_count, + MIN(start) AS cluster_start, + MAX(end) AS cluster_end + FROM clustered + GROUP BY cluster_id, chrom + ORDER BY chrom, cluster_start **Use case:** Identify cluster sizes and boundaries. @@ -168,24 +150,22 @@ Filter by Cluster Size Find clusters with a minimum number of features: -.. code-block:: python - - cursor = engine.execute(""" - WITH clustered AS ( - SELECT *, CLUSTER(interval) AS cluster_id - FROM features - ), - cluster_sizes AS ( - SELECT cluster_id, COUNT(*) AS size - FROM clustered - GROUP BY cluster_id - ) - SELECT c.* - FROM clustered c - JOIN cluster_sizes s ON c.cluster_id = s.cluster_id - WHERE s.size >= 3 - ORDER BY c.cluster_id, c.start_pos - """) +.. 
code-block:: sql + + WITH clustered AS ( + SELECT *, CLUSTER(interval) AS cluster_id + FROM features + ), + cluster_sizes AS ( + SELECT cluster_id, COUNT(*) AS size + FROM clustered + GROUP BY cluster_id + ) + SELECT c.* + FROM clustered c + JOIN cluster_sizes s ON c.cluster_id = s.cluster_id + WHERE s.size >= 3 + ORDER BY c.cluster_id, c.start **Use case:** Focus on regions with multiple overlapping features (hotspots). @@ -194,26 +174,24 @@ Cluster Summary Statistics Calculate statistics for each cluster: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - WITH clustered AS ( - SELECT *, CLUSTER(interval) AS cluster_id - FROM features - ) - SELECT - cluster_id, - chromosome, - COUNT(*) AS feature_count, - MIN(start_pos) AS cluster_start, - MAX(end_pos) AS cluster_end, - MAX(end_pos) - MIN(start_pos) AS cluster_span, - AVG(score) AS avg_score, - MAX(score) AS max_score - FROM clustered - GROUP BY cluster_id, chromosome - ORDER BY feature_count DESC - """) + WITH clustered AS ( + SELECT *, CLUSTER(interval) AS cluster_id + FROM features + ) + SELECT + cluster_id, + chrom, + COUNT(*) AS feature_count, + MIN(start) AS cluster_start, + MAX(end) AS cluster_end, + MAX(end) - MIN(start) AS cluster_span, + AVG(score) AS avg_score, + MAX(score) AS max_score + FROM clustered + GROUP BY cluster_id, chrom + ORDER BY feature_count DESC **Use case:** Rank clusters by size, span, or aggregate scores. @@ -225,12 +203,10 @@ Merge Overlapping Intervals Combine overlapping intervals into unified regions: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval) - FROM features - """) + SELECT MERGE(interval) + FROM features **Use case:** Create non-overlapping consensus regions from redundant annotations. @@ -239,12 +215,10 @@ Merge with Distance Merge intervals within a specified distance: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval, 1000) - FROM features - """) + SELECT MERGE(interval, 1000) + FROM features **Use case:** Create broader regions by joining nearby features. @@ -253,12 +227,10 @@ Strand-Specific Merge Merge intervals separately by strand: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval, stranded=true) - FROM features - """) + SELECT MERGE(interval, stranded=true) + FROM features **Use case:** Create strand-aware consensus regions. @@ -270,14 +242,12 @@ Count Merged Features Track how many features were merged into each region: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - COUNT(*) AS feature_count - FROM features - """) + SELECT + MERGE(interval), + COUNT(*) AS feature_count + FROM features **Use case:** Understand the complexity of each merged region. @@ -286,17 +256,15 @@ Aggregate Scores Calculate statistics for merged regions: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - COUNT(*) AS feature_count, - AVG(score) AS avg_score, - MAX(score) AS max_score, - SUM(score) AS total_score - FROM features - """) + SELECT + MERGE(interval), + COUNT(*) AS feature_count, + AVG(score) AS avg_score, + MAX(score) AS max_score, + SUM(score) AS total_score + FROM features **Use case:** Summarize signal intensity across merged regions. @@ -305,14 +273,12 @@ Collect Feature Names List the names of features that were merged: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - STRING_AGG(name, ',') AS merged_features - FROM features - """) + SELECT + MERGE(interval), + STRING_AGG(name, ',') AS merged_features + FROM features **Use case:** Track provenance of merged regions. @@ -324,16 +290,14 @@ Total Base Pair Coverage Calculate total genomic coverage after merging: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - WITH merged AS ( - SELECT MERGE(interval) - FROM features - ) - SELECT SUM(end_pos - start_pos) AS total_coverage_bp - FROM merged - """) + WITH merged AS ( + SELECT MERGE(interval) + FROM features + ) + SELECT SUM(end - start) AS total_coverage_bp + FROM merged **Use case:** Calculate the total genome fraction covered by features. @@ -342,21 +306,19 @@ Coverage per Chromosome Calculate coverage for each chromosome: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - WITH merged AS ( - SELECT MERGE(interval) - FROM features - ) - SELECT - chromosome, - COUNT(*) AS region_count, - SUM(end_pos - start_pos) AS coverage_bp - FROM merged - GROUP BY chromosome - ORDER BY chromosome - """) + WITH merged AS ( + SELECT MERGE(interval) + FROM features + ) + SELECT + chrom, + COUNT(*) AS region_count, + SUM(end - start) AS coverage_bp + FROM merged + GROUP BY chrom + ORDER BY chrom **Use case:** Compare feature density across chromosomes. @@ -365,29 +327,27 @@ Coverage Reduction Compare raw vs merged coverage: -.. code-block:: python - - cursor = engine.execute(""" - WITH raw_stats AS ( - SELECT - COUNT(*) AS raw_count, - SUM(end_pos - start_pos) AS raw_bp - FROM features - ), - merged_stats AS ( - SELECT - COUNT(*) AS merged_count, - SUM(end_pos - start_pos) AS merged_bp - FROM (SELECT MERGE(interval) FROM features) - ) +.. 
code-block:: sql + + WITH raw_stats AS ( + SELECT + COUNT(*) AS raw_count, + SUM(end - start) AS raw_bp + FROM features + ), + merged_stats AS ( SELECT - raw_count, - merged_count, - raw_bp, - merged_bp, - ROUND(100.0 * merged_bp / raw_bp, 2) AS coverage_retained_pct - FROM raw_stats, merged_stats - """) + COUNT(*) AS merged_count, + SUM(end - start) AS merged_bp + FROM (SELECT MERGE(interval) FROM features) + ) + SELECT + raw_count, + merged_count, + raw_bp, + merged_bp, + ROUND(100.0 * merged_bp / raw_bp, 2) AS coverage_retained_pct + FROM raw_stats, merged_stats **Use case:** Quantify the redundancy in your feature set. @@ -399,24 +359,22 @@ Cluster Then Merge First cluster features, then analyze each cluster: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - WITH clustered AS ( - SELECT *, CLUSTER(interval) AS cluster_id - FROM features - ) - SELECT - cluster_id, - MIN(chromosome) AS chromosome, - MIN(start_pos) AS start_pos, - MAX(end_pos) AS end_pos, - COUNT(*) AS feature_count, - STRING_AGG(name, ',') AS features - FROM clustered - GROUP BY cluster_id - ORDER BY chromosome, start_pos - """) + WITH clustered AS ( + SELECT *, CLUSTER(interval) AS cluster_id + FROM features + ) + SELECT + cluster_id, + MIN(chrom) AS chrom, + MIN(start) AS start, + MAX(end) AS end, + COUNT(*) AS feature_count, + STRING_AGG(name, ',') AS features + FROM clustered + GROUP BY cluster_id + ORDER BY chrom, start **Use case:** Alternative to MERGE that preserves cluster identifiers. @@ -425,26 +383,24 @@ Hierarchical Clustering Apply multiple clustering levels: -.. code-block:: python - - cursor = engine.execute(""" - WITH level1 AS ( - SELECT *, CLUSTER(interval, 0) AS cluster_l1 - FROM features - ), - level2 AS ( - SELECT *, CLUSTER(interval, 1000) AS cluster_l2 - FROM level1 - ) - SELECT - cluster_l1, - cluster_l2, - chromosome, - name, - start_pos, - end_pos - FROM level2 - ORDER BY cluster_l2, cluster_l1, start_pos - """) +.. 
code-block:: sql + + WITH level1 AS ( + SELECT *, CLUSTER(interval, 0) AS cluster_l1 + FROM features + ), + level2 AS ( + SELECT *, CLUSTER(interval, 1000) AS cluster_l2 + FROM level1 + ) + SELECT + cluster_l1, + cluster_l2, + chrom, + name, + start, + end + FROM level2 + ORDER BY cluster_l2, cluster_l1, start **Use case:** Analyze feature relationships at multiple scales. diff --git a/docs/recipes/distance-queries.rst b/docs/recipes/distance-queries.rst index 41f9ede..c71a4ee 100644 --- a/docs/recipes/distance-queries.rst +++ b/docs/recipes/distance-queries.rst @@ -16,24 +16,22 @@ Distance Between Feature Pairs Calculate the distance between features in two tables: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - a.name AS feature_a, - b.name AS feature_b, - DISTANCE(a.interval, b.interval) AS distance - FROM features_a a - CROSS JOIN features_b b - WHERE a.chromosome = b.chromosome - ORDER BY a.name, distance - """) + SELECT + a.name AS feature_a, + b.name AS feature_b, + DISTANCE(a.interval, b.interval) AS distance + FROM features_a a + CROSS JOIN features_b b + WHERE a.chrom = b.chrom + ORDER BY a.name, distance **Use case:** Generate a distance matrix between regulatory elements and genes. .. note:: - Always include ``WHERE a.chromosome = b.chromosome`` to avoid comparing + Always include ``WHERE a.chrom = b.chrom`` to avoid comparing features on different chromosomes (which returns NULL for distance). Identify Overlapping vs Proximal @@ -41,23 +39,21 @@ Identify Overlapping vs Proximal Classify relationships based on distance: -.. 
code-block:: python - - cursor = engine.execute(""" - SELECT - p.name AS peak, - g.name AS gene, - DISTANCE(p.interval, g.interval) AS dist, - CASE - WHEN DISTANCE(p.interval, g.interval) = 0 THEN 'overlapping' - WHEN DISTANCE(p.interval, g.interval) <= 1000 THEN 'proximal (<1kb)' - WHEN DISTANCE(p.interval, g.interval) <= 10000 THEN 'nearby (<10kb)' - ELSE 'distant' - END AS relationship - FROM peaks p - CROSS JOIN genes g - WHERE p.chromosome = g.chromosome - """) +.. code-block:: sql + + SELECT + p.name AS peak, + g.name AS gene, + DISTANCE(p.interval, g.interval) AS dist, + CASE + WHEN DISTANCE(p.interval, g.interval) = 0 THEN 'overlapping' + WHEN DISTANCE(p.interval, g.interval) <= 1000 THEN 'proximal (<1kb)' + WHEN DISTANCE(p.interval, g.interval) <= 10000 THEN 'nearby (<10kb)' + ELSE 'distant' + END AS relationship + FROM peaks p + CROSS JOIN genes g + WHERE p.chrom = g.chrom **Use case:** Categorize peak-gene relationships for enhancer analysis. @@ -66,19 +62,17 @@ Filter by Maximum Distance Find feature pairs within a distance threshold: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - a.name, - b.name, - DISTANCE(a.interval, b.interval) AS dist - FROM features_a a - CROSS JOIN features_b b - WHERE a.chromosome = b.chromosome - AND DISTANCE(a.interval, b.interval) <= 50000 - ORDER BY dist - """) + SELECT + a.name, + b.name, + DISTANCE(a.interval, b.interval) AS dist + FROM features_a a + CROSS JOIN features_b b + WHERE a.chrom = b.chrom + AND DISTANCE(a.interval, b.interval) <= 50000 + ORDER BY dist **Use case:** Find regulatory elements within 50kb of genes. @@ -90,17 +84,15 @@ Find K Nearest Features For each peak, find the 3 nearest genes: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest - ORDER BY peaks.name, nearest.distance - """) + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest + ORDER BY peaks.name, nearest.distance **Use case:** Annotate ChIP-seq peaks with nearby genes. @@ -109,13 +101,11 @@ Nearest Feature to a Specific Location Find the 5 nearest genes to a specific genomic coordinate: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT name, distance - FROM NEAREST(genes, reference='chr1:1000000-1001000', k=5) - ORDER BY distance - """) + SELECT name, distance + FROM NEAREST(genes, reference='chr1:1000000-1001000', k=5) + ORDER BY distance **Use case:** Explore the genomic neighborhood of a position of interest. @@ -124,22 +114,20 @@ Nearest with Distance Constraint Find nearest features within a maximum distance: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=5, - max_distance=100000 - ) AS nearest - ORDER BY peaks.name, nearest.distance - """) +.. code-block:: sql + + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=100000 + ) AS nearest + ORDER BY peaks.name, nearest.distance **Use case:** Find regulatory targets within 100kb, ignoring distant genes. @@ -151,23 +139,21 @@ Same-Strand Nearest Neighbors Find nearest features on the same strand only: -.. 
code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.strand, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=3, - stranded=true - ) AS nearest - ORDER BY peaks.name, nearest.distance - """) +.. code-block:: sql + + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.strand, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=3, + stranded=true + ) AS nearest + ORDER BY peaks.name, nearest.distance **Use case:** Find same-strand genes for strand-specific regulatory analysis. @@ -179,23 +165,21 @@ Upstream Features Find features upstream (5') of reference positions using signed distances: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=10, - signed=true - ) AS nearest - WHERE nearest.distance < 0 - ORDER BY peaks.name, nearest.distance DESC - """) +.. code-block:: sql + + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=10, + signed=true + ) AS nearest + WHERE nearest.distance < 0 + ORDER BY peaks.name, nearest.distance DESC **Use case:** Find genes upstream of regulatory elements. @@ -209,23 +193,21 @@ Downstream Features Find features downstream (3') of reference positions: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=10, - signed=true - ) AS nearest - WHERE nearest.distance > 0 - ORDER BY peaks.name, nearest.distance - """) +.. 
code-block:: sql + + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=10, + signed=true + ) AS nearest + WHERE nearest.distance > 0 + ORDER BY peaks.name, nearest.distance **Use case:** Identify downstream targets of promoter elements. @@ -234,23 +216,21 @@ Promoter-Proximal Analysis Find features within a specific distance window around the reference: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=10, - signed=true - ) AS nearest - WHERE nearest.distance BETWEEN -2000 AND 500 - ORDER BY peaks.name, ABS(nearest.distance) - """) +.. code-block:: sql + + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=10, + signed=true + ) AS nearest + WHERE nearest.distance BETWEEN -2000 AND 500 + ORDER BY peaks.name, ABS(nearest.distance) **Use case:** Find genes with peaks in their promoter regions (-2kb to +500bp from TSS). @@ -262,25 +242,23 @@ Strand-Specific with Distance Constraint Find nearby same-strand features: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=5, - max_distance=50000, - stranded=true, - signed=true - ) AS nearest - WHERE nearest.distance BETWEEN -10000 AND 10000 - ORDER BY peaks.name, ABS(nearest.distance) - """) +.. 
code-block:: sql + + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=50000, + stranded=true, + signed=true + ) AS nearest + WHERE nearest.distance BETWEEN -10000 AND 10000 + ORDER BY peaks.name, ABS(nearest.distance) **Use case:** Find same-strand genes within ±10kb for promoter-enhancer analysis. @@ -292,23 +270,21 @@ Average Distance to Nearest Gene Calculate the average distance from peaks to their nearest gene: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - WITH nearest_genes AS ( - SELECT - peaks.name AS peak, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest - ) + WITH nearest_genes AS ( SELECT - COUNT(*) AS peak_count, - AVG(distance) AS avg_distance, - MIN(distance) AS min_distance, - MAX(distance) AS max_distance - FROM nearest_genes - """) + peaks.name AS peak, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest + ) + SELECT + COUNT(*) AS peak_count, + AVG(distance) AS avg_distance, + MIN(distance) AS min_distance, + MAX(distance) AS max_distance + FROM nearest_genes **Use case:** Characterize the genomic distribution of peaks relative to genes. @@ -317,25 +293,23 @@ Distance Distribution by Chromosome Analyze distance patterns per chromosome: -.. code-block:: python - - cursor = engine.execute(""" - WITH nearest_genes AS ( - SELECT - peaks.chromosome, - peaks.name AS peak, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest - ) +.. 
code-block:: sql + + WITH nearest_genes AS ( SELECT - chromosome, - COUNT(*) AS peak_count, - AVG(distance) AS avg_distance - FROM nearest_genes - GROUP BY chromosome - ORDER BY chromosome - """) + peaks.chrom, + peaks.name AS peak, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest + ) + SELECT + chrom, + COUNT(*) AS peak_count, + AVG(distance) AS avg_distance + FROM nearest_genes + GROUP BY chrom + ORDER BY chrom **Use case:** Compare regulatory element distribution across chromosomes. @@ -347,26 +321,24 @@ Expand Search Window Find features within an expanded window around each feature: -.. code-block:: python - - cursor = engine.execute(""" - WITH expanded AS ( - SELECT - name, - chromosome, - start_pos - 5000 AS search_start, - end_pos + 5000 AS search_end - FROM peaks - ) +.. code-block:: sql + + WITH expanded AS ( SELECT - e.name AS peak, - b.* - FROM expanded e - JOIN features_b b - ON b.chromosome = e.chromosome - AND b.start_pos < e.search_end - AND b.end_pos > e.search_start - """) + name, + chrom, + start - 5000 AS search_start, + end + 5000 AS search_end + FROM peaks + ) + SELECT + e.name AS peak, + b.* + FROM expanded e + JOIN features_b b + ON b.chrom = e.chrom + AND b.start < e.search_end + AND b.end > e.search_start **Use case:** Find all features within 5kb flanking regions. diff --git a/docs/recipes/index.rst b/docs/recipes/index.rst index f5d7a2c..5597846 100644 --- a/docs/recipes/index.rst +++ b/docs/recipes/index.rst @@ -11,34 +11,21 @@ using GIQL. Each recipe focuses on a specific use case with ready-to-use query p Getting Started with Recipes ---------------------------- -All recipes assume you have set up a GIQL engine and registered your table schemas: +All recipes show GIQL queries that you can transpile and execute on your database. +Setup: .. 
code-block:: python - from giql import GIQLEngine - - with GIQLEngine(target_dialect="duckdb") as engine: - # Load your data - engine.load_csv("features_a", "file_a.bed") - engine.load_csv("features_b", "file_b.bed") - - # Register schemas with genomic column mapping - for table in ["features_a", "features_b"]: - engine.register_table_schema( - table, - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - "score": "FLOAT", - "strand": "VARCHAR", - }, - genomic_column="interval", - ) - - # Now run queries from the recipes below - cursor = engine.execute("...") + from giql import transpile + + # Transpile any GIQL query to SQL + sql = transpile( + "... GIQL query from the recipes below ...", + tables=["features_a", "features_b"], + ) + + # Then execute the SQL on your database connection + # e.g., conn.execute(sql) Recipe Categories ----------------- diff --git a/docs/recipes/intersect-queries.rst b/docs/recipes/intersect-queries.rst index fee0324..ef7c022 100644 --- a/docs/recipes/intersect-queries.rst +++ b/docs/recipes/intersect-queries.rst @@ -16,13 +16,11 @@ Basic Overlap Query Find all features in table A that overlap with any feature in table B: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT DISTINCT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT DISTINCT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval **Use case:** Identify variants that fall within gene regions. @@ -32,13 +30,11 @@ Get All Overlap Pairs Return every pair of overlapping features (may produce duplicates if one feature overlaps multiple others): -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT a.*, b.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT a.*, b.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval **Use case:** Generate a full overlap matrix for downstream analysis. @@ -47,12 +43,10 @@ Query Against a Specific Region Find features overlapping a literal genomic range: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000000-2000000' - """) + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000000-2000000' **Use case:** Extract all data for a specific chromosomal region. @@ -64,14 +58,12 @@ Excluding Overlaps Find features in A that do NOT overlap with any feature in B: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - WHERE b.chromosome IS NULL - """) + SELECT a.* + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + WHERE b.chrom IS NULL **Use case:** Find regulatory regions that don't overlap with known genes, or identify variants outside of exonic regions. @@ -81,13 +73,11 @@ Features with Any Overlap (Unique) Return each feature from A only once, regardless of how many B features it overlaps: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT DISTINCT a.* - FROM features_a a - INNER JOIN features_b b ON a.interval INTERSECTS b.interval - """) + SELECT DISTINCT a.* + FROM features_a a + INNER JOIN features_b b ON a.interval INTERSECTS b.interval **Use case:** Get a deduplicated list of features that have at least one overlap. @@ -99,14 +89,12 @@ Count Overlapping Features Count how many B features each A feature overlaps: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT a.*, COUNT(b.name) AS overlap_count - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand - """) + SELECT a.*, COUNT(b.name) AS overlap_count + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chrom, a.start, a.end, a.name, a.score, a.strand **Use case:** Calculate how many enhancers each gene overlaps with, or count variants per feature. @@ -116,15 +104,13 @@ Filter by Overlap Count Find features that overlap at least N other features: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a - INNER JOIN features_b b ON a.interval INTERSECTS b.interval - GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand - HAVING COUNT(*) >= 3 - """) + SELECT a.* + FROM features_a a + INNER JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chrom, a.start, a.end, a.name, a.score, a.strand + HAVING COUNT(*) >= 3 **Use case:** Identify hotspot regions with high feature density. @@ -136,14 +122,12 @@ Same-Strand Overlaps Find overlapping features on the same strand: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.*, b.name AS b_name - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - AND a.strand = b.strand - """) + SELECT a.*, b.name AS b_name + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand = b.strand **Use case:** Find sense-strand overlaps for transcript analysis. @@ -152,16 +136,14 @@ Opposite-Strand Overlaps Find overlapping features on opposite strands: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT a.*, b.name AS b_name - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - AND a.strand != b.strand - AND a.strand IN ('+', '-') - AND b.strand IN ('+', '-') - """) + SELECT a.*, b.name AS b_name + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand != b.strand + AND a.strand IN ('+', '-') + AND b.strand IN ('+', '-') **Use case:** Identify antisense overlaps or convergent transcription. @@ -173,16 +155,14 @@ Minimum Overlap Fraction of A Find overlaps where at least 50% of feature A is covered: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - AND ( - LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos) - ) >= 0.5 * (a.end_pos - a.start_pos) - """) + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND ( + LEAST(a.end, b.end) - GREATEST(a.start, b.start) + ) >= 0.5 * (a.end - a.start) **Use case:** Ensure substantial overlap rather than just touching edges. @@ -191,16 +171,14 @@ Minimum Overlap Fraction of B Find overlaps where at least 50% of feature B is covered: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - AND ( - LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos) - ) >= 0.5 * (b.end_pos - b.start_pos) - """) + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND ( + LEAST(a.end, b.end) - GREATEST(a.start, b.start) + ) >= 0.5 * (b.end - b.start) **Use case:** Find features that substantially cover smaller annotations. @@ -209,24 +187,22 @@ Reciprocal Overlap Require both features to have at least 50% mutual overlap: -.. 
code-block:: python - - cursor = engine.execute(""" - WITH overlap_calcs AS ( - SELECT - a.*, - b.name AS b_name, - (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp, - (a.end_pos - a.start_pos) AS a_length, - (b.end_pos - b.start_pos) AS b_length - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - ) - SELECT * - FROM overlap_calcs - WHERE overlap_bp >= 0.5 * a_length - AND overlap_bp >= 0.5 * b_length - """) +.. code-block:: sql + + WITH overlap_calcs AS ( + SELECT + a.*, + b.name AS b_name, + (LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS overlap_bp, + (a.end - a.start) AS a_length, + (b.end - b.start) AS b_length + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + ) + SELECT * + FROM overlap_calcs + WHERE overlap_bp >= 0.5 * a_length + AND overlap_bp >= 0.5 * b_length **Use case:** Find high-confidence overlaps where features mutually cover each other. @@ -238,13 +214,11 @@ Left Outer Join Report all features from A, with B information where available: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.*, b.name AS overlapping_feature - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - """) + SELECT a.*, b.name AS overlapping_feature + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval **Use case:** Annotate features with overlap information while keeping all records. @@ -253,16 +227,14 @@ Calculate Overlap Amount Return the overlap size in base pairs: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT - a.*, - b.name AS b_name, - (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT + a.*, + b.name AS b_name, + (LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS overlap_bp + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval **Use case:** Quantify the extent of each overlap. @@ -271,19 +243,17 @@ Overlap with NULL Handling Report overlap amount for all A features, with 0 for non-overlapping: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - a.*, - b.name AS b_name, - CASE - WHEN b.chromosome IS NULL THEN 0 - ELSE LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos) - END AS overlap_bp - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - """) + SELECT + a.*, + b.name AS b_name, + CASE + WHEN b.chrom IS NULL THEN 0 + ELSE LEAST(a.end, b.end) - GREATEST(a.start, b.start) + END AS overlap_bp + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval **Use case:** Create a complete overlap report including non-overlapping features. @@ -295,26 +265,18 @@ Union Multiple Sources Intersect A with features from multiple B tables: -.. code-block:: python - - # Load and register multiple tables first - engine.load_csv("features_b1", "file1.bed") - engine.load_csv("features_b2", "file2.bed") - engine.load_csv("features_b3", "file3.bed") - # Register schemas for each... - - cursor = engine.execute(""" - WITH all_b_features AS ( - SELECT * FROM features_b1 - UNION ALL - SELECT * FROM features_b2 - UNION ALL - SELECT * FROM features_b3 - ) - SELECT DISTINCT a.* - FROM features_a a - INNER JOIN all_b_features b ON a.interval INTERSECTS b.interval - """) +.. 
code-block:: sql + + WITH all_b_features AS ( + SELECT * FROM features_b1 + UNION ALL + SELECT * FROM features_b2 + UNION ALL + SELECT * FROM features_b3 + ) + SELECT DISTINCT a.* + FROM features_a a + INNER JOIN all_b_features b ON a.interval INTERSECTS b.interval **Use case:** Find features overlapping any region from multiple annotation sources. @@ -323,20 +285,18 @@ Track Overlap Source Know which source table each overlap came from: -.. code-block:: python - - cursor = engine.execute(""" - WITH all_b_features AS ( - SELECT *, 'source1' AS source FROM features_b1 - UNION ALL - SELECT *, 'source2' AS source FROM features_b2 - UNION ALL - SELECT *, 'source3' AS source FROM features_b3 - ) - SELECT a.*, b.name AS overlap_name, b.source - FROM features_a a - INNER JOIN all_b_features b ON a.interval INTERSECTS b.interval - """) +.. code-block:: sql + + WITH all_b_features AS ( + SELECT *, 'source1' AS source FROM features_b1 + UNION ALL + SELECT *, 'source2' AS source FROM features_b2 + UNION ALL + SELECT *, 'source3' AS source FROM features_b3 + ) + SELECT a.*, b.name AS overlap_name, b.source + FROM features_a a + INNER JOIN all_b_features b ON a.interval INTERSECTS b.interval **Use case:** Track which annotation database each overlap originated from. @@ -348,16 +308,14 @@ Overlap with Quality Filters Combine spatial and attribute filters: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT v.*, g.name AS gene_name - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - WHERE v.quality >= 30 - AND g.biotype = 'protein_coding' - ORDER BY v.chromosome, v.start_pos - """) + SELECT v.*, g.name AS gene_name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE v.quality >= 30 + AND g.biotype = 'protein_coding' + ORDER BY v.chrom, v.start **Use case:** Find high-quality variants in protein-coding genes. 
@@ -366,14 +324,12 @@ Specific Target Genes Find overlaps with a specific set of genes: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT v.*, g.name AS gene_name - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - WHERE g.name IN ('BRCA1', 'BRCA2', 'TP53', 'EGFR') - ORDER BY g.name, v.start_pos - """) + SELECT v.*, g.name AS gene_name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE g.name IN ('BRCA1', 'BRCA2', 'TP53', 'EGFR') + ORDER BY g.name, v.start **Use case:** Extract variants in clinically relevant genes. diff --git a/docs/transpilation/api-reference.rst b/docs/transpilation/api-reference.rst new file mode 100644 index 0000000..fcba984 --- /dev/null +++ b/docs/transpilation/api-reference.rst @@ -0,0 +1,13 @@ +API Reference +============= + +.. currentmodule:: giql + +.. autosummary:: + + transpile + Table + +.. autofunction:: transpile + +.. autoclass:: Table diff --git a/docs/transpilation/execution.rst b/docs/transpilation/execution.rst new file mode 100644 index 0000000..72ea9de --- /dev/null +++ b/docs/transpilation/execution.rst @@ -0,0 +1,152 @@ +Execution +========= + +How to use transpiled SQL +------------------------- + +You can write queries in the GIQL dialect and execute them on any SQL-92 +compliant database or analytics engine, without needing native GIQL support. + +With external database connections +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use transpiled SQL with your own database connections: + +.. code-block:: python + + import duckdb + from giql import transpile + + sql = transpile( + """ + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["variants"], + ) + + conn = duckdb.connect("my_database.duckdb") + result = conn.execute(sql).fetchall() + conn.close() + +With ORMs and query builders +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Integrate transpiled SQL with SQLAlchemy or other ORMs: + +.. 
code-block:: python + + from sqlalchemy import create_engine, text + from giql import transpile + + sql = transpile( + """ + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["variants"], + ) + + engine = create_engine("duckdb:///my_database.duckdb") + with engine.connect() as conn: + result = conn.execute(text(sql)) + for row in result: + print(row) + +Building SQL pipelines +~~~~~~~~~~~~~~~~~~~~~~ + +Use transpilation in data pipelines: + +.. code-block:: python + + from giql import transpile + + def build_intersection_query(table_a, table_b, region): + """Generate SQL for intersection query.""" + return transpile( + f""" + SELECT a.*, b.name + FROM {table_a} a + JOIN {table_b} b ON a.interval INTERSECTS b.interval + WHERE a.interval INTERSECTS '{region}' + """, + tables=[table_a, table_b], + ) + + # Use in pipeline + sql = build_intersection_query("variants", "genes", "chr1:1000000-2000000") + # Execute sql with your preferred method + +Saving queries +~~~~~~~~~~~~~~ + +Save transpiled SQL for documentation or reuse: + +.. code-block:: python + + import duckdb + from giql import transpile + + sql = transpile( + """ + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["variants"], + ) + + with open("query.sql", "w") as f: + f.write(sql) + + # Later, execute saved SQL + with open("query.sql") as f: + sql = f.read() + + conn = duckdb.connect("database.duckdb") + result = conn.execute(sql).fetchall() + +Parameterized queries +~~~~~~~~~~~~~~~~~~~~~ + +Build queries with parameters: + +.. 
code-block:: python
+
+    from giql import transpile
+
+    def query_region(chrom, start, end):
+        """Transpile a parameterized region query."""
+        region = f"{chrom}:{start}-{end}"
+        return transpile(
+            f"""
+            SELECT * FROM variants
+            WHERE interval INTERSECTS '{region}'
+            """,
+            tables=["variants"],
+        )
+
+    # Use with different regions
+    sql = query_region("chr1", 1000000, 2000000)
+    sql = query_region("chr2", 5000000, 6000000)
+
+Dynamic query building
+~~~~~~~~~~~~~~~~~~~~~~
+
+Build queries programmatically:
+
+.. code-block:: python
+
+    from giql import transpile
+
+    def build_multi_table_query(tables, target_region):
+        """Build a query that unions results from multiple tables."""
+        union_parts = []
+        for table in tables:
+            union_parts.append(f"""
+                SELECT *, '{table}' AS source FROM {table}
+                WHERE interval INTERSECTS '{target_region}'
+            """)
+
+        query = " UNION ALL ".join(union_parts)
+        return transpile(query, tables=list(tables))
diff --git a/docs/transpilation/index.rst b/docs/transpilation/index.rst
new file mode 100644
index 0000000..e5e743b
--- /dev/null
+++ b/docs/transpilation/index.rst
@@ -0,0 +1,210 @@
+Transpilation
+=============
+
+The ``giql`` Python package transpiles GIQL into SQL.
+
+How it works
+------------
+
+When you do this:
+
+.. code-block:: python
+
+    from giql import transpile
+
+    sql = transpile(
+        "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'",
+        tables=["variants"],
+    )
+
+    print(sql)
+
+The transpiler performs three main steps:
+
+1. **Parses** the GIQL query into an abstract syntax tree (AST) to identify GIQL-specific operators
+2. **Transforms** genomic operators into SQL predicates and Common Table Expressions (CTEs), and replaces genomic pseudo-columns with actual column references
+3. **Generates** SQL output from the modified AST
+
+The result is a standard SQL query that can be consumed by an execution engine that is not genome-aware.
+
+.. 
code-block:: sql + + SELECT * FROM variants + WHERE "chrom" = 'chr1' AND "start" < 2000 AND "end" > 1000 + + +Examples +-------- + +Each GIQL operator expands to specific SQL patterns. + +**INTERSECTS** expands to range overlap checks: + +.. tab-set:: + + .. tab-item:: GIQL + + .. code-block:: sql + + a.interval INTERSECTS b.interval + + .. tab-item:: SQL + + .. code-block:: sql + + a."chrom" = b."chrom" + AND a."start" < b."end" + AND a."end" > b."start" + +**CONTAINS** expands to containment checks: + +.. tab-set:: + + .. tab-item:: GIQL + + .. code-block:: sql + + a.interval CONTAINS b.interval + + .. tab-item:: SQL + + .. code-block:: sql + + a."chrom" = b."chrom" + AND a."start" <= b."start" + AND a."end" >= b."end" + +**DISTANCE** expands to gap calculations: + +.. tab-set:: + + .. tab-item:: GIQL + + .. code-block:: sql + + DISTANCE(a.interval, b.interval) + + .. tab-item:: SQL + + .. code-block:: sql + + CASE + WHEN a."chrom" != b."chrom" THEN NULL + WHEN a."end" <= b."start" THEN b."start" - a."end" + WHEN b."end" <= a."start" THEN a."start" - b."end" + ELSE 0 + END + +**Intersection joins** expand to inequality joins: + +.. tab-set:: + + .. tab-item:: GIQL + + .. code-block:: sql + + SELECT v.*, g.name AS gene_name + FROM variants v + JOIN genes g ON v.interval INTERSECTS g.interval + WHERE v.quality >= 30 + + .. tab-item:: SQL + + .. code-block:: sql + + SELECT v.*, g.name AS gene_name + FROM variants AS v + JOIN genes AS g + ON v."chrom" = g."chrom" + AND v."start" < g."end" + AND v."end" > g."start" + WHERE v.quality >= 30 + +**NEAREST** expands to lateral subqueries: + +.. tab-set:: + + .. tab-item:: GIQL + + .. code-block:: sql + + SELECT peaks.name, nearest.name, nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, reference=peaks.interval, k=5 + ) AS nearest + + .. tab-item:: SQL + + .. 
code-block:: sql + + SELECT peaks.name, nearest.name, nearest.distance + FROM peaks + CROSS JOIN LATERAL ( + SELECT + genes.*, + CASE + WHEN peaks."chrom" != genes."chrom" THEN NULL + WHEN peaks."start" < genes."end" + AND peaks."end" > genes."start" THEN 0 + WHEN peaks."end" <= genes."start" + THEN genes."start" - peaks."end" + ELSE peaks."start" - genes."end" + END AS distance + FROM genes + WHERE peaks."chrom" = genes."chrom" + ORDER BY ABS( + CASE + WHEN peaks."chrom" != genes."chrom" THEN NULL + WHEN peaks."start" < genes."end" + AND peaks."end" > genes."start" THEN 0 + WHEN peaks."end" <= genes."start" + THEN genes."start" - peaks."end" + ELSE peaks."start" - genes."end" + END + ) + LIMIT 5 + ) AS nearest + +**MERGE** expands to window-function-based clustering: + +.. tab-set:: + + .. tab-item:: GIQL + + .. code-block:: sql + + SELECT MERGE(interval), COUNT(*) AS count + FROM features + + .. tab-item:: SQL + + .. code-block:: sql + + SELECT + "chrom", + MIN("start") AS start, + MAX("end") AS end, + COUNT(*) AS count + FROM ( + SELECT + *, + SUM(is_new_cluster) OVER ( + PARTITION BY "chrom" + ORDER BY "start" NULLS LAST + ) AS __giql_cluster_id + FROM ( + SELECT + *, + CASE + WHEN LAG("end") OVER ( + PARTITION BY "chrom" + ORDER BY "start" NULLS LAST + ) >= "start" THEN 0 + ELSE 1 + END AS is_new_cluster + FROM features + ) AS lag_calc + ) AS clustered + GROUP BY chrom, __giql_cluster_id + ORDER BY "chrom" NULLS LAST, "start" NULLS LAST diff --git a/src/giql/__init__.py b/src/giql/__init__.py index 064f546..71e895d 100644 --- a/src/giql/__init__.py +++ b/src/giql/__init__.py @@ -1,13 +1,6 @@ """GIQL - Genomic Interval Query Language. A SQL dialect for genomic range queries. 
- -This package provides: - - GIQL dialect extending SQL with spatial operators (INTERSECTS, CONTAINS, WITHIN) - - CLUSTER and MERGE operations for interval grouping - - NEAREST operator for finding closest intervals - - Range parser for genomic coordinate strings - - Transpilation to standard SQL-92 compatible output """ from giql.table import Table From 9be3fdc37f655f74ee7c5393daacd453a024eeab Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 10 Feb 2026 01:28:20 -0500 Subject: [PATCH 2/3] Add doc deps and use dependency-groups --- pyproject.toml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 766b52f..59d41a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ name = "giql" readme = "README.md" requires-python = ">=3.11" -[project.optional-dependencies] +[dependency-groups] dev = [ "duckdb>=1.4.0", "hypothesis>=6.0.0", @@ -37,6 +37,12 @@ dev = [ "pytest>=7.0.0", "ruff>=0.1.0", ] +docs = [ + "sphinx>=7.0", + "sphinx-autobuild>=2024.0", + "sphinx-book-theme>=1.1", + "sphinx-design>=0.6", +] [tool.hatch.metadata.hooks.custom] path = "build-hooks/metadata.py" From aff3b356f0b653ea89267fb0fe9f81925e73cab5 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 10 Feb 2026 01:28:29 -0500 Subject: [PATCH 3/3] Simplify readme --- README.md | 140 +++++++++++++++--------------------------------------- 1 file changed, 38 insertions(+), 102 deletions(-) diff --git a/README.md b/README.md index 6b4d368..6fc0b1c 100644 --- a/README.md +++ b/README.md @@ -1,79 +1,34 @@ # GIQL - Genomic Interval Query Language -A SQL dialect for genomic range queries. Transpiles to standard SQL. + +

+ docs | + syntax | + transpiler +

+
+GIQL is an extended SQL dialect that allows you to declaratively express genomic interval operations. -## Overview +The `giql` Python package transpiles GIQL queries into standard SQL syntax for execution on any database or analytics engine. -GIQL extends SQL with spatial operators for genomic interval queries. It transpiles GIQL queries into standard SQL that can be executed on any database backend. - -GIQL provides a familiar SQL syntax for bioinformatics workflows, allowing you to express complex genomic range operations without writing intricate SQL expressions. Whether you're filtering variants by genomic region, finding overlapping features, or calculating distances between intervals, GIQL makes these operations intuitive and portable. - -## Features - -- **SQL-based**: Familiar SQL syntax with genomic extensions -- **Spatial operators**: INTERSECTS, CONTAINS, WITHIN for range relationships -- **Distance operators**: DISTANCE, NEAREST for proximity queries -- **Aggregation operators**: CLUSTER, MERGE for combining intervals -- **Set quantifiers**: ANY, ALL for multi-range queries -- **Transpilation**: Converts GIQL to standard SQL for execution on any backend +> **Note:** This project is in active development — APIs, syntax, and behavior may change. ## Installation -### From PyPI - -Install the latest stable release: +To install the transpiler: ```bash pip install giql ``` -Or the latest release candidate: - -```bash -pip install --pre giql -``` - -### From Source - -Clone the repository and install locally: - -```bash -# Clone the repository -git clone https://github.com/abdenlab/giql.git -cd giql - -# Install in development mode -pip install -e . 
- -# Or with development dependencies -pip install -e ".[dev]" -``` - -### Building Documentation - -To build the documentation locally: - -```bash -cd docs - -# Install documentation dependencies -pip install -r requirements.txt - -# Build HTML documentation -make html - -# View the documentation -# The built docs will be in docs/_build/html/ -# Open docs/_build/html/index.html in your browser -``` +## Usage (transpilation) -## Quick Start +The `giql` package transpiles GIQL queries to standard SQL. ```python from giql import transpile -# Transpile a GIQL query to standard SQL sql = transpile( "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", tables=["peaks"], @@ -81,7 +36,7 @@ sql = transpile( print(sql) ``` -With custom column mappings: +Each table referenced in a GIQL query exposes a genomic "pseudo-column" that maps to separate logical chromosome, start, end, and strand columns. You can customize the column mappings. ```python from giql import Table, transpile @@ -98,9 +53,12 @@ sql = transpile( ) ], ) +print(sql) ``` -Execution example with DuckDB: +The transpiled SQL can be executed with fast genome-unaware databases or in-memory analytic engines like DuckDB. + +You can also use [oxbow](https://oxbow.readthedocs.io) to efficiently stream specialized genomics formats into DuckDB. 
```python
import duckdb
@@ -108,57 +66,35 @@ import oxbow as ox
from giql import transpile

conn = duckdb.connect()
-peaks = ox.from_bed("peaks.bed", bed_schema="bed6+4").to_duckdb(conn) # streaming source
+
+# Load a streaming data source as a DuckDB relation
+peaks = ox.from_bed("peaks.bed", bed_schema="bed6+4").to_duckdb(conn)

sql = transpile(
    "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'",
    tables=["peaks"],
)
+
+# Execute and return the output as a dataframe
df = conn.execute(sql).fetchdf()
```

-## Operators at a Glance
-
-### Spatial Relationships
-
-| Operator | Description |
-|----------|-------------|
-| `INTERSECTS` | Returns true when ranges overlap by at least one base pair |
-| `CONTAINS` | Returns true when one range fully contains another |
-| `WITHIN` | Returns true when one range is fully within another |
-
-### Distance and Proximity
-
-| Operator | Description |
-|----------|-------------|
-| `DISTANCE` | Calculate genomic distance between two intervals |
-| `NEAREST` | Find k-nearest genomic features |
-
-### Aggregation
-
-| Operator | Description |
-|----------|-------------|
-| `CLUSTER` | Assign cluster IDs to overlapping intervals |
-| `MERGE` | Combine overlapping intervals into unified regions |
-
-### Set Quantifiers
-
-| Quantifier | Description |
-|------------|-------------|
-| `ANY` | Match if condition holds for any of the specified ranges |
-| `ALL` | Match if condition holds for all of the specified ranges |
-
-## Documentation
-
-For complete documentation, build the docs locally (see above) or visit the hosted documentation. 
+

The documentation includes:

- **Operator Reference**: Detailed documentation for each operator with examples
- **Recipes**: Common query patterns for intersections, distance calculations, and clustering
- **Bedtools Migration Guide**: How to replicate bedtools operations with GIQL
- **Guides**: Performance optimization, multi-backend configuration, and schema mapping

## Development

This project is in active development.

```bash
git clone https://github.com/abdenlab/giql.git
cd giql
uv sync
```

To build the documentation locally:

```bash
uv run --group docs sphinx-build docs docs/_build
# The built docs will be in docs/_build/
```

To serve the docs locally with automatic rebuild:

```bash
uv run --group docs sphinx-autobuild docs docs/_build
```