From 98046edfb273e9292a91424f4bee8aa03e84dc00 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Thu, 12 Feb 2026 23:18:34 -0500 Subject: [PATCH 1/3] Unnest and prune documentation --- docs/dialect/aggregation-operators.rst | 49 +---- docs/dialect/distance-operators.rst | 62 +------ docs/dialect/index.rst | 41 +++- docs/dialect/quantifiers.rst | 53 +----- docs/dialect/spatial-operators.rst | 75 +------- docs/guides/index.rst | 26 --- docs/guides/quickstart.rst | 175 ------------------ docs/index.rst | 42 +++-- .../syntax-reference.rst => quickstart.rst} | 120 +++++++++--- .../{advanced-queries.rst => advanced.rst} | 4 - docs/recipes/bedtools-migration.rst | 8 +- ...{clustering-queries.rst => clustering.rst} | 8 +- .../{distance-queries.rst => distance.rst} | 8 +- docs/recipes/index.rst | 43 +---- .../{intersect-queries.rst => intersect.rst} | 8 +- docs/transpilation/index.rst | 6 + .../{guides => transpilation}/performance.rst | 165 ++++++++++------- .../schema-mapping.rst | 37 ---- 18 files changed, 290 insertions(+), 640 deletions(-) delete mode 100644 docs/guides/index.rst delete mode 100644 docs/guides/quickstart.rst rename docs/{dialect/syntax-reference.rst => quickstart.rst} (58%) rename docs/recipes/{advanced-queries.rst => advanced.rst} (99%) rename docs/recipes/{clustering-queries.rst => clustering.rst} (98%) rename docs/recipes/{distance-queries.rst => distance.rst} (98%) rename docs/recipes/{intersect-queries.rst => intersect.rst} (99%) rename docs/{guides => transpilation}/performance.rst (65%) rename docs/{guides => transpilation}/schema-mapping.rst (85%) diff --git a/docs/dialect/aggregation-operators.rst b/docs/dialect/aggregation-operators.rst index cc3d5ec..585cdc4 100644 --- a/docs/dialect/aggregation-operators.rst +++ b/docs/dialect/aggregation-operators.rst @@ -5,10 +5,6 @@ Aggregation operators combine and cluster genomic intervals. 
These operators are essential for reducing complex interval data into summarized regions, such as merging overlapping peaks or identifying clusters of related features. -.. contents:: - :local: - :depth: 1 - .. _cluster-operator: CLUSTER @@ -149,26 +145,6 @@ Find regions with multiple overlapping features: INNER JOIN cluster_sizes s ON c.cluster_id = s.cluster_id WHERE s.size >= 3 -Backend Compatibility -~~~~~~~~~~~~~~~~~~~~~ - -.. list-table:: - :header-rows: 1 - :widths: 20 20 60 - - * - Backend - - Support - - Notes - * - DuckDB - - Full - - Efficient window function implementation - * - SQLite - - Full - - - * - PostgreSQL - - Planned - - - Performance Notes ~~~~~~~~~~~~~~~~~ @@ -342,31 +318,10 @@ Calculate the total base pairs covered after merging: SELECT SUM(end - start) AS total_coverage FROM merged -Backend Compatibility -~~~~~~~~~~~~~~~~~~~~~ - -.. list-table:: - :header-rows: 1 - :widths: 20 20 60 - - * - Backend - - Support - - Notes - * - DuckDB - - Full - - - * - SQLite - - Full - - - * - PostgreSQL - - Planned - - - -Performance Notes -~~~~~~~~~~~~~~~~~ +Notes +~~~~~ - MERGE is an aggregate operation that processes all matching rows -- For very large datasets, consider filtering by chromosome first - The operation sorts data internally, so pre-sorting is not required Related Operators diff --git a/docs/dialect/distance-operators.rst b/docs/dialect/distance-operators.rst index 216bdcb..773965a 100644 --- a/docs/dialect/distance-operators.rst +++ b/docs/dialect/distance-operators.rst @@ -1,14 +1,10 @@ -Distance and Proximity +Distance and Neighbors ====================== Distance and proximity operators calculate genomic distances and find nearest features. These operators are essential for proximity analysis, such as finding genes near regulatory elements or variants near transcription start sites. -.. contents:: - :local: - :depth: 1 - .. 
_distance-operator: DISTANCE @@ -97,33 +93,12 @@ Distinguish between overlapping and nearby features: CROSS JOIN genes g WHERE p.chrom = g.chrom -Backend Compatibility -~~~~~~~~~~~~~~~~~~~~~ - -.. list-table:: - :header-rows: 1 - :widths: 20 20 60 - - * - Backend - - Support - - Notes - * - DuckDB - - Full - - - * - SQLite - - Full - - - * - PostgreSQL - - Planned - - - -Performance Notes -~~~~~~~~~~~~~~~~~ +Notes +~~~~~ - Always include ``WHERE a.chrom = b.chrom`` to avoid unnecessary cross-chromosome comparisons - For large datasets, consider pre-filtering by region before calculating distances -- Create indexes on chromosome and position columns for better performance Related Operators ~~~~~~~~~~~~~~~~~ @@ -332,39 +307,12 @@ Find nearby same-strand features within distance constraints: WHERE nearest.distance BETWEEN -10000 AND 10000 ORDER BY peaks.name, ABS(nearest.distance) -Backend Compatibility -~~~~~~~~~~~~~~~~~~~~~ - -.. list-table:: - :header-rows: 1 - :widths: 20 20 60 - - * - Backend - - Support - - Notes - * - DuckDB - - Full - - Efficient lateral join support - * - SQLite - - Partial - - Works but slower for large k values - * - PostgreSQL - - Planned - - - -Performance Notes -~~~~~~~~~~~~~~~~~ +Notes +~~~~~ - **Chromosome pre-filtering**: NEAREST automatically filters by chromosome for efficiency - **Use max_distance**: Specifying a maximum distance reduces the search space significantly - **Limit k**: Only request as many neighbors as you actually need -- **Create indexes**: Add indexes on ``(chrom, start, "end")`` for better performance - -.. 
code-block:: sql - - -- Create indexes for better NEAREST performance - CREATE INDEX idx_genes_position - ON genes (chrom, start, "end") Related Operators ~~~~~~~~~~~~~~~~~ diff --git a/docs/dialect/index.rst b/docs/dialect/index.rst index 48e7bb2..3d6167f 100644 --- a/docs/dialect/index.rst +++ b/docs/dialect/index.rst @@ -1,5 +1,5 @@ -Operators -========= +The GIQL Dialect +================ GIQL extends SQL with operators specifically designed for genomic interval queries. These operators enable powerful spatial reasoning over genomic coordinates without @@ -9,6 +9,29 @@ Operators are organized by functionality. All operators work across supported database backends (DuckDB, SQLite, with PostgreSQL planned). Each operator page includes a compatibility table showing backend support status. +Logical genomic range columns +----------------------------- + +GIQL allows you to reference *logical* genomic range columns in queries. Such logical +columns do not need to exist explicitly or materially in any of your data sources, +and no specialized composite data types are needed. Rather, a logical genomic range +column or "pseudo-column" can be mapped to physical columns in your source table that +contain the required information and use conventional data types (reference sequence name, +start and end coordinates, optional strand, etc.). + +In GIQL queries, you reference a logical genomic range column using a designated name +like ``interval``: + +.. code-block:: sql + + SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000' + +By providing a :doc:`schema mapping <../transpilation/schema-mapping>` for the genomic +range columns of each of the tables in a GIQL query, the GIQL transpiler can translate +range operations into standard SQL expressions to be consumed by a general-purpose +query engine. Alternatively, a GIQL-aware query engine could use the schema mapping +directly for optimization. 
+ Spatial Relationship Operators ------------------------------ @@ -97,11 +120,11 @@ Apply operators to multiple ranges simultaneously. See :doc:`quantifiers` for detailed documentation. -.. toctree:: - :maxdepth: 2 - :hidden: +.. .. toctree:: +.. :maxdepth: 2 +.. :hidden: - spatial-operators - distance-operators - aggregation-operators - quantifiers +.. spatial-operators +.. distance-operators +.. aggregation-operators +.. quantifiers diff --git a/docs/dialect/quantifiers.rst b/docs/dialect/quantifiers.rst index b10a38b..956714f 100644 --- a/docs/dialect/quantifiers.rst +++ b/docs/dialect/quantifiers.rst @@ -5,10 +5,6 @@ Set quantifiers extend spatial operators to work with multiple ranges simultaneo They allow you to test whether a genomic position matches any or all of a set of specified ranges in a single query. -.. contents:: - :local: - :depth: 1 - .. _any-quantifier: ANY @@ -112,32 +108,11 @@ Query across different chromosomes efficiently: 'chrX:100000-200000' ) -Backend Compatibility -~~~~~~~~~~~~~~~~~~~~~ - -.. list-table:: - :header-rows: 1 - :widths: 20 20 60 - - * - Backend - - Support - - Notes - * - DuckDB - - Full - - - * - SQLite - - Full - - - * - PostgreSQL - - Planned - - - -Performance Notes -~~~~~~~~~~~~~~~~~ +Notes +~~~~~ - ``ANY`` expands to multiple OR conditions in the generated SQL - For very large sets of ranges, consider using a separate table and JOIN instead -- The optimizer may benefit from indexes on chromosome and position columns Related ~~~~~~~ @@ -238,28 +213,8 @@ features in the intersection of multiple regions): -- This finds features that overlap BOTH ranges -- (i.e., features in the intersection: chr1:1500-2000) -Backend Compatibility -~~~~~~~~~~~~~~~~~~~~~ - -.. 
list-table:: - :header-rows: 1 - :widths: 20 20 60 - - * - Backend - - Support - - Notes - * - DuckDB - - Full - - - * - SQLite - - Full - - - * - PostgreSQL - - Planned - - - -Performance Notes -~~~~~~~~~~~~~~~~~ +Notes +~~~~~ - ``ALL`` expands to multiple AND conditions in the generated SQL - Queries with ``ALL`` may be more restrictive, potentially reducing result sets diff --git a/docs/dialect/spatial-operators.rst b/docs/dialect/spatial-operators.rst index fa1c7be..6bf4433 100644 --- a/docs/dialect/spatial-operators.rst +++ b/docs/dialect/spatial-operators.rst @@ -1,14 +1,10 @@ -Spatial Relationships -===================== +Spatial Operators +================= Spatial relationship operators test positional relationships between genomic ranges. These are the core operators for determining whether genomic intervals overlap, contain, or are contained within other intervals. -.. contents:: - :local: - :depth: 1 - .. _intersects-operator: INTERSECTS @@ -103,33 +99,6 @@ Find all variants, with gene information where available: FROM variants v LEFT JOIN genes g ON v.interval INTERSECTS g.interval -Backend Compatibility -~~~~~~~~~~~~~~~~~~~~~ - -.. list-table:: - :header-rows: 1 - :widths: 20 20 60 - - * - Backend - - Support - - Notes - * - DuckDB - - Full - - - * - SQLite - - Full - - - * - PostgreSQL - - Planned - - Targeted for future release - -Performance Notes -~~~~~~~~~~~~~~~~~ - -- Create indexes on ``(chrom, start, "end")`` for better join performance -- When joining large tables, consider filtering by chromosome first -- The generated SQL uses efficient range comparison predicates - Related Operators ~~~~~~~~~~~~~~~~~ @@ -220,26 +189,6 @@ Find variants that are completely within gene boundaries: FROM variants v INNER JOIN genes g ON g.interval CONTAINS v.interval -Backend Compatibility -~~~~~~~~~~~~~~~~~~~~~ - -.. 
list-table:: - :header-rows: 1 - :widths: 20 20 60 - - * - Backend - - Support - - Notes - * - DuckDB - - Full - - - * - SQLite - - Full - - - * - PostgreSQL - - Planned - - - Related Operators ~~~~~~~~~~~~~~~~~ @@ -316,26 +265,6 @@ Find exons that are completely within their parent gene: FROM exons e INNER JOIN genes g ON e.interval WITHIN g.interval -Backend Compatibility -~~~~~~~~~~~~~~~~~~~~~ - -.. list-table:: - :header-rows: 1 - :widths: 20 20 60 - - * - Backend - - Support - - Notes - * - DuckDB - - Full - - - * - SQLite - - Full - - - * - PostgreSQL - - Planned - - - Related Operators ~~~~~~~~~~~~~~~~~ diff --git a/docs/guides/index.rst b/docs/guides/index.rst deleted file mode 100644 index b7644d1..0000000 --- a/docs/guides/index.rst +++ /dev/null @@ -1,26 +0,0 @@ -Guides -====== - -Task-oriented guides for working with GIQL. These guides cover common workflows -and best practices for using GIQL effectively. - -.. toctree:: - :maxdepth: 2 - :hidden: - - schema-mapping - engine - performance - -:doc:`schema-mapping` - Learn how to configure GIQL to work with your genomic data, including - table configuration and mapping logical genomic columns. - -:doc:`engine` - Understand how to use GIQL's transpiled SQL with different - execution engines like DuckDB, SQLite, and PostgreSQL. - -:doc:`performance` - Optimize your GIQL queries for better performance with indexing strategies, - query patterns, and backend-specific tips. - diff --git a/docs/guides/quickstart.rst b/docs/guides/quickstart.rst deleted file mode 100644 index ef7c3ae..0000000 --- a/docs/guides/quickstart.rst +++ /dev/null @@ -1,175 +0,0 @@ -Quick Start -=========== - -GIQL provides a familiar SQL syntax for bioinformatics workflows, allowing -you to express complex genomic range operations without writing intricate -SQL expressions. GIQL queries read naturally, making your analysis code -easier to review and share. 
GIQL operators follow established conventions -around genomic spatial relationships, so the semantics are familiar and -predictable. - -- **Spatial operators**: INTERSECTS, CONTAINS, WITHIN for range relationships -- **Distance operators**: DISTANCE, NEAREST for proximity queries -- **Aggregation operators**: CLUSTER, MERGE for combining intervals -- **Set quantifiers**: ANY, ALL for multi-range queries -- **Range parsing**: Understands genomic range strings and coordinate systems -- **Transpilation**: Converts GIQL to standard SQL-92 compatible output for execution on any backend - -Installation ------------- - -Install GIQL using pip: - -.. code-block:: bash - - pip install giql - -Basic Usage ------------ - -Table Configuration -~~~~~~~~~~~~~~~~~~~ - -GIQL works with genomic data stored in tables with separate columns for chromosome, -start position, and end position. The default column names are: - -* **chrom**: Chromosome identifier (e.g., 'chr1', 'chr2', 'chrX') -* **start**: Start position of the genomic interval (0-based, inclusive) -* **end**: End position of the genomic interval (0-based, exclusive, half-open) -* **strand** (optional): Strand orientation ('+', '-', or '.') - -If your table uses the default column names, you can pass just the table name -as a string. For custom column names, use a ``Table`` object: - -.. code-block:: python - - from giql import Table, transpile - - # Default column names (chrom, start, end, strand) - sql = transpile(query, tables=["peaks"]) - - # Custom column names - sql = transpile( - query, - tables=[ - Table( - "variants", - genomic_col="interval", - chrom_col="chromosome", - start_col="start_pos", - end_col="end_pos", - ) - ], - ) - -After configuration, you can use the genomic pseudo-column (default: ``interval``) -in your GIQL queries, and the transpiler will automatically expand it to the -physical column comparisons. - -Query with DuckDB -~~~~~~~~~~~~~~~~~ - -.. 
code-block:: python - - import duckdb - from giql import transpile - - sql = transpile( - """ - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """, - tables=["variants"], - ) - - conn = duckdb.connect() - conn.execute("CREATE TABLE variants AS SELECT * FROM read_csv('variants.csv')") - df = conn.execute(sql).fetchdf() - -Query with SQLite -~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - import sqlite3 - from giql import transpile - - sql = transpile( - """ - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """, - tables=["variants"], - ) - - conn = sqlite3.connect("data.db") - cursor = conn.execute(sql) - for row in cursor: - print(row) - -Spatial Operators ------------------ - -INTERSECTS -~~~~~~~~~~ - -Check if genomic ranges overlap: - -.. code-block:: sql - - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - -CONTAINS -~~~~~~~~ - -Check if a range contains a point or another range: - -.. code-block:: sql - - SELECT * FROM variants - WHERE interval CONTAINS 'chr1:1500' - -WITHIN -~~~~~~ - -Check if a range is within another range: - -.. code-block:: sql - - SELECT * FROM variants - WHERE interval WITHIN 'chr1:1000-5000' - -Set Quantifiers ---------------- - -ANY -~~~ - -Match any of the specified ranges: - -.. code-block:: sql - - SELECT * FROM variants - WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') - -ALL -~~~ - -Match all of the specified ranges: - -.. code-block:: sql - - SELECT * FROM variants - WHERE interval CONTAINS ALL('chr1:1500', 'chr1:1600') - -Column-to-Column Joins ----------------------- - -Join tables on genomic position: - -.. 
code-block:: sql - - SELECT v.*, g.name - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval diff --git a/docs/index.rst b/docs/index.rst index 417faad..83b2834 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,11 +5,11 @@ Genomic Interval Query Language (GIQL) :hidden: Home - guides/quickstart + quickstart **GIQL** is an extended SQL dialect that allows you to declaratively express genomic interval operations. -See the :doc:`guides/quickstart` to get started. +See the :doc:`quickstart` to get started. Dialect ------- @@ -19,37 +19,51 @@ than how. Whether you're filtering variants by genomic region, finding overlapping features, or calculating distances between intervals, GIQL makes these operations intuitive and portable. +See the :doc:`GIQL dialect ` docs. + .. toctree:: - :maxdepth: 1 + :hidden: :caption: Dialect - dialect/index - dialect/syntax-reference + Overview + dialect/spatial-operators + dialect/distance-operators + dialect/aggregation-operators + dialect/quantifiers Transpilation ------------- The ``giql`` package *transpiles* queries written in GIQL to regular SQL -for use in existing database systems and analytics engines. +for use in existing database systems (like SQLite or PostgreSQL), data +warehouses, or analytics engines (like Polars and DuckDB). + +See the :doc:`GIQL transpiler ` docs. .. toctree:: - :maxdepth: 1 + :hidden: :caption: Transpilation - transpilation/index + Overview + transpilation/schema-mapping transpilation/execution + transpilation/performance transpilation/api-reference Learn more ---------- -See the following guides to learn how to use GIQL effectively: +See the following :doc:`recipes ` to learn how to use GIQL effectively. .. 
toctree:: - :maxdepth: 1 - :caption: Guides and Recipes - - guides/index - recipes/index + :hidden: + :caption: Recipes + + Overview + recipes/intersect + recipes/distance + recipes/clustering + recipes/advanced + recipes/bedtools-migration Indices and tables diff --git a/docs/dialect/syntax-reference.rst b/docs/quickstart.rst similarity index 58% rename from docs/dialect/syntax-reference.rst rename to docs/quickstart.rst index 0082e26..32f292f 100644 --- a/docs/dialect/syntax-reference.rst +++ b/docs/quickstart.rst @@ -1,40 +1,112 @@ -Syntax Reference -================ +Quick Start +=========== -Quick reference for GIQL syntax and operators. +GIQL provides a familiar SQL syntax for bioinformatics workflows, allowing +you to express complex genomic range operations without writing intricate +SQL expressions. GIQL queries read naturally, making your analysis code +easier to review and share. GIQL operators follow established conventions +around genomic spatial relationships, so the semantics are familiar and +predictable. -.. contents:: - :local: - :depth: 1 +- **Spatial operators**: INTERSECTS, CONTAINS, WITHIN for range relationships +- **Distance operators**: DISTANCE, NEAREST for proximity queries +- **Aggregation operators**: CLUSTER, MERGE for combining intervals +- **Set quantifiers**: ANY, ALL for multi-range queries +- **Range parsing**: Understands genomic range strings and coordinate systems +- **Transpilation**: Converts GIQL to standard SQL-92 compatible output for execution on any backend -Genomic Range Literals ----------------------- +Installation +------------ -Format -~~~~~~ +Install GIQL using pip: -Genomic ranges are specified as string literals: +.. code-block:: bash -.. code-block:: text + pip install giql - 'chromosome:start-end' +Basic Usage +----------- -Examples -~~~~~~~~ +Table Configuration +~~~~~~~~~~~~~~~~~~~ -.. 
code-block:: sql +GIQL works with genomic data stored in tables with separate columns for chromosome, +start position, and end position. The default column names are: + +* **chrom**: Chromosome identifier (e.g., 'chr1', 'chr2', 'chrX') +* **start**: Start position of the genomic interval (0-based, inclusive) +* **end**: End position of the genomic interval (0-based, exclusive, half-open) +* **strand** (optional): Strand orientation ('+', '-', or '.') + +If your table uses the default column names, you can pass just the table name +as a string. For custom column names, use a ``Table`` object: - 'chr1:1000-2000' -- Range on chr1 from 1000 to 2000 - 'chr1:1000' -- Point at position 1000 - 'chrX:50000-100000' -- Range on chrX - 'chr1:0-1000000' -- First megabase of chr1 +.. code-block:: python -Coordinate System + from giql import Table, transpile + + # Default column names (chrom, start, end, strand) + sql = transpile(query, tables=["peaks"]) + + # Custom column names + sql = transpile( + query, + tables=[ + Table( + "variants", + genomic_col="interval", + chrom_col="chromosome", + start_col="start_pos", + end_col="end_pos", + ) + ], + ) + +After configuration, you can use the genomic pseudo-column (default: ``interval``) +in your GIQL queries, and the transpiler will automatically expand it to the +physical column comparisons. + +Query with DuckDB ~~~~~~~~~~~~~~~~~ -- **0-based start**: First base is position 0 -- **Half-open interval**: [start, end) - start inclusive, end exclusive -- Range ``chr1:100-200`` covers bases 100 through 199 +.. code-block:: python + + import duckdb + from giql import transpile + + sql = transpile( + """ + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["variants"], + ) + + conn = duckdb.connect() + conn.execute("CREATE TABLE variants AS SELECT * FROM read_csv('variants.csv')") + df = conn.execute(sql).fetchdf() + +Query with SQLite +~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + import sqlite3 + from giql import transpile + + sql = transpile( + """ + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["variants"], + ) + + conn = sqlite3.connect("data.db") + cursor = conn.execute(sql) + for row in cursor: + print(row) + Spatial Operators ----------------- diff --git a/docs/recipes/advanced-queries.rst b/docs/recipes/advanced.rst similarity index 99% rename from docs/recipes/advanced-queries.rst rename to docs/recipes/advanced.rst index 62147f6..4205bc1 100644 --- a/docs/recipes/advanced-queries.rst +++ b/docs/recipes/advanced.rst @@ -4,10 +4,6 @@ Advanced Queries This section covers advanced query patterns including multi-range matching, complex filtering, aggregate statistics, and multi-table workflows. -.. contents:: - :local: - :depth: 2 - Multi-Range Matching -------------------- diff --git a/docs/recipes/bedtools-migration.rst b/docs/recipes/bedtools-migration.rst index 4a00011..8e7c691 100644 --- a/docs/recipes/bedtools-migration.rst +++ b/docs/recipes/bedtools-migration.rst @@ -5,10 +5,6 @@ This guide maps bedtools commands to their GIQL equivalents. If you're familiar with bedtools and want to replicate specific operations in GIQL, use this reference to find the corresponding query patterns. -.. contents:: - :local: - :depth: 2 - Quick Reference Table --------------------- @@ -634,8 +630,8 @@ Key Differences from Bedtools 4. **Built-in aggregation**: SQL's GROUP BY and aggregate functions (COUNT, AVG, SUM, etc.) are available directly, without needing separate post-processing. -5. **Database integration**: GIQL queries run against database tables, enabling - integration with other data and persistence of results. +5. **Database integration**: GIQL queries can be run against database tables, + enabling integration with other data and persistence of results. 6. 
**Multi-backend support**: The same GIQL query can run on DuckDB, SQLite, or other supported backends without modification. diff --git a/docs/recipes/clustering-queries.rst b/docs/recipes/clustering.rst similarity index 98% rename from docs/recipes/clustering-queries.rst rename to docs/recipes/clustering.rst index 3dbd682..fc613e9 100644 --- a/docs/recipes/clustering-queries.rst +++ b/docs/recipes/clustering.rst @@ -1,13 +1,9 @@ -Clustering and Merging Queries -============================== +Clustering and Merging +====================== This section covers patterns for clustering overlapping intervals and merging them into unified regions using GIQL's aggregation operators. -.. contents:: - :local: - :depth: 2 - Basic Clustering ---------------- diff --git a/docs/recipes/distance-queries.rst b/docs/recipes/distance.rst similarity index 98% rename from docs/recipes/distance-queries.rst rename to docs/recipes/distance.rst index c71a4ee..2f1f352 100644 --- a/docs/recipes/distance-queries.rst +++ b/docs/recipes/distance.rst @@ -1,13 +1,9 @@ -Distance and Proximity Queries -============================== +Distance and Neighbors +====================== This section covers patterns for calculating genomic distances and finding nearest features using GIQL's distance operators. -.. contents:: - :local: - :depth: 2 - Calculating Distances --------------------- diff --git a/docs/recipes/index.rst b/docs/recipes/index.rst index 5597846..cc97e47 100644 --- a/docs/recipes/index.rst +++ b/docs/recipes/index.rst @@ -4,45 +4,22 @@ Recipes This section provides practical examples and patterns for common genomic analysis tasks using GIQL. Each recipe focuses on a specific use case with ready-to-use query patterns. -.. contents:: - :local: - :depth: 1 - -Getting Started with Recipes ----------------------------- - -All recipes show GIQL queries that you can transpile and execute on your database. -Setup: - -.. 
code-block:: python - - from giql import transpile - - # Transpile any GIQL query to SQL - sql = transpile( - "... GIQL query from the recipes below ...", - tables=["features_a", "features_b"], - ) - - # Then execute the SQL on your database connection - # e.g., conn.execute(sql) - Recipe Categories ----------------- -:doc:`intersect-queries` +:doc:`intersect` Finding overlapping features, filtering by overlap, counting overlaps, strand-specific operations, and join patterns. -:doc:`distance-queries` +:doc:`distance` Calculating distances between features, finding nearest neighbors, distance-constrained searches, and directional queries. -:doc:`clustering-queries` +:doc:`clustering` Clustering overlapping intervals, distance-based clustering, merging intervals, and aggregating cluster statistics. -:doc:`advanced-queries` +:doc:`advanced` Multi-range matching, complex filtering with joins, aggregate statistics, window expansions, and multi-table queries. @@ -51,14 +28,4 @@ Coming from Bedtools? If you're familiar with bedtools and want to replicate specific commands in GIQL, see the :doc:`bedtools-migration` guide for a complete mapping of bedtools -operations to GIQL equivalents. - -.. toctree:: - :maxdepth: 2 - :hidden: - - intersect-queries - distance-queries - clustering-queries - advanced-queries - bedtools-migration +operations to GIQL equivalents. \ No newline at end of file diff --git a/docs/recipes/intersect-queries.rst b/docs/recipes/intersect.rst similarity index 99% rename from docs/recipes/intersect-queries.rst rename to docs/recipes/intersect.rst index ef7c022..2872ab7 100644 --- a/docs/recipes/intersect-queries.rst +++ b/docs/recipes/intersect.rst @@ -1,13 +1,9 @@ -Intersection Queries -==================== +Intersection +============ This section covers common patterns for finding overlapping genomic features using GIQL's spatial operators. -.. 
contents:: - :local: - :depth: 2 - Finding Overlapping Features ---------------------------- diff --git a/docs/transpilation/index.rst b/docs/transpilation/index.rst index e5e743b..14b58b6 100644 --- a/docs/transpilation/index.rst +++ b/docs/transpilation/index.rst @@ -32,6 +32,12 @@ The result is a standard SQL query that can be consumed by an execution engine t SELECT * FROM variants WHERE "chrom" = 'chr1' AND "start" < 2000 AND "end" > 1000 +Notably, the transpiler expands logical genomic range columns into physical column comparisons. + +The ``Table`` configuration of ``"variants"`` tells GIQL which physical columns correspond to +the logical ``interval`` column. The above example simply maps to the default column names: +``chrom``, ``start``, ``end``. + Examples -------- diff --git a/docs/guides/performance.rst b/docs/transpilation/performance.rst similarity index 65% rename from docs/guides/performance.rst rename to docs/transpilation/performance.rst index 019416e..028a990 100644 --- a/docs/guides/performance.rst +++ b/docs/transpilation/performance.rst @@ -1,43 +1,118 @@ -Performance Guide -================= - -This guide covers strategies for optimizing GIQL query performance, including -indexing, query patterns, and backend-specific optimizations. - -.. contents:: - :local: - :depth: 1 - -Understanding Query Performance -------------------------------- - -How GIQL Queries Execute -~~~~~~~~~~~~~~~~~~~~~~~~ +Performance +----------- When you use GIQL: 1. GIQL parses the query and identifies genomic operators 2. Operators are expanded into SQL predicates -3. You execute the SQL on your database backend -4. The database executes the query using its optimizer - -Performance depends on both the generated SQL and how the database executes it. +3. You execute the SQL on your database backend or analytics engine +4. 
The system optimizes the query and executes it -Common Performance Bottlenecks -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Performance depends on both the generated SQL and how the target data system +plans, optimizes, and executes it. Some common performance bottlenecks include: - **Full table scans**: No indexes to speed up filtering - **Cartesian products**: Large cross joins without early filtering - **Missing chromosome filters**: Comparing features across all chromosomes - **Inefficient join order**: Small tables should drive joins -Indexing Strategies -------------------- +Streaming +--------- + +Analytics engines like DuckDB and Polars support streaming data sources +in sequences of small "record batches", enabling parallel processing and +out-of-core workflows on files that may be much larger than memory. + +For delimited text files, you can use native APIs: + +**DuckDB:** + +.. code-block:: python + + import duckdb + from giql import transpile + + conn = duckdb.connect() + + # DuckDB can query CSV/TSV files directly + conn.execute(""" + CREATE VIEW peaks AS + SELECT * FROM read_csv('peaks.bed', delim='\t', + columns={'chrom': 'VARCHAR', 'start': 'INTEGER', + 'end': 'INTEGER', 'name': 'VARCHAR', + 'score': 'INTEGER', 'strand': 'VARCHAR'}) + """) -Creating Indexes -~~~~~~~~~~~~~~~~ + sql = transpile( + "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", + tables=["peaks"], + ) + + df = conn.execute(sql).fetchdf() + +**Polars:** + +.. code-block:: python + + import polars as pl + from giql import transpile -Create indexes on genomic columns for faster queries: + lf = pl.scan_csv("peaks.bed", separator="\t", + new_columns=["chrom", "start", "end", "name", "score", "strand"]) + + sql = transpile( + "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", + tables=["peaks"], + ) + + ctx = pl.SQLContext(peaks=lf) + df = ctx.execute(sql).collect() + +For specialized NGS formats, you can supply streaming data using the +`oxbow `_ package: + +.. 
code-block:: python
+
+    import duckdb
+    import oxbow as ox
+    from giql import transpile
+
+    conn = duckdb.connect()
+
+    # Load a streaming data source as a DuckDB relation
+    peaks = ox.read_bed("peaks.bed").to_duckdb(conn, "peaks")
+
+    sql = transpile(
+        "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'",
+        tables=["peaks"],
+    )
+
+    df = conn.execute(sql).fetchdf()
+
+.. code-block:: python
+
+    import polars as pl
+    import oxbow as ox
+    from giql import transpile
+
+    # Load a streaming data source as a Polars LazyFrame
+    lf = ox.read_bed("peaks.bed").pl(lazy=True)
+
+    sql = transpile(
+        """SELECT *, CLUSTER(interval) AS cluster_id
+           FROM peaks
+           ORDER BY chrom, start
+        """,
+        tables=["peaks"],
+    )
+    ctx = pl.SQLContext(peaks=lf)
+    ctx.execute(sql).sink_parquet("peaks_clustered.parquet")
+
+Indexing
+--------
+
+If your data source is a database table, you can create indexes on
+genomic columns for faster queries:
 
 .. code-block:: sql
 
@@ -45,9 +120,6 @@ Create indexes on genomic columns for faster queries:
     CREATE INDEX idx_features_position
     ON features (chrom, start, "end")
 
-Recommended Index Patterns
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
 **For single-table queries (filtering):**
 
 .. code-block:: sql
 
@@ -68,12 +140,9 @@ Recommended Index Patterns
     CREATE INDEX idx_features_strand
     ON features (chrom, strand, start, "end")
 
-When to Create Indexes
-~~~~~~~~~~~~~~~~~~~~~~
-
 Create indexes when:
 
-- Tables have more than ~10,000 rows
+- Tables are very large
 - You're running repeated queries on the same tables
 - Join queries are slow
 - Filtering by genomic position is common
@@ -142,9 +211,6 @@ DISTINCT can be expensive. Only use when necessary:
         WHERE a.interval INTERSECTS b.interval
     )
 
-NEAREST Query Optimization
---------------------------
-
 Optimizing K-NN Queries
 ~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -179,9 +245,6 @@ The NEAREST operator can be expensive for large datasets. 
Optimize with: CREATE INDEX idx_genes_position ON genes (chrom, start, "end") -Merge and Cluster Optimization ------------------------------- - Efficient Clustering ~~~~~~~~~~~~~~~~~~~~ @@ -273,27 +336,3 @@ SQLite Optimizations .. code-block:: sql ANALYZE features - -Performance Checklist ---------------------- - -Before running large queries, check: - -.. code-block:: text - - - Indexes created on genomic columns - - Chromosome filtering included in joins - - Selective filters applied early - - LIMIT used for exploration - - Only necessary columns selected - - NEAREST queries use max_distance - - Results streamed instead of fetched all at once - -Quick Wins -~~~~~~~~~~ - -1. **Add indexes** - Usually the biggest performance improvement -2. **Filter by chromosome** - Reduces join complexity significantly -3. **Use max_distance with NEAREST** - Limits search space -4. **Stream results** - Reduces memory pressure -5. **Use DuckDB** - Generally faster for analytical queries diff --git a/docs/guides/schema-mapping.rst b/docs/transpilation/schema-mapping.rst similarity index 85% rename from docs/guides/schema-mapping.rst rename to docs/transpilation/schema-mapping.rst index 43c580c..453c289 100644 --- a/docs/guides/schema-mapping.rst +++ b/docs/transpilation/schema-mapping.rst @@ -4,21 +4,11 @@ Schema Mapping This guide explains how to configure GIQL to work with your genomic data by defining table configurations that map logical genomic columns to physical columns. -.. contents:: - :local: - :depth: 1 - -Understanding Schema Mapping ----------------------------- - GIQL needs to know how your genomic data is structured in order to translate genomic operators into SQL. This is done through ``Table`` objects, which map a logical "genomic column" (used in your queries) to the physical columns in your files, data frames, or database tables. -The Core Concept -~~~~~~~~~~~~~~~~ - In GIQL queries, you use a logical genomic column name like ``interval``: .. 
code-block:: sql @@ -95,9 +85,6 @@ the mapping: ], ) -Multiple Tables ---------------- - Configuring Multiple Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -197,27 +184,3 @@ For point features (like SNPs), create an interval of length 1: # 0-based interval: [999, 1000) start = 999 end = 1000 - - -Troubleshooting ---------------- - -Common Issues -~~~~~~~~~~~~~ - -**"Unknown column" errors:** - -- Ensure the table is included in the ``tables`` parameter -- Check that the genomic column name in your query matches the configured name -- Verify column names in the ``Table`` object match actual table columns - -**Incorrect results:** - -- Verify your coordinate system (0-based vs 1-based) -- Check that start < end for all intervals -- Ensure chromosome names match between tables (e.g., 'chr1' vs '1') - -**Performance issues:** - -- See the :doc:`performance` guide for optimization tips -- Consider adding indexes on genomic columns \ No newline at end of file From c3dfc4bd11f4df251310f067979a9d9e9a943104 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Thu, 12 Feb 2026 23:52:27 -0500 Subject: [PATCH 2/3] Update demo notebook --- demo.ipynb | 2892 +++++++++++++++++++++++++--------------------------- 1 file changed, 1382 insertions(+), 1510 deletions(-) diff --git a/demo.ipynb b/demo.ipynb index e7f02c3..5a6705e 100644 --- a/demo.ipynb +++ b/demo.ipynb @@ -4,18 +4,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# GIQL operators demo using DuckDB query engine\n", + "# GIQL operators demo\n", "\n", "This notebook demonstrates the 7 core GIQL operators:\n", "- **Binary predicates**: INTERSECTS, WITHIN, CONTAINS\n", "- **Aggregation operators**: MERGE, CLUSTER\n", - "- **UDF operators**: DISTANCE\n", - "- **Table-valued functions**: NEAREST\n", + "- **Distance operators**: DISTANCE, NEAREST\n", "\n", "For each operator, we:\n", - "1. Create a GIQL query\n", - "2. Transpile it to standard SQL using GIQL\n", - "3. Execute the SQL using DuckDB" + "1. 
Write a GIQL query\n", + "2. Transpile it to standard SQL\n", + "3. Execute the SQL with DuckDB" ] }, { @@ -24,10 +23,11 @@ "metadata": {}, "outputs": [], "source": [ - "import polars as pl\n", + "import duckdb\n", + "import sqlglot\n", "import sqlparse\n", "\n", - "from giql import GIQLEngine" + "from giql import transpile" ] }, { @@ -36,7 +36,7 @@ "source": [ "## Load BED files\n", "\n", - "Load the two ENCODE ChIP-seq peak files as Polars DataFrames." + "Load two ENCODE ChIP-seq peak files into DuckDB." ] }, { @@ -49,36 +49,126 @@ "output_type": "stream", "text": [ "Features A: 60893 intervals\n", - "Features B: 46196 intervals\n", - "\n", - "Features A preview:\n" + "Features B: 46196 intervals\n" ] }, { "data": { "text/html": [ - "
\n", - "shape: (5, 10)
chromosomestart_posend_posnamescorestrandsignal_valuep_valueq_valuepeak
stri64i64stri64strf64f64f64i64
"chr17"2738103227381306"."1000"."461.10586-1.05.08726136
"chr7"2556593225566199"."1000"."457.60096-1.05.08726133
"chr7"143215117143215391"."1000"."443.34712-1.05.08726139
"chr2"164600030164600306"."1000"."442.64446-1.05.08726137
"chr15"5624604656246359"."1000"."438.56423-1.05.08726154
" + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromstartendnamescorestrandsignal_valuep_valueq_valuepeak
0chr172738103227381306.1000.461.10586-1.05.08726136
1chr72556593225566199.1000.457.60096-1.05.08726133
2chr7143215117143215391.1000.443.34712-1.05.08726139
3chr2164600030164600306.1000.442.64446-1.05.08726137
4chr155624604656246359.1000.438.56423-1.05.08726154
\n", + "" ], "text/plain": [ - "shape: (5, 10)\n", - "┌────────────┬───────────┬───────────┬──────┬───┬──────────────┬─────────┬─────────┬──────┐\n", - "│ chromosome ┆ start_pos ┆ end_pos ┆ name ┆ … ┆ signal_value ┆ p_value ┆ q_value ┆ peak │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ i64 ┆ i64 ┆ str ┆ ┆ f64 ┆ f64 ┆ f64 ┆ i64 │\n", - "╞════════════╪═══════════╪═══════════╪══════╪═══╪══════════════╪═════════╪═════════╪══════╡\n", - "│ chr17 ┆ 27381032 ┆ 27381306 ┆ . ┆ … ┆ 461.10586 ┆ -1.0 ┆ 5.08726 ┆ 136 │\n", - "│ chr7 ┆ 25565932 ┆ 25566199 ┆ . ┆ … ┆ 457.60096 ┆ -1.0 ┆ 5.08726 ┆ 133 │\n", - "│ chr7 ┆ 143215117 ┆ 143215391 ┆ . ┆ … ┆ 443.34712 ┆ -1.0 ┆ 5.08726 ┆ 139 │\n", - "│ chr2 ┆ 164600030 ┆ 164600306 ┆ . ┆ … ┆ 442.64446 ┆ -1.0 ┆ 5.08726 ┆ 137 │\n", - "│ chr15 ┆ 56246046 ┆ 56246359 ┆ . ┆ … ┆ 438.56423 ┆ -1.0 ┆ 5.08726 ┆ 154 │\n", - "└────────────┴───────────┴───────────┴──────┴───┴──────────────┴─────────┴─────────┴──────┘" + " chrom start end name score strand signal_value p_value \\\n", + "0 chr17 27381032 27381306 . 1000 . 461.10586 -1.0 \n", + "1 chr7 25565932 25566199 . 1000 . 457.60096 -1.0 \n", + "2 chr7 143215117 143215391 . 1000 . 443.34712 -1.0 \n", + "3 chr2 164600030 164600306 . 1000 . 442.64446 -1.0 \n", + "4 chr15 56246046 56246359 . 1000 . 
438.56423 -1.0 \n", + "\n", + " q_value peak \n", + "0 5.08726 136 \n", + "1 5.08726 133 \n", + "2 5.08726 139 \n", + "3 5.08726 137 \n", + "4 5.08726 154 " ] }, "execution_count": 2, @@ -87,84 +177,28 @@ } ], "source": [ - "# Load BED files with standard BED column names\n", - "bed_schema = {\n", - " \"chromosome\": pl.Utf8,\n", - " \"start_pos\": pl.Int64,\n", - " \"end_pos\": pl.Int64,\n", - " \"name\": pl.Utf8,\n", - " \"score\": pl.Int64,\n", - " \"strand\": pl.Utf8,\n", - " \"signal_value\": pl.Float64,\n", - " \"p_value\": pl.Float64,\n", - " \"q_value\": pl.Float64,\n", - " \"peak\": pl.Int64,\n", - "}\n", - "\n", - "# Load first BED file\n", - "features_a = pl.read_csv(\n", - " \".ENCFF199YFA.bed\", separator=\"\\t\", has_header=False, schema=bed_schema\n", - ")\n", - "\n", - "# Load second BED file\n", - "features_b = pl.read_csv(\n", - " \".ENCFF205OKL.bed\", separator=\"\\t\", has_header=False, schema=bed_schema\n", - ")\n", - "\n", - "print(f\"Features A: {len(features_a)} intervals\")\n", - "print(f\"Features B: {len(features_b)} intervals\")\n", - "print(\"\\nFeatures A preview:\")\n", - "features_a.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup GIQL engine\n", - "\n", - "Create a GIQL engine and register the table schemas." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "GIQL engine configured successfully\n" - ] - } - ], - "source": [ - "# Create GIQL engine with DuckDB backend\n", - "engine = GIQLEngine(target_dialect=\"duckdb\", verbose=False)\n", - "\n", - "# Register DataFrames using Arrow (preferred for DuckDB)\n", - "engine.conn.register(\"features_a\", features_a.to_arrow())\n", - "engine.conn.register(\"features_b\", features_b.to_arrow())\n", - "\n", - "# Register schemas with genomic columns\n", - "schema_dict = {\n", - " \"chromosome\": \"VARCHAR\",\n", - " \"start_pos\": \"BIGINT\",\n", - " \"end_pos\": \"BIGINT\",\n", - " \"name\": \"VARCHAR\",\n", - " \"score\": \"BIGINT\",\n", - " \"strand\": \"VARCHAR\",\n", - " \"signal_value\": \"DOUBLE\",\n", - " \"p_value\": \"DOUBLE\",\n", - " \"q_value\": \"DOUBLE\",\n", - " \"peak\": \"BIGINT\",\n", - "}\n", - "\n", - "for table in [\"features_a\", \"features_b\"]:\n", - " engine.register_table_schema(table, schema_dict, genomic_column=\"interval\")\n", - "\n", - "print(\"GIQL engine configured successfully\")" + "conn = duckdb.connect()\n", + "\n", + "# Load BED files with standard column names\n", + "bed_columns = \"\"\"columns={\n", + " 'chrom': 'VARCHAR', 'start': 'INTEGER', 'end': 'INTEGER',\n", + " 'name': 'VARCHAR', 'score': 'INTEGER', 'strand': 'VARCHAR',\n", + " 'signal_value': 'DOUBLE', 'p_value': 'DOUBLE',\n", + " 'q_value': 'DOUBLE', 'peak': 'INTEGER'\n", + "}\"\"\"\n", + "\n", + "for table_name, file_path in [\n", + " (\"features_a\", \".ENCFF199YFA.bed\"),\n", + " (\"features_b\", \".ENCFF205OKL.bed\"),\n", + "]:\n", + " conn.execute(f\"\"\"\n", + " CREATE TABLE {table_name} AS\n", + " SELECT * FROM read_csv('{file_path}', delim='\\t', header=false, {bed_columns})\n", + " \"\"\")\n", + "\n", + "print(\"Features A:\", conn.execute(\"SELECT COUNT(*) FROM features_a\").fetchone()[0], \"intervals\")\n", + 
"print(\"Features B:\", conn.execute(\"SELECT COUNT(*) FROM features_b\").fetchone()[0], \"intervals\")\n", + "conn.execute(\"SELECT * FROM features_a LIMIT 5\").fetchdf()" ] }, { @@ -182,7 +216,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -194,9 +228,9 @@ "FROM\n", " features_a AS a,\n", " features_b AS b\n", - "WHERE (a.\"chromosome\" = b.\"chromosome\"\n", - " AND a.\"start_pos\" < b.\"end_pos\"\n", - " AND a.\"end_pos\" > b.\"start_pos\")\n", + "WHERE (a.\"chrom\" = b.\"chrom\"\n", + " AND a.\"start\" < b.\"end\"\n", + " AND a.\"end\" > b.\"start\")\n", "\n", "================================================================================\n", "\n", @@ -224,9 +258,9 @@ " \n", " \n", " \n", - " chromosome\n", - " start_pos\n", - " end_pos\n", + " chrom\n", + " start\n", + " end\n", " name\n", " score\n", " strand\n", @@ -239,165 +273,165 @@ " \n", " \n", " 0\n", - " chr11\n", - " 34359322\n", - " 34359560\n", + " chr22\n", + " 37933998\n", + " 37934242\n", " .\n", " 1000\n", " .\n", - " 220.19056\n", + " 283.80615\n", " -1.0\n", " 5.08726\n", - " 117\n", + " 123\n", " \n", " \n", " 1\n", - " chr17\n", - " 18625675\n", - " 18625936\n", + " chr20\n", + " 267021\n", + " 267251\n", " .\n", " 1000\n", " .\n", - " 218.02293\n", + " 258.60836\n", " -1.0\n", " 5.08726\n", - " 133\n", + " 126\n", " \n", " \n", " 2\n", - " chr16\n", - " 67883801\n", - " 67884000\n", + " chr20\n", + " 63537989\n", + " 63538200\n", " .\n", " 1000\n", " .\n", - " 219.30369\n", + " 272.24062\n", " -1.0\n", " 5.08726\n", - " 102\n", + " 98\n", " \n", " \n", " 3\n", " chr14\n", - " 94036611\n", - " 94036828\n", + " 25042978\n", + " 25043164\n", " .\n", " 1000\n", " .\n", - " 220.16568\n", + " 259.47262\n", " -1.0\n", " 5.08726\n", - " 116\n", + " 106\n", " \n", " \n", " 4\n", - " chr8\n", - " 110060410\n", - " 110060629\n", + " chr15\n", + " 64728285\n", + " 64728479\n", " .\n", " 1000\n", " .\n", - " 219.42024\n", + " 
234.42653\n", " -1.0\n", " 5.08726\n", - " 106\n", + " 96\n", " \n", " \n", " 5\n", - " chr9\n", - " 135967630\n", - " 135967829\n", + " chr20\n", + " 34129746\n", + " 34129928\n", " .\n", " 1000\n", " .\n", - " 218.66102\n", + " 243.36217\n", " -1.0\n", " 5.08726\n", - " 92\n", + " 101\n", " \n", " \n", " 6\n", - " chr14\n", - " 94136789\n", - " 94136976\n", + " chr20\n", + " 48657805\n", + " 48658001\n", " .\n", " 1000\n", " .\n", - " 218.30192\n", + " 257.44302\n", " -1.0\n", " 5.08726\n", - " 99\n", + " 93\n", " \n", " \n", " 7\n", - " chr20\n", - " 24918423\n", - " 24918598\n", + " chr14\n", + " 38721933\n", + " 38722118\n", " .\n", " 1000\n", " .\n", - " 218.92314\n", + " 239.09691\n", " -1.0\n", " 5.08726\n", - " 94\n", + " 98\n", " \n", " \n", " 8\n", - " chr2\n", - " 202178449\n", - " 202178624\n", + " chr14\n", + " 23632038\n", + " 23632231\n", " .\n", " 1000\n", " .\n", - " 217.36688\n", + " 255.61314\n", " -1.0\n", " 5.08726\n", - " 90\n", + " 91\n", " \n", " \n", " 9\n", - " chr7\n", - " 120987697\n", - " 120987871\n", + " chr16\n", + " 11532213\n", + " 11532470\n", " .\n", " 1000\n", " .\n", - " 217.34347\n", + " 339.88370\n", " -1.0\n", " 5.08726\n", - " 85\n", + " 124\n", " \n", " \n", "\n", "" ], "text/plain": [ - " chromosome start_pos end_pos name score strand signal_value p_value \\\n", - "0 chr11 34359322 34359560 . 1000 . 220.19056 -1.0 \n", - "1 chr17 18625675 18625936 . 1000 . 218.02293 -1.0 \n", - "2 chr16 67883801 67884000 . 1000 . 219.30369 -1.0 \n", - "3 chr14 94036611 94036828 . 1000 . 220.16568 -1.0 \n", - "4 chr8 110060410 110060629 . 1000 . 219.42024 -1.0 \n", - "5 chr9 135967630 135967829 . 1000 . 218.66102 -1.0 \n", - "6 chr14 94136789 94136976 . 1000 . 218.30192 -1.0 \n", - "7 chr20 24918423 24918598 . 1000 . 218.92314 -1.0 \n", - "8 chr2 202178449 202178624 . 1000 . 217.36688 -1.0 \n", - "9 chr7 120987697 120987871 . 1000 . 
217.34347 -1.0 \n", + " chrom start end name score strand signal_value p_value \\\n", + "0 chr22 37933998 37934242 . 1000 . 283.80615 -1.0 \n", + "1 chr20 267021 267251 . 1000 . 258.60836 -1.0 \n", + "2 chr20 63537989 63538200 . 1000 . 272.24062 -1.0 \n", + "3 chr14 25042978 25043164 . 1000 . 259.47262 -1.0 \n", + "4 chr15 64728285 64728479 . 1000 . 234.42653 -1.0 \n", + "5 chr20 34129746 34129928 . 1000 . 243.36217 -1.0 \n", + "6 chr20 48657805 48658001 . 1000 . 257.44302 -1.0 \n", + "7 chr14 38721933 38722118 . 1000 . 239.09691 -1.0 \n", + "8 chr14 23632038 23632231 . 1000 . 255.61314 -1.0 \n", + "9 chr16 11532213 11532470 . 1000 . 339.88370 -1.0 \n", "\n", " q_value peak \n", - "0 5.08726 117 \n", - "1 5.08726 133 \n", - "2 5.08726 102 \n", - "3 5.08726 116 \n", - "4 5.08726 106 \n", - "5 5.08726 92 \n", - "6 5.08726 99 \n", - "7 5.08726 94 \n", - "8 5.08726 90 \n", - "9 5.08726 85 " + "0 5.08726 123 \n", + "1 5.08726 126 \n", + "2 5.08726 98 \n", + "3 5.08726 106 \n", + "4 5.08726 96 \n", + "5 5.08726 101 \n", + "6 5.08726 93 \n", + "7 5.08726 98 \n", + "8 5.08726 91 \n", + "9 5.08726 124 " ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -411,7 +445,7 @@ "\"\"\"\n", "\n", "# Transpile to SQL\n", - "sql = engine.transpile(giql_query)\n", + "sql = transpile(giql_query)\n", "print(\"Transpiled SQL:\")\n", "print(\n", " sqlparse.format(\n", @@ -421,7 +455,7 @@ "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", "\n", "# Execute with DuckDB via GIQL engine\n", - "cursor = engine.conn.execute(sql)\n", + "cursor = conn.execute(sql)\n", "result = cursor.df() # Get result as pandas DataFrame\n", "print(f\"Result: {len(result)} overlapping intervals from A\")\n", "result.head(10)" @@ -438,7 +472,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -446,33 +480,17 @@ "output_type": "stream", "text": [ "Transpiled SQL:\n", - "WITH overlap_counts AS\n", - " 
(SELECT\n", - " a.chromosome,\n", - " a.start_pos,\n", - " a.end_pos,\n", - " a.signal_value,\n", - " COUNT(*) AS depth\n", - " FROM\n", - " features_a AS a,\n", - " features_b AS b\n", - " WHERE (a.\"chromosome\" = b.\"chromosome\"\n", - " AND a.\"start_pos\" < b.\"end_pos\"\n", - " AND a.\"end_pos\" > b.\"start_pos\")\n", - " GROUP BY\n", - " a.chromosome,\n", - " a.start_pos,\n", - " a.end_pos,\n", - " a.signal_value)\n", - "SELECT *\n", - "FROM overlap_counts\n", - "WHERE depth > 2\n", - "ORDER BY depth DESC\n", + "SELECT DISTINCT\n", + " a.*\n", + "FROM features_a AS a, features_b AS b\n", + "WHERE\n", + " (\n", + " a.\"chrom\" = b.\"chrom\" AND a.\"start\" < b.\"end\" AND a.\"end\" > b.\"start\"\n", + " )\n", "\n", "================================================================================\n", "\n", - "Result: 2 intervals with overlap depth >= 5\n", - "Max overlap depth: 3\n" + "Result: 42616 overlapping intervals from A\n" ] }, { @@ -496,80 +514,198 @@ " \n", " \n", " \n", - " chromosome\n", - " start_pos\n", - " end_pos\n", + " chrom\n", + " start\n", + " end\n", + " name\n", + " score\n", + " strand\n", " signal_value\n", - " depth\n", + " p_value\n", + " q_value\n", + " peak\n", " \n", " \n", " \n", " \n", " 0\n", - " chr14\n", - " 105999349\n", - " 105999944\n", - " 240.51649\n", - " 3\n", + " chr19\n", + " 45642140\n", + " 45642400\n", + " .\n", + " 1000\n", + " .\n", + " 297.57428\n", + " -1.0\n", + " 5.08726\n", + " 128\n", " \n", " \n", " 1\n", + " chr11\n", + " 518674\n", + " 518971\n", + " .\n", + " 1000\n", + " .\n", + " 315.46469\n", + " -1.0\n", + " 5.08726\n", + " 149\n", + " \n", + " \n", + " 2\n", + " chr20\n", + " 50794798\n", + " 50795030\n", + " .\n", + " 1000\n", + " .\n", + " 260.64664\n", + " -1.0\n", + " 5.08726\n", + " 115\n", + " \n", + " \n", + " 3\n", + " chr19\n", + " 46395603\n", + " 46395857\n", + " .\n", + " 1000\n", + " .\n", + " 335.49048\n", + " -1.0\n", + " 5.08726\n", + " 134\n", + " \n", + " \n", + " 4\n", + " 
chr9\n", + " 34646905\n", + " 34647110\n", + " .\n", + " 1000\n", + " .\n", + " 235.66436\n", + " -1.0\n", + " 5.08726\n", + " 100\n", + " \n", + " \n", + " 5\n", " chr14\n", - " 105999349\n", - " 105999944\n", - " 108.06722\n", - " 3\n", + " 53929955\n", + " 53930179\n", + " .\n", + " 1000\n", + " .\n", + " 340.46918\n", + " -1.0\n", + " 5.08726\n", + " 115\n", + " \n", + " \n", + " 6\n", + " chr12\n", + " 6323309\n", + " 6323518\n", + " .\n", + " 1000\n", + " .\n", + " 262.07653\n", + " -1.0\n", + " 5.08726\n", + " 99\n", + " \n", + " \n", + " 7\n", + " chr14\n", + " 32073682\n", + " 32073873\n", + " .\n", + " 1000\n", + " .\n", + " 278.18167\n", + " -1.0\n", + " 5.08726\n", + " 89\n", + " \n", + " \n", + " 8\n", + " chr20\n", + " 59087928\n", + " 59088111\n", + " .\n", + " 1000\n", + " .\n", + " 234.98679\n", + " -1.0\n", + " 5.08726\n", + " 92\n", + " \n", + " \n", + " 9\n", + " chr15\n", + " 79596083\n", + " 79596368\n", + " .\n", + " 1000\n", + " .\n", + " 347.14815\n", + " -1.0\n", + " 5.08726\n", + " 144\n", " \n", " \n", "\n", "" ], "text/plain": [ - " chromosome start_pos end_pos signal_value depth\n", - "0 chr14 105999349 105999944 240.51649 3\n", - "1 chr14 105999349 105999944 108.06722 3" + " chrom start end name score strand signal_value p_value \\\n", + "0 chr19 45642140 45642400 . 1000 . 297.57428 -1.0 \n", + "1 chr11 518674 518971 . 1000 . 315.46469 -1.0 \n", + "2 chr20 50794798 50795030 . 1000 . 260.64664 -1.0 \n", + "3 chr19 46395603 46395857 . 1000 . 335.49048 -1.0 \n", + "4 chr9 34646905 34647110 . 1000 . 235.66436 -1.0 \n", + "5 chr14 53929955 53930179 . 1000 . 340.46918 -1.0 \n", + "6 chr12 6323309 6323518 . 1000 . 262.07653 -1.0 \n", + "7 chr14 32073682 32073873 . 1000 . 278.18167 -1.0 \n", + "8 chr20 59087928 59088111 . 1000 . 234.98679 -1.0 \n", + "9 chr15 79596083 79596368 . 1000 . 
347.14815 -1.0 \n", + "\n", + " q_value peak \n", + "0 5.08726 128 \n", + "1 5.08726 149 \n", + "2 5.08726 115 \n", + "3 5.08726 134 \n", + "4 5.08726 100 \n", + "5 5.08726 115 \n", + "6 5.08726 99 \n", + "7 5.08726 89 \n", + "8 5.08726 92 \n", + "9 5.08726 144 " ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Define GIQL query to find high-depth overlaps\n", "giql_query = \"\"\"\n", - " WITH overlap_counts AS (\n", - " SELECT \n", - " a.chromosome,\n", - " a.start_pos,\n", - " a.end_pos,\n", - " a.signal_value,\n", - " COUNT(*) as depth\n", - " FROM features_a a, features_b b\n", - " WHERE a.interval INTERSECTS b.interval\n", - " GROUP BY a.chromosome, a.start_pos, a.end_pos, a.signal_value\n", - " )\n", - " SELECT *\n", - " FROM overlap_counts\n", - " WHERE depth > 2\n", - " ORDER BY depth DESC\n", - "\"\"\"\n", - "\n", - "# Transpile to SQL\n", - "sql = engine.transpile(giql_query)\n", + " SELECT DISTINCT a.*\n", + " FROM features_a a, features_b b\n", + " WHERE a.interval INTERSECTS b.interval\n", + "\"\"\"\n", + "\n", + "sql = transpile(giql_query, tables=[\"features_a\", \"features_b\"])\n", "print(\"Transpiled SQL:\")\n", - "print(\n", - " sqlparse.format(\n", - " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", - " )\n", - ")\n", + "print(sqlglot.transpile(sql, pretty=True)[0])\n", "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", "\n", - "# Execute with DuckDB via GIQL engine\n", - "cursor = engine.conn.execute(sql)\n", - "result = cursor.df() # Get result as pandas DataFrame\n", - "print(f\"Result: {len(result)} intervals with overlap depth >= 5\")\n", - "print(f\"Max overlap depth: {result['depth'].max() if len(result) > 0 else 0}\")\n", + "result = conn.execute(sql).fetchdf()\n", + "print(f\"Result: {len(result)} overlapping intervals from A\")\n", "result.head(10)" ] }, @@ -586,6 +722,151 @@ "**GIQL Query**: `SELECT a.* FROM features_a a, features_b b 
WHERE a.interval WITHIN b.interval`" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transpiled SQL:\n", + "WITH overlap_counts AS (\n", + " SELECT\n", + " a.chrom,\n", + " a.start,\n", + " a.\"end\",\n", + " a.signal_value,\n", + " COUNT(*) AS depth\n", + " FROM features_a AS a, features_b AS b\n", + " WHERE\n", + " (\n", + " a.\"chrom\" = b.\"chrom\" AND a.\"start\" < b.\"end\" AND a.\"end\" > b.\"start\"\n", + " )\n", + " GROUP BY\n", + " a.chrom,\n", + " a.start,\n", + " a.\"end\",\n", + " a.signal_value\n", + ")\n", + "SELECT\n", + " *\n", + "FROM overlap_counts\n", + "WHERE\n", + " depth > 2\n", + "ORDER BY\n", + " depth DESC\n", + "\n", + "================================================================================\n", + "\n", + "Result: 2 intervals with overlap depth > 2\n", + "Max overlap depth: 3\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromstartendsignal_valuedepth
0chr14105999349105999944240.516493
1chr14105999349105999944108.067223
\n", + "
" + ], + "text/plain": [ + " chrom start end signal_value depth\n", + "0 chr14 105999349 105999944 240.51649 3\n", + "1 chr14 105999349 105999944 108.06722 3" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "giql_query = \"\"\"\n", + " WITH overlap_counts AS (\n", + " SELECT\n", + " a.chrom,\n", + " a.start,\n", + " a.\"end\",\n", + " a.signal_value,\n", + " COUNT(*) as depth\n", + " FROM features_a a, features_b b\n", + " WHERE a.interval INTERSECTS b.interval\n", + " GROUP BY a.chrom, a.start, a.\"end\", a.signal_value\n", + " )\n", + " SELECT *\n", + " FROM overlap_counts\n", + " WHERE depth > 2\n", + " ORDER BY depth DESC\n", + "\"\"\"\n", + "\n", + "sql = transpile(giql_query, tables=[\"features_a\", \"features_b\"])\n", + "print(\"Transpiled SQL:\")\n", + "print(sqlglot.transpile(sql, pretty=True)[0])\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "result = conn.execute(sql).fetchdf()\n", + "print(f\"Result: {len(result)} intervals with overlap depth > 2\")\n", + "print(f\"Max overlap depth: {result['depth'].max() if len(result) > 0 else 0}\")\n", + "result.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 3. 
CONTAINS operator\n", + "\n", + "Find features in A that **completely contain** features in B.\n", + "\n", + "**GIQL Query**: `SELECT a.* FROM features_a a, features_b b WHERE a.interval CONTAINS b.interval`" + ] + }, { "cell_type": "code", "execution_count": 6, @@ -599,12 +880,11 @@ "SELECT DISTINCT\n", " a.*,\n", " b.*\n", - "FROM\n", - " features_a AS a,\n", - " features_b AS b\n", - "WHERE (a.\"chromosome\" = b.\"chromosome\"\n", - " AND a.\"start_pos\" >= b.\"start_pos\"\n", - " AND a.\"end_pos\" <= b.\"end_pos\")\n", + "FROM features_a AS a, features_b AS b\n", + "WHERE\n", + " (\n", + " a.\"chrom\" = b.\"chrom\" AND a.\"start\" >= b.\"start\" AND a.\"end\" <= b.\"end\"\n", + " )\n", "\n", "================================================================================\n", "\n", @@ -632,9 +912,9 @@ " \n", " \n", " \n", - " chromosome\n", - " start_pos\n", - " end_pos\n", + " chrom\n", + " start\n", + " end\n", " name\n", " score\n", " strand\n", @@ -642,9 +922,9 @@ " p_value\n", " q_value\n", " peak\n", - " chromosome_1\n", - " start_pos_1\n", - " end_pos_1\n", + " chrom_1\n", + " start_1\n", + " end_1\n", " name_1\n", " score_1\n", " strand_1\n", @@ -657,274 +937,274 @@ " \n", " \n", " 0\n", - " chr14\n", - " 24216011\n", - " 24216203\n", + " chr19\n", + " 43774483\n", + " 43774712\n", " .\n", " 1000\n", " .\n", - " 251.87452\n", + " 252.61723\n", " -1.0\n", " 5.08726\n", - " 95\n", - " chr14\n", - " 24215977\n", - " 24216234\n", + " 103\n", + " chr19\n", + " 43774432\n", + " 43774742\n", " .\n", - " 1000\n", + " 951\n", " .\n", - " 209.90753\n", + " 28.41530\n", " -1.0\n", " 5.10518\n", - " 125\n", + " 155\n", " \n", " \n", " 1\n", - " chr14\n", - " 33792757\n", - " 33792965\n", + " chr17\n", + " 75615499\n", + " 75615705\n", " .\n", " 1000\n", " .\n", - " 252.08812\n", + " 244.31776\n", " -1.0\n", " 5.08726\n", - " 105\n", - " chr14\n", - " 33792735\n", - " 33793013\n", + " 96\n", + " chr17\n", + " 75615479\n", + " 75615789\n", " .\n", " 1000\n", " .\n", 
- " 232.27970\n", + " 26.62768\n", " -1.0\n", " 5.10518\n", - " 134\n", + " 155\n", " \n", " \n", " 2\n", - " chr6\n", - " 43769540\n", - " 43769748\n", + " chr11\n", + " 397249\n", + " 397474\n", " .\n", " 1000\n", " .\n", - " 250.99653\n", + " 237.92944\n", " -1.0\n", " 5.08726\n", - " 104\n", - " chr6\n", - " 43769499\n", - " 43769760\n", + " 114\n", + " chr11\n", + " 397230\n", + " 397540\n", " .\n", " 1000\n", " .\n", - " 154.83893\n", + " 31.74873\n", " -1.0\n", " 5.10518\n", - " 141\n", + " 155\n", " \n", " \n", " 3\n", - " chr17\n", - " 49861200\n", - " 49861405\n", + " chr20\n", + " 44116742\n", + " 44116950\n", " .\n", " 1000\n", " .\n", - " 250.16995\n", + " 292.51551\n", " -1.0\n", " 5.08726\n", - " 93\n", - " chr17\n", - " 49861195\n", - " 49861409\n", + " 106\n", + " chr20\n", + " 44116709\n", + " 44116975\n", " .\n", " 1000\n", " .\n", - " 78.03419\n", + " 201.73737\n", " -1.0\n", " 5.10518\n", - " 86\n", + " 132\n", " \n", " \n", " 4\n", - " chr2\n", - " 219476345\n", - " 219476531\n", + " chr16\n", + " 23434388\n", + " 23434589\n", " .\n", " 1000\n", " .\n", - " 248.18364\n", + " 267.80248\n", " -1.0\n", " 5.08726\n", - " 102\n", - " chr2\n", - " 219476292\n", - " 219476557\n", + " 106\n", + " chr16\n", + " 23434384\n", + " 23434621\n", " .\n", " 1000\n", " .\n", - " 160.44163\n", + " 148.78159\n", " -1.0\n", " 5.10518\n", - " 125\n", + " 119\n", " \n", " \n", " 5\n", - " chr11\n", - " 57527230\n", - " 57527419\n", + " chr17\n", + " 35804783\n", + " 35805037\n", " .\n", " 1000\n", " .\n", - " 247.38897\n", + " 314.27261\n", " -1.0\n", " 5.08726\n", - " 104\n", - " chr11\n", - " 57527209\n", - " 57527456\n", + " 134\n", + " chr17\n", + " 35804768\n", + " 35805048\n", " .\n", " 1000\n", " .\n", - " 121.89660\n", + " 113.42031\n", " -1.0\n", " 5.10518\n", - " 134\n", + " 149\n", " \n", " \n", " 6\n", - " chr2\n", - " 152025859\n", - " 152026122\n", + " chr4\n", + " 147866849\n", + " 147867113\n", " .\n", " 1000\n", " .\n", - " 247.10786\n", + " 
395.15579\n", " -1.0\n", " 5.08726\n", - " 146\n", - " chr2\n", - " 152025676\n", - " 152026161\n", + " 133\n", + " chr4\n", + " 147866840\n", + " 147867128\n", " .\n", " 1000\n", " .\n", - " 181.47229\n", + " 191.86545\n", " -1.0\n", " 5.10518\n", - " 304\n", + " 141\n", " \n", " \n", " 7\n", - " chr2\n", - " 112259523\n", - " 112259722\n", + " chr10\n", + " 97759515\n", + " 97759702\n", " .\n", " 1000\n", " .\n", - " 249.77072\n", + " 236.46727\n", " -1.0\n", " 5.08726\n", - " 95\n", - " chr2\n", - " 112259464\n", - " 112259743\n", + " 97\n", + " chr10\n", + " 97759495\n", + " 97759724\n", " .\n", " 1000\n", " .\n", - " 219.16901\n", + " 140.81146\n", " -1.0\n", " 5.10518\n", - " 141\n", + " 104\n", " \n", " \n", " 8\n", - " chr21\n", - " 43810840\n", - " 43811034\n", + " chr12\n", + " 53701870\n", + " 53702056\n", " .\n", " 1000\n", " .\n", - " 201.96817\n", + " 253.82117\n", " -1.0\n", " 5.08726\n", - " 102\n", - " chr21\n", - " 43810814\n", - " 43811039\n", + " 96\n", + " chr12\n", + " 53701848\n", + " 53702069\n", " .\n", " 1000\n", " .\n", - " 98.62279\n", + " 131.82871\n", " -1.0\n", " 5.10518\n", - " 131\n", + " 97\n", " \n", " \n", " 9\n", - " chr17\n", - " 44331388\n", - " 44331569\n", + " chr12\n", + " 115398399\n", + " 115398644\n", " .\n", " 1000\n", " .\n", - " 200.30254\n", + " 296.60121\n", " -1.0\n", " 5.08726\n", - " 100\n", - " chr17\n", - " 44331387\n", - " 44331617\n", + " 110\n", + " chr12\n", + " 115398396\n", + " 115398671\n", " .\n", " 1000\n", " .\n", - " 100.15563\n", + " 135.44415\n", " -1.0\n", " 5.10518\n", - " 116\n", + " 138\n", " \n", " \n", "\n", "" ], "text/plain": [ - " chromosome start_pos end_pos name score strand signal_value p_value \\\n", - "0 chr14 24216011 24216203 . 1000 . 251.87452 -1.0 \n", - "1 chr14 33792757 33792965 . 1000 . 252.08812 -1.0 \n", - "2 chr6 43769540 43769748 . 1000 . 250.99653 -1.0 \n", - "3 chr17 49861200 49861405 . 1000 . 250.16995 -1.0 \n", - "4 chr2 219476345 219476531 . 1000 . 
248.18364 -1.0 \n", - "5 chr11 57527230 57527419 . 1000 . 247.38897 -1.0 \n", - "6 chr2 152025859 152026122 . 1000 . 247.10786 -1.0 \n", - "7 chr2 112259523 112259722 . 1000 . 249.77072 -1.0 \n", - "8 chr21 43810840 43811034 . 1000 . 201.96817 -1.0 \n", - "9 chr17 44331388 44331569 . 1000 . 200.30254 -1.0 \n", + " chrom start end name score strand signal_value p_value \\\n", + "0 chr19 43774483 43774712 . 1000 . 252.61723 -1.0 \n", + "1 chr17 75615499 75615705 . 1000 . 244.31776 -1.0 \n", + "2 chr11 397249 397474 . 1000 . 237.92944 -1.0 \n", + "3 chr20 44116742 44116950 . 1000 . 292.51551 -1.0 \n", + "4 chr16 23434388 23434589 . 1000 . 267.80248 -1.0 \n", + "5 chr17 35804783 35805037 . 1000 . 314.27261 -1.0 \n", + "6 chr4 147866849 147867113 . 1000 . 395.15579 -1.0 \n", + "7 chr10 97759515 97759702 . 1000 . 236.46727 -1.0 \n", + "8 chr12 53701870 53702056 . 1000 . 253.82117 -1.0 \n", + "9 chr12 115398399 115398644 . 1000 . 296.60121 -1.0 \n", "\n", - " q_value peak chromosome_1 start_pos_1 end_pos_1 name_1 score_1 \\\n", - "0 5.08726 95 chr14 24215977 24216234 . 1000 \n", - "1 5.08726 105 chr14 33792735 33793013 . 1000 \n", - "2 5.08726 104 chr6 43769499 43769760 . 1000 \n", - "3 5.08726 93 chr17 49861195 49861409 . 1000 \n", - "4 5.08726 102 chr2 219476292 219476557 . 1000 \n", - "5 5.08726 104 chr11 57527209 57527456 . 1000 \n", - "6 5.08726 146 chr2 152025676 152026161 . 1000 \n", - "7 5.08726 95 chr2 112259464 112259743 . 1000 \n", - "8 5.08726 102 chr21 43810814 43811039 . 1000 \n", - "9 5.08726 100 chr17 44331387 44331617 . 1000 \n", + " q_value peak chrom_1 start_1 end_1 name_1 score_1 strand_1 \\\n", + "0 5.08726 103 chr19 43774432 43774742 . 951 . \n", + "1 5.08726 96 chr17 75615479 75615789 . 1000 . \n", + "2 5.08726 114 chr11 397230 397540 . 1000 . \n", + "3 5.08726 106 chr20 44116709 44116975 . 1000 . \n", + "4 5.08726 106 chr16 23434384 23434621 . 1000 . \n", + "5 5.08726 134 chr17 35804768 35805048 . 1000 . 
\n", + "6 5.08726 133 chr4 147866840 147867128 . 1000 . \n", + "7 5.08726 97 chr10 97759495 97759724 . 1000 . \n", + "8 5.08726 96 chr12 53701848 53702069 . 1000 . \n", + "9 5.08726 110 chr12 115398396 115398671 . 1000 . \n", "\n", - " strand_1 signal_value_1 p_value_1 q_value_1 peak_1 \n", - "0 . 209.90753 -1.0 5.10518 125 \n", - "1 . 232.27970 -1.0 5.10518 134 \n", - "2 . 154.83893 -1.0 5.10518 141 \n", - "3 . 78.03419 -1.0 5.10518 86 \n", - "4 . 160.44163 -1.0 5.10518 125 \n", - "5 . 121.89660 -1.0 5.10518 134 \n", - "6 . 181.47229 -1.0 5.10518 304 \n", - "7 . 219.16901 -1.0 5.10518 141 \n", - "8 . 98.62279 -1.0 5.10518 131 \n", - "9 . 100.15563 -1.0 5.10518 116 " + " signal_value_1 p_value_1 q_value_1 peak_1 \n", + "0 28.41530 -1.0 5.10518 155 \n", + "1 26.62768 -1.0 5.10518 155 \n", + "2 31.74873 -1.0 5.10518 155 \n", + "3 201.73737 -1.0 5.10518 132 \n", + "4 148.78159 -1.0 5.10518 119 \n", + "5 113.42031 -1.0 5.10518 149 \n", + "6 191.86545 -1.0 5.10518 141 \n", + "7 140.81146 -1.0 5.10518 104 \n", + "8 131.82871 -1.0 5.10518 97 \n", + "9 135.44415 -1.0 5.10518 138 " ] }, "execution_count": 6, @@ -933,26 +1213,18 @@ } ], "source": [ - "# Define GIQL query\n", "giql_query = \"\"\"\n", " SELECT DISTINCT a.*, b.*\n", " FROM features_a a, features_b b\n", " WHERE a.interval WITHIN b.interval\n", "\"\"\"\n", "\n", - "# Transpile to SQL\n", - "sql = engine.transpile(giql_query)\n", + "sql = transpile(giql_query, tables=[\"features_a\", \"features_b\"])\n", "print(\"Transpiled SQL:\")\n", - "print(\n", - " sqlparse.format(\n", - " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", - " )\n", - ")\n", + "print(sqlglot.transpile(sql, pretty=True)[0])\n", "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", "\n", - "# Execute with DuckDB via GIQL engine\n", - "cursor = engine.conn.execute(sql)\n", - "result = cursor.df() # Get result as pandas DataFrame\n", + "result = conn.execute(sql).fetchdf()\n", "print(f\"Result: {len(result)} intervals from 
A contained within B\")\n", "result.head(10)" ] @@ -963,11 +1235,13 @@ "source": [ "---\n", "\n", - "## 3. CONTAINS operator\n", + "## 4. MERGE operator\n", "\n", - "Find features in A that **completely contain** features in B.\n", + "**Combine overlapping intervals** from features_a into merged regions.\n", "\n", - "**GIQL Query**: `SELECT a.* FROM features_a a, features_b b WHERE a.interval CONTAINS b.interval`" + "Similar to `bedtools merge`, this collapses overlapping genomic intervals.\n", + "\n", + "**GIQL Query**: `SELECT MERGE(interval) FROM features_a`" ] }, { @@ -980,13 +1254,13 @@ "output_type": "stream", "text": [ "Transpiled SQL:\n", - "SELECT DISTINCT a.*\n", - "FROM\n", - " features_a AS a,\n", - " features_b AS b\n", - "WHERE (a.\"chromosome\" = b.\"chromosome\"\n", - " AND a.\"start_pos\" <= b.\"start_pos\"\n", - " AND a.\"end_pos\" >= b.\"end_pos\")\n", + "SELECT DISTINCT\n", + " a.*\n", + "FROM features_a AS a, features_b AS b\n", + "WHERE\n", + " (\n", + " a.\"chrom\" = b.\"chrom\" AND a.\"start\" <= b.\"start\" AND a.\"end\" >= b.\"end\"\n", + " )\n", "\n", "================================================================================\n", "\n", @@ -1014,9 +1288,9 @@ " \n", " \n", " \n", - " chromosome\n", - " start_pos\n", - " end_pos\n", + " chrom\n", + " start\n", + " end\n", " name\n", " score\n", " strand\n", @@ -1029,162 +1303,162 @@ " \n", " \n", " 0\n", - " chr4\n", - " 102431434\n", - " 102431659\n", + " chr22\n", + " 22559162\n", + " 22559461\n", " .\n", " 1000\n", " .\n", - " 299.13707\n", + " 378.39470\n", " -1.0\n", " 5.08726\n", - " 111\n", + " 143\n", " \n", " \n", " 1\n", - " chr14\n", - " 49586912\n", - " 49587154\n", + " chr19\n", + " 45642140\n", + " 45642400\n", " .\n", " 1000\n", " .\n", - " 303.26119\n", + " 297.57428\n", " -1.0\n", " 5.08726\n", - " 126\n", + " 128\n", " \n", " \n", " 2\n", - " chr5\n", - " 134039564\n", - " 134039867\n", + " chr19\n", + " 46395603\n", + " 46395857\n", " .\n", " 1000\n", " .\n", - " 
317.39854\n", + " 335.49048\n", " -1.0\n", " 5.08726\n", - " 157\n", + " 134\n", " \n", " \n", " 3\n", - " chr18\n", - " 77900138\n", - " 77900404\n", + " chr12\n", + " 6323309\n", + " 6323518\n", " .\n", " 1000\n", " .\n", - " 314.56958\n", + " 262.07653\n", " -1.0\n", " 5.08726\n", - " 127\n", + " 99\n", " \n", " \n", " 4\n", - " chrX\n", - " 123960372\n", - " 123960620\n", + " chr12\n", + " 546987\n", + " 547224\n", " .\n", " 1000\n", " .\n", - " 291.07538\n", + " 273.49144\n", " -1.0\n", " 5.08726\n", - " 123\n", + " 126\n", " \n", " \n", " 5\n", - " chr1\n", - " 157311233\n", - " 157311386\n", + " chr6\n", + " 34154821\n", + " 34155022\n", " .\n", " 1000\n", " .\n", - " 188.57350\n", + " 258.93713\n", " -1.0\n", " 5.08726\n", - " 80\n", + " 95\n", " \n", " \n", " 6\n", - " chr21\n", - " 32403252\n", - " 32403390\n", + " chr5\n", + " 181184125\n", + " 181184440\n", " .\n", " 1000\n", " .\n", - " 132.27311\n", + " 277.71855\n", " -1.0\n", " 5.08726\n", - " 80\n", + " 138\n", " \n", " \n", " 7\n", - " chr14\n", - " 104564131\n", - " 104564266\n", + " chr16\n", + " 11532213\n", + " 11532470\n", " .\n", " 1000\n", " .\n", - " 131.97784\n", + " 339.88370\n", " -1.0\n", " 5.08726\n", - " 68\n", + " 124\n", " \n", " \n", " 8\n", - " chr10\n", - " 132204836\n", - " 132204945\n", + " chr5\n", + " 95781099\n", + " 95781372\n", " .\n", " 1000\n", " .\n", - " 114.77888\n", + " 255.82894\n", " -1.0\n", " 5.08726\n", - " 59\n", + " 132\n", " \n", " \n", " 9\n", - " chrX\n", - " 54453024\n", - " 54453130\n", + " chr12\n", + " 111396988\n", + " 111397250\n", " .\n", " 1000\n", " .\n", - " 102.02848\n", + " 406.49219\n", " -1.0\n", " 5.08726\n", - " 65\n", + " 130\n", " \n", " \n", "\n", "" ], "text/plain": [ - " chromosome start_pos end_pos name score strand signal_value p_value \\\n", - "0 chr4 102431434 102431659 . 1000 . 299.13707 -1.0 \n", - "1 chr14 49586912 49587154 . 1000 . 303.26119 -1.0 \n", - "2 chr5 134039564 134039867 . 1000 . 
317.39854 -1.0 \n", - "3 chr18 77900138 77900404 . 1000 . 314.56958 -1.0 \n", - "4 chrX 123960372 123960620 . 1000 . 291.07538 -1.0 \n", - "5 chr1 157311233 157311386 . 1000 . 188.57350 -1.0 \n", - "6 chr21 32403252 32403390 . 1000 . 132.27311 -1.0 \n", - "7 chr14 104564131 104564266 . 1000 . 131.97784 -1.0 \n", - "8 chr10 132204836 132204945 . 1000 . 114.77888 -1.0 \n", - "9 chrX 54453024 54453130 . 1000 . 102.02848 -1.0 \n", + " chrom start end name score strand signal_value p_value \\\n", + "0 chr22 22559162 22559461 . 1000 . 378.39470 -1.0 \n", + "1 chr19 45642140 45642400 . 1000 . 297.57428 -1.0 \n", + "2 chr19 46395603 46395857 . 1000 . 335.49048 -1.0 \n", + "3 chr12 6323309 6323518 . 1000 . 262.07653 -1.0 \n", + "4 chr12 546987 547224 . 1000 . 273.49144 -1.0 \n", + "5 chr6 34154821 34155022 . 1000 . 258.93713 -1.0 \n", + "6 chr5 181184125 181184440 . 1000 . 277.71855 -1.0 \n", + "7 chr16 11532213 11532470 . 1000 . 339.88370 -1.0 \n", + "8 chr5 95781099 95781372 . 1000 . 255.82894 -1.0 \n", + "9 chr12 111396988 111397250 . 1000 . 
406.49219 -1.0 \n", "\n", " q_value peak \n", - "0 5.08726 111 \n", - "1 5.08726 126 \n", - "2 5.08726 157 \n", - "3 5.08726 127 \n", - "4 5.08726 123 \n", - "5 5.08726 80 \n", - "6 5.08726 80 \n", - "7 5.08726 68 \n", - "8 5.08726 59 \n", - "9 5.08726 65 " + "0 5.08726 143 \n", + "1 5.08726 128 \n", + "2 5.08726 134 \n", + "3 5.08726 99 \n", + "4 5.08726 126 \n", + "5 5.08726 95 \n", + "6 5.08726 138 \n", + "7 5.08726 124 \n", + "8 5.08726 132 \n", + "9 5.08726 130 " ] }, "execution_count": 7, @@ -1193,26 +1467,18 @@ } ], "source": [ - "# Define GIQL query\n", "giql_query = \"\"\"\n", " SELECT DISTINCT a.*\n", " FROM features_a a, features_b b\n", " WHERE a.interval CONTAINS b.interval\n", "\"\"\"\n", "\n", - "# Transpile to SQL\n", - "sql = engine.transpile(giql_query)\n", + "sql = transpile(giql_query, tables=[\"features_a\", \"features_b\"])\n", "print(\"Transpiled SQL:\")\n", - "print(\n", - " sqlparse.format(\n", - " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", - " )\n", - ")\n", + "print(sqlglot.transpile(sql, pretty=True)[0])\n", "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", "\n", - "# Execute with DuckDB via GIQL engine\n", - "cursor = engine.conn.execute(sql)\n", - "result = cursor.df() # Get result as pandas DataFrame\n", + "result = conn.execute(sql).fetchdf()\n", "print(f\"Result: {len(result)} intervals from A that contain B\")\n", "result.head(10)" ] @@ -1221,15 +1487,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "---\n", - "\n", - "## 4. MERGE operator\n", - "\n", - "**Combine overlapping intervals** from features_a into merged regions.\n", - "\n", - "Similar to `bedtools merge`, this collapses overlapping genomic intervals.\n", + "### MERGE with aggregation CTE\n", "\n", - "**GIQL Query**: `SELECT MERGE(interval) FROM features_a`" + "Compute statistics on merged intervals and filter for intervals that merged multiple features." 
] }, { @@ -1243,33 +1503,34 @@ "text": [ "Transpiled SQL:\n", "SELECT\n", - " \"chromosome\",\n", - " MIN(\"start_pos\") AS start_pos,\n", - " MAX(\"end_pos\") AS end_pos\n", - "FROM\n", - " (SELECT\n", - " *,\n", - " SUM(is_new_cluster) OVER (PARTITION BY \"chromosome\"\n", - " ORDER BY \"start_pos\" NULLS LAST) AS __giql_cluster_id\n", - " FROM\n", - " (SELECT\n", - " *,\n", - " CASE\n", - " WHEN LAG(\"end_pos\") OVER (PARTITION BY \"chromosome\"\n", - " ORDER BY \"start_pos\" NULLS LAST) >= \"start_pos\" THEN 0\n", - " ELSE 1\n", - " END AS is_new_cluster\n", - " FROM features_b) AS lag_calc) AS clustered\n", + " \"chrom\",\n", + " MIN(\"start\") AS start,\n", + " MAX(\"end\") AS end\n", + "FROM (\n", + " SELECT\n", + " *,\n", + " SUM(is_new_cluster) OVER (PARTITION BY \"chrom\" ORDER BY \"start\" NULLS LAST) AS __giql_cluster_id\n", + " FROM (\n", + " SELECT\n", + " *,\n", + " CASE\n", + " WHEN LAG(\"end\") OVER (PARTITION BY \"chrom\" ORDER BY \"start\" NULLS LAST) >= \"start\"\n", + " THEN 0\n", + " ELSE 1\n", + " END AS is_new_cluster\n", + " FROM features_b\n", + " ) AS lag_calc\n", + ") AS clustered\n", "GROUP BY\n", - " chromosome,\n", + " chrom,\n", " __giql_cluster_id\n", "ORDER BY\n", - " \"chromosome\" NULLS LAST,\n", - " \"start_pos\" NULLS LAST\n", + " \"chrom\" NULLS LAST,\n", + " \"start\" NULLS LAST\n", "\n", "================================================================================\n", "\n", - "Result: 45630 merged intervals (from 60893 original)\n" + "Result: 45630 merged intervals (from 46196 original)\n" ] }, { @@ -1293,9 +1554,9 @@ " \n", " \n", " \n", - " chromosome\n", - " start_pos\n", - " end_pos\n", + " chrom\n", + " start\n", + " end\n", " \n", " \n", " \n", @@ -1364,17 +1625,17 @@ "" ], "text/plain": [ - " chromosome start_pos end_pos\n", - "0 chr1 186654 186964\n", - "1 chr1 267979 268128\n", - "2 chr1 850424 850734\n", - "3 chr1 869711 870021\n", - "4 chr1 912864 913174\n", - "5 chr1 919635 919945\n", - "6 chr1 931626 
931936\n", - "7 chr1 938195 938352\n", - "8 chr1 951414 951724\n", - "9 chr1 976080 976390" + " chrom start end\n", + "0 chr1 186654 186964\n", + "1 chr1 267979 268128\n", + "2 chr1 850424 850734\n", + "3 chr1 869711 870021\n", + "4 chr1 912864 913174\n", + "5 chr1 919635 919945\n", + "6 chr1 931626 931936\n", + "7 chr1 938195 938352\n", + "8 chr1 951414 951724\n", + "9 chr1 976080 976390" ] }, "execution_count": 8, @@ -1383,26 +1644,19 @@ } ], "source": [ - "# Define GIQL query - basic merge\n", "giql_query = \"\"\"\n", " SELECT MERGE(interval)\n", " FROM features_b\n", "\"\"\"\n", "\n", - "# Transpile to SQL\n", - "sql = engine.transpile(giql_query)\n", + "sql = transpile(giql_query, tables=[\"features_b\"])\n", "print(\"Transpiled SQL:\")\n", - "print(\n", - " sqlparse.format(\n", - " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", - " )\n", - ")\n", + "print(sqlglot.transpile(sql, pretty=True)[0])\n", "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", "\n", - "# Execute with DuckDB via GIQL engine\n", - "cursor = engine.conn.execute(sql)\n", - "result = cursor.df() # Get result as pandas DataFrame\n", - "print(f\"Result: {len(result)} merged intervals (from {len(features_a)} original)\")\n", + "n_original = conn.execute(\"SELECT COUNT(*) FROM features_b\").fetchone()[0]\n", + "result = conn.execute(sql).fetchdf()\n", + "print(f\"Result: {len(result)} merged intervals (from {n_original} original)\")\n", "result.head(10)" ] }, @@ -1410,9 +1664,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### MERGE with aggregation CTE\n", + "### MERGE with distance parameter\n", "\n", - "Compute statistics on merged intervals and filter for intervals that merged multiple features." + "Merge intervals within 1000bp of each other." 
] }, { @@ -1425,46 +1679,44 @@ "output_type": "stream", "text": [ "Transpiled SQL:\n", - "WITH merged_intervals AS\n", - " (SELECT\n", - " \"chromosome\",\n", - " MIN(\"start_pos\") AS start_pos,\n", - " MAX(\"end_pos\") AS end_pos,\n", - " COUNT(*) AS interval_count,\n", - " AVG(signal_value) AS avg_signal\n", - " FROM\n", - " (SELECT\n", + "WITH merged_intervals AS (\n", + " SELECT\n", + " \"chrom\",\n", + " MIN(\"start\") AS start,\n", + " MAX(\"end\") AS end,\n", + " COUNT(*) AS interval_count,\n", + " AVG(signal_value) AS avg_signal\n", + " FROM (\n", + " SELECT\n", + " *,\n", + " SUM(is_new_cluster) OVER (PARTITION BY \"chrom\" ORDER BY \"start\" NULLS LAST) AS __giql_cluster_id\n", + " FROM (\n", + " SELECT\n", " *,\n", - " SUM(is_new_cluster) OVER (PARTITION BY \"chromosome\"\n", - " ORDER BY \"start_pos\" NULLS LAST) AS __giql_cluster_id\n", - " FROM\n", - " (SELECT\n", - " *,\n", - " CASE\n", - " WHEN LAG(\"end_pos\") OVER (PARTITION BY \"chromosome\"\n", - " ORDER BY \"start_pos\" NULLS LAST) >= \"start_pos\" THEN 0\n", - " ELSE 1\n", - " END AS is_new_cluster\n", - " FROM features_b) AS lag_calc) AS clustered\n", - " GROUP BY\n", - " chromosome,\n", - " __giql_cluster_id\n", - " ORDER BY\n", - " \"chromosome\" NULLS LAST,\n", - " \"start_pos\" NULLS LAST)\n", - "SELECT *\n", + " CASE\n", + " WHEN LAG(\"end\") OVER (PARTITION BY \"chrom\" ORDER BY \"start\" NULLS LAST) >= \"start\"\n", + " THEN 0\n", + " ELSE 1\n", + " END AS is_new_cluster\n", + " FROM features_b\n", + " ) AS lag_calc\n", + " ) AS clustered\n", + " GROUP BY\n", + " chrom,\n", + " __giql_cluster_id\n", + " ORDER BY\n", + " \"chrom\" NULLS LAST,\n", + " \"start\" NULLS LAST\n", + ")\n", + "SELECT\n", + " *\n", "FROM merged_intervals\n", - "WHERE interval_count > 1\n", + "WHERE\n", + " interval_count > 1\n", "\n", "================================================================================\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Result: 
559 merged intervals with more than 1 original interval\n" + "\n", + "Result: 559 merged intervals with more than 1 original interval\n" ] }, { @@ -1488,9 +1740,9 @@ " \n", " \n", " \n", - " chromosome\n", - " start_pos\n", - " end_pos\n", + " chrom\n", + " start\n", + " end\n", " interval_count\n", " avg_signal\n", " \n", @@ -1581,17 +1833,17 @@ "" ], "text/plain": [ - " chromosome start_pos end_pos interval_count avg_signal\n", - "0 chr1 1300073 1300592 2 14.844510\n", - "1 chr1 8374912 8375330 2 28.608395\n", - "2 chr1 9717307 9717819 2 13.833490\n", - "3 chr1 14782921 14783438 2 7.107920\n", - "4 chr1 18362141 18362677 2 9.312525\n", - "5 chr1 19445773 19446283 2 16.590025\n", - "6 chr1 23958874 23959385 2 10.616965\n", - "7 chr1 24157968 24158482 2 11.985625\n", - "8 chr1 27392817 27393359 2 11.789830\n", - "9 chr1 30106143 30106664 2 31.694800" + " chrom start end interval_count avg_signal\n", + "0 chr1 1300073 1300592 2 14.844510\n", + "1 chr1 8374912 8375330 2 28.608395\n", + "2 chr1 9717307 9717819 2 13.833490\n", + "3 chr1 14782921 14783438 2 7.107920\n", + "4 chr1 18362141 18362677 2 9.312525\n", + "5 chr1 19445773 19446283 2 16.590025\n", + "6 chr1 23958874 23959385 2 10.616965\n", + "7 chr1 24157968 24158482 2 11.985625\n", + "8 chr1 27392817 27393359 2 11.789830\n", + "9 chr1 30106143 30106664 2 31.694800" ] }, "execution_count": 9, @@ -1600,7 +1852,6 @@ } ], "source": [ - "# Define GIQL query with aggregations in a CTE\n", "giql_query = \"\"\"\n", " WITH merged_intervals AS (\n", " SELECT\n", @@ -1614,19 +1865,12 @@ " WHERE interval_count > 1\n", "\"\"\"\n", "\n", - "# Transpile to SQL\n", - "sql = engine.transpile(giql_query)\n", + "sql = transpile(giql_query, tables=[\"features_b\"])\n", "print(\"Transpiled SQL:\")\n", - "print(\n", - " sqlparse.format(\n", - " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", - " )\n", - ")\n", + "print(sqlglot.transpile(sql, pretty=True)[0])\n", "print(\"\\n\" + \"=\" * 80 + 
\"\\n\")\n", "\n", - "# Execute with DuckDB via GIQL engine\n", - "cursor = engine.conn.execute(sql)\n", - "result = cursor.df() # Get result as pandas DataFrame\n", + "result = conn.execute(sql).fetchdf()\n", "print(f\"Result: {len(result)} merged intervals with more than 1 original interval\")\n", "result.head(10)" ] @@ -1635,9 +1879,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### MERGE with distance parameter\n", + "---\n", "\n", - "Merge intervals within 1000bp of each other." + "## 5. CLUSTER operator\n", + "\n", + "**Assign cluster IDs** to overlapping intervals from features_a.\n", + "\n", + "Similar to `bedtools cluster`, this groups overlapping genomic intervals.\n", + "\n", + "**GIQL Query**: `SELECT *, CLUSTER(interval) AS cluster_id FROM features_a`" ] }, { @@ -1651,34 +1901,35 @@ "text": [ "Transpiled SQL:\n", "SELECT\n", - " \"chromosome\",\n", - " MIN(\"start_pos\") AS start_pos,\n", - " MAX(\"end_pos\") AS end_pos\n", - "FROM\n", - " (SELECT\n", - " *,\n", - " SUM(is_new_cluster) OVER (PARTITION BY \"chromosome\"\n", - " ORDER BY \"start_pos\" NULLS LAST) AS __giql_cluster_id\n", - " FROM\n", - " (SELECT\n", - " *,\n", - " CASE\n", - " WHEN LAG(\"end_pos\") OVER (PARTITION BY \"chromosome\"\n", - " ORDER BY \"start_pos\" NULLS LAST) + 10000 >= \"start_pos\" THEN 0\n", - " ELSE 1\n", - " END AS is_new_cluster\n", - " FROM features_b) AS lag_calc) AS clustered\n", + " \"chrom\",\n", + " MIN(\"start\") AS start,\n", + " MAX(\"end\") AS end\n", + "FROM (\n", + " SELECT\n", + " *,\n", + " SUM(is_new_cluster) OVER (PARTITION BY \"chrom\" ORDER BY \"start\" NULLS LAST) AS __giql_cluster_id\n", + " FROM (\n", + " SELECT\n", + " *,\n", + " CASE\n", + " WHEN LAG(\"end\") OVER (PARTITION BY \"chrom\" ORDER BY \"start\" NULLS LAST) + 10000 >= \"start\"\n", + " THEN 0\n", + " ELSE 1\n", + " END AS is_new_cluster\n", + " FROM features_b\n", + " ) AS lag_calc\n", + ") AS clustered\n", "GROUP BY\n", - " chromosome,\n", + " chrom,\n", " 
__giql_cluster_id\n", "ORDER BY\n", - " \"chromosome\" NULLS LAST,\n", - " \"start_pos\" NULLS LAST\n", + " \"chrom\" NULLS LAST,\n", + " \"start\" NULLS LAST\n", "\n", "================================================================================\n", "\n", - "Result: 34097 merged intervals (1000bp distance)\n", - "Compared to: 60893 original intervals\n" + "Result: 34097 merged intervals (10000bp distance)\n", + "Compared to: 46196 original intervals\n" ] }, { @@ -1702,9 +1953,9 @@ " \n", " \n", " \n", - " chromosome\n", - " start_pos\n", - " end_pos\n", + " chrom\n", + " start\n", + " end\n", " \n", " \n", " \n", @@ -1773,17 +2024,17 @@ "" ], "text/plain": [ - " chromosome start_pos end_pos\n", - "0 chr1 186654 186964\n", - "1 chr1 267979 268128\n", - "2 chr1 850424 850734\n", - "3 chr1 869711 870021\n", - "4 chr1 912864 919945\n", - "5 chr1 931626 938352\n", - "6 chr1 951414 951724\n", - "7 chr1 976080 984477\n", - "8 chr1 1013987 1015656\n", - "9 chr1 1063787 1064097" + " chrom start end\n", + "0 chr1 186654 186964\n", + "1 chr1 267979 268128\n", + "2 chr1 850424 850734\n", + "3 chr1 869711 870021\n", + "4 chr1 912864 919945\n", + "5 chr1 931626 938352\n", + "6 chr1 951414 951724\n", + "7 chr1 976080 984477\n", + "8 chr1 1013987 1015656\n", + "9 chr1 1063787 1064097" ] }, "execution_count": 10, @@ -1792,27 +2043,20 @@ } ], "source": [ - "# Define GIQL query - merge with distance parameter\n", "giql_query = \"\"\"\n", " SELECT MERGE(interval, 10000)\n", " FROM features_b\n", "\"\"\"\n", "\n", - "# Transpile to SQL\n", - "sql = engine.transpile(giql_query)\n", + "sql = transpile(giql_query, tables=[\"features_b\"])\n", "print(\"Transpiled SQL:\")\n", - "print(\n", - " sqlparse.format(\n", - " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", - " )\n", - ")\n", + "print(sqlglot.transpile(sql, pretty=True)[0])\n", "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", "\n", - "# Execute with DuckDB via GIQL engine\n", - "cursor = 
engine.conn.execute(sql)\n", - "result = cursor.df() # Get result as pandas DataFrame\n", - "print(f\"Result: {len(result)} merged intervals (1000bp distance)\")\n", - "print(f\"Compared to: {len(features_a)} original intervals\")\n", + "n_original = conn.execute(\"SELECT COUNT(*) FROM features_b\").fetchone()[0]\n", + "result = conn.execute(sql).fetchdf()\n", + "print(f\"Result: {len(result)} merged intervals (10000bp distance)\")\n", + "print(f\"Compared to: {n_original} original intervals\")\n", "result.head(10)" ] }, @@ -1820,15 +2064,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "---\n", - "\n", - "## 5. CLUSTER operator\n", - "\n", - "**Assign cluster IDs** to overlapping intervals from features_a.\n", - "\n", - "Similar to `bedtools cluster`, this groups overlapping genomic intervals.\n", + "### CLUSTER with distance parameter\n", "\n", - "**GIQL Query**: `SELECT *, CLUSTER(interval) AS cluster_id FROM features_a`" + "Cluster intervals within 1000bp of each other." ] }, { @@ -1842,27 +2080,27 @@ "text": [ "Transpiled SQL:\n", "SELECT\n", - " chromosome,\n", - " start_pos,\n", - " end_pos,\n", + " chrom,\n", + " start,\n", + " \"end\",\n", " signal_value,\n", - " SUM(is_new_cluster) OVER (PARTITION BY \"chromosome\"\n", - " ORDER BY \"start_pos\" NULLS LAST) AS cluster_id\n", - "FROM\n", - " (SELECT\n", - " chromosome,\n", - " start_pos,\n", - " end_pos,\n", - " signal_value,\n", - " CASE\n", - " WHEN LAG(\"end_pos\") OVER (PARTITION BY \"chromosome\"\n", - " ORDER BY \"start_pos\" NULLS LAST) >= \"start_pos\" THEN 0\n", - " ELSE 1\n", - " END AS is_new_cluster\n", - " FROM features_a) AS lag_calc\n", + " SUM(is_new_cluster) OVER (PARTITION BY \"chrom\" ORDER BY \"start\" NULLS LAST) AS cluster_id\n", + "FROM (\n", + " SELECT\n", + " chrom,\n", + " start,\n", + " \"end\",\n", + " signal_value,\n", + " CASE\n", + " WHEN LAG(\"end\") OVER (PARTITION BY \"chrom\" ORDER BY \"start\" NULLS LAST) >= \"start\"\n", + " THEN 0\n", + " ELSE 1\n", + " END 
AS is_new_cluster\n", + " FROM features_a\n", + ") AS lag_calc\n", "ORDER BY\n", - " chromosome,\n", - " start_pos\n", + " chrom,\n", + " start\n", "\n", "================================================================================\n", "\n", @@ -1891,9 +2129,9 @@ " \n", " \n", " \n", - " chromosome\n", - " start_pos\n", - " end_pos\n", + " chrom\n", + " start\n", + " end\n", " signal_value\n", " cluster_id\n", " \n", @@ -1984,17 +2222,17 @@ "" ], "text/plain": [ - " chromosome start_pos end_pos signal_value cluster_id\n", - "0 chr1 181368 181564 27.54796 1.0\n", - "1 chr1 186650 186846 19.93098 2.0\n", - "2 chr1 267909 268105 88.00863 3.0\n", - "3 chr1 586106 586302 35.02027 4.0\n", - "4 chr1 729261 729457 23.29415 5.0\n", - "5 chr1 778812 779008 56.97663 6.0\n", - "6 chr1 850473 850669 27.08243 7.0\n", - "7 chr1 858056 858252 15.55869 8.0\n", - "8 chr1 869860 869991 168.87294 9.0\n", - "9 chr1 904689 904883 167.56897 10.0" + " chrom start end signal_value cluster_id\n", + "0 chr1 181368 181564 27.54796 1.0\n", + "1 chr1 186650 186846 19.93098 2.0\n", + "2 chr1 267909 268105 88.00863 3.0\n", + "3 chr1 586106 586302 35.02027 4.0\n", + "4 chr1 729261 729457 23.29415 5.0\n", + "5 chr1 778812 779008 56.97663 6.0\n", + "6 chr1 850473 850669 27.08243 7.0\n", + "7 chr1 858056 858252 15.55869 8.0\n", + "8 chr1 869860 869991 168.87294 9.0\n", + "9 chr1 904689 904883 167.56897 10.0" ] }, "execution_count": 11, @@ -2003,31 +2241,23 @@ } ], "source": [ - "# Define GIQL query - basic clustering\n", "giql_query = \"\"\"\n", " SELECT\n", - " chromosome,\n", - " start_pos,\n", - " end_pos,\n", + " chrom,\n", + " start,\n", + " \"end\",\n", " signal_value,\n", " CLUSTER(interval) AS cluster_id\n", " FROM features_a\n", - " ORDER BY chromosome, start_pos\n", + " ORDER BY chrom, start\n", "\"\"\"\n", "\n", - "# Transpile to SQL\n", - "sql = engine.transpile(giql_query)\n", + "sql = transpile(giql_query, tables=[\"features_a\"])\n", "print(\"Transpiled SQL:\")\n", - "print(\n", - 
" sqlparse.format(\n", - " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", - " )\n", - ")\n", + "print(sqlglot.transpile(sql, pretty=True)[0])\n", "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", "\n", - "# Execute with DuckDB via GIQL engine\n", - "cursor = engine.conn.execute(sql)\n", - "result = cursor.df() # Get result as pandas DataFrame\n", + "result = conn.execute(sql).fetchdf()\n", "print(f\"Result: {len(result)} intervals with cluster assignments\")\n", "print(f\"Number of unique clusters: {result['cluster_id'].nunique()}\")\n", "result.head(10)" @@ -2037,9 +2267,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### CLUSTER with distance parameter\n", + "---\n", "\n", - "Cluster intervals within 1000bp of each other." + "## 6. DISTANCE operator\n", + "\n", + "**Calculate genomic distances** between intervals from `features_a` and `features_b`.\n", + "\n", + "Similar to `bedtools closest -d`, this calculates the distance in base pairs between genomic intervals.\n", + "\n", + "**GIQL Query**: `SELECT a.*, b.*, DISTANCE(a.interval, b.interval) AS distance FROM features_a a, features_b b`" ] }, { @@ -2053,31 +2289,31 @@ "text": [ "Transpiled SQL:\n", "SELECT\n", - " chromosome,\n", - " start_pos,\n", - " end_pos,\n", + " chrom,\n", + " start,\n", + " \"end\",\n", " signal_value,\n", - " SUM(is_new_cluster) OVER (PARTITION BY \"chromosome\"\n", - " ORDER BY \"start_pos\" NULLS LAST) AS cluster_id\n", - "FROM\n", - " (SELECT\n", - " chromosome,\n", - " start_pos,\n", - " end_pos,\n", - " signal_value,\n", - " CASE\n", - " WHEN LAG(\"end_pos\") OVER (PARTITION BY \"chromosome\"\n", - " ORDER BY \"start_pos\" NULLS LAST) + 50000 >= \"start_pos\" THEN 0\n", - " ELSE 1\n", - " END AS is_new_cluster\n", - " FROM features_a) AS lag_calc\n", + " SUM(is_new_cluster) OVER (PARTITION BY \"chrom\" ORDER BY \"start\" NULLS LAST) AS cluster_id\n", + "FROM (\n", + " SELECT\n", + " chrom,\n", + " start,\n", + " \"end\",\n", + " 
signal_value,\n", + " CASE\n", + " WHEN LAG(\"end\") OVER (PARTITION BY \"chrom\" ORDER BY \"start\" NULLS LAST) + 50000 >= \"start\"\n", + " THEN 0\n", + " ELSE 1\n", + " END AS is_new_cluster\n", + " FROM features_a\n", + ") AS lag_calc\n", "ORDER BY\n", - " chromosome,\n", - " start_pos\n", + " chrom,\n", + " start\n", "\n", "================================================================================\n", "\n", - "Result: 60893 intervals with cluster assignments (1000bp distance)\n", + "Result: 60893 intervals with cluster assignments (50000bp distance)\n", "Number of unique clusters: 1376\n" ] }, @@ -2102,9 +2338,9 @@ " \n", " \n", " \n", - " chromosome\n", - " start_pos\n", - " end_pos\n", + " chrom\n", + " start\n", + " end\n", " signal_value\n", " cluster_id\n", " \n", @@ -2195,17 +2431,17 @@ "" ], "text/plain": [ - " chromosome start_pos end_pos signal_value cluster_id\n", - "0 chr1 181368 181564 27.54796 1.0\n", - "1 chr1 186650 186846 19.93098 1.0\n", - "2 chr1 267909 268105 88.00863 2.0\n", - "3 chr1 586106 586302 35.02027 3.0\n", - "4 chr1 729261 729457 23.29415 4.0\n", - "5 chr1 778812 779008 56.97663 4.0\n", - "6 chr1 850473 850669 27.08243 5.0\n", - "7 chr1 858056 858252 15.55869 5.0\n", - "8 chr1 869860 869991 168.87294 5.0\n", - "9 chr1 904689 904883 167.56897 5.0" + " chrom start end signal_value cluster_id\n", + "0 chr1 181368 181564 27.54796 1.0\n", + "1 chr1 186650 186846 19.93098 1.0\n", + "2 chr1 267909 268105 88.00863 2.0\n", + "3 chr1 586106 586302 35.02027 3.0\n", + "4 chr1 729261 729457 23.29415 4.0\n", + "5 chr1 778812 779008 56.97663 4.0\n", + "6 chr1 850473 850669 27.08243 5.0\n", + "7 chr1 858056 858252 15.55869 5.0\n", + "8 chr1 869860 869991 168.87294 5.0\n", + "9 chr1 904689 904883 167.56897 5.0" ] }, "execution_count": 12, @@ -2214,32 +2450,24 @@ } ], "source": [ - "# Define GIQL query with distance parameter\n", "giql_query = \"\"\"\n", " SELECT\n", - " chromosome,\n", - " start_pos,\n", - " end_pos,\n", + " chrom,\n", + " 
start,\n", + " \"end\",\n", " signal_value,\n", " CLUSTER(interval, 50000) AS cluster_id\n", " FROM features_a\n", - " ORDER BY chromosome, start_pos\n", + " ORDER BY chrom, start\n", "\"\"\"\n", "\n", - "# Transpile to SQL\n", - "sql = engine.transpile(giql_query)\n", + "sql = transpile(giql_query, tables=[\"features_a\"])\n", "print(\"Transpiled SQL:\")\n", - "print(\n", - " sqlparse.format(\n", - " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", - " )\n", - ")\n", + "print(sqlglot.transpile(sql, pretty=True)[0])\n", "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", "\n", - "# Execute with DuckDB via GIQL engine\n", - "cursor = engine.conn.execute(sql)\n", - "result = cursor.df() # Get result as pandas DataFrame\n", - "print(f\"Result: {len(result)} intervals with cluster assignments (1000bp distance)\")\n", + "result = conn.execute(sql).fetchdf()\n", + "print(f\"Result: {len(result)} intervals with cluster assignments (50000bp distance)\")\n", "print(f\"Number of unique clusters: {result['cluster_id'].nunique()}\")\n", "result.head(10)" ] @@ -2248,15 +2476,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "---\n", - "\n", - "## 6. DISTANCE operator\n", - "\n", - "**Calculate genomic distances** between intervals from `features_a` and `features_b`.\n", - "\n", - "Similar to `bedtools closest -d`, this calculates the distance in base pairs between genomic intervals.\n", + "### Signed Distance (Directional)\n", "\n", - "**GIQL Query**: `SELECT a.*, b.*, DISTANCE(a.interval, b.interval) AS distance FROM features_a a, features_b b`" + "Calculate **directional distances** where negative values indicate upstream features and positive values indicate downstream features, similar to `bedtools closest -D ref`." 
] }, { @@ -2269,49 +2491,68 @@ "output_type": "stream", "text": [ "Transpiled SQL:\n", - "WITH distances AS\n", - " (SELECT\n", - " a.chromosome,\n", - " a.start_pos AS a_start,\n", - " a.end_pos AS a_end,\n", - " b.start_pos AS b_start,\n", - " b.end_pos AS b_end,\n", - " CASE\n", - " WHEN a.\"chromosome\" != b.\"chromosome\" THEN NULL\n", - " WHEN a.\"start_pos\" < b.\"end_pos\"\n", - " AND a.\"end_pos\" > b.\"start_pos\" THEN 0\n", - " WHEN a.\"end_pos\" <= b.\"start_pos\" THEN (b.\"start_pos\" - a.\"end_pos\")\n", - " ELSE (a.\"start_pos\" - b.\"end_pos\")\n", - " END AS signed_distance,\n", - " ROW_NUMBER() OVER (PARTITION BY a.chromosome, a.start_pos, a.end_pos\n", - " ORDER BY ABS(CASE\n", - " WHEN a.\"chromosome\" != b.\"chromosome\" THEN NULL\n", - " WHEN a.\"start_pos\" < b.\"end_pos\"\n", - " AND a.\"end_pos\" > b.\"start_pos\" THEN 0\n", - " WHEN a.\"end_pos\" <= b.\"start_pos\" THEN (b.\"start_pos\" - a.\"end_pos\")\n", - " ELSE (a.\"start_pos\" - b.\"end_pos\")\n", - " END)) AS rank\n", - " FROM features_a AS a\n", - " CROSS JOIN features_b AS b\n", - " WHERE a.chromosome = b.chromosome\n", - " AND a.chromosome = 'chr1'\n", - " AND a.start_pos < 500000)\n", + "WITH distances AS (\n", + " SELECT\n", + " a.chrom,\n", + " a.start AS a_start,\n", + " a.\"end\" AS a_end,\n", + " b.start AS b_start,\n", + " b.\"end\" AS b_end,\n", + " CASE\n", + " WHEN a.\"chrom\" <> b.\"chrom\"\n", + " THEN NULL\n", + " WHEN a.\"start\" < b.\"end\" AND a.\"end\" > b.\"start\"\n", + " THEN 0\n", + " WHEN a.\"end\" <= b.\"start\"\n", + " THEN (\n", + " b.\"start\" - a.\"end\"\n", + " )\n", + " ELSE -(\n", + " a.\"start\" - b.\"end\"\n", + " )\n", + " END AS signed_distance,\n", + " ROW_NUMBER() OVER (\n", + " PARTITION BY a.chrom, a.start, a.\"end\"\n", + " ORDER BY ABS(\n", + " CASE\n", + " WHEN a.\"chrom\" <> b.\"chrom\"\n", + " THEN NULL\n", + " WHEN a.\"start\" < b.\"end\" AND a.\"end\" > b.\"start\"\n", + " THEN 0\n", + " WHEN a.\"end\" <= b.\"start\"\n", + " THEN (\n", 
+ " b.\"start\" - a.\"end\"\n", + " )\n", + " ELSE -(\n", + " a.\"start\" - b.\"end\"\n", + " )\n", + " END\n", + " )\n", + " ) AS rank\n", + " FROM features_a AS a\n", + " CROSS JOIN features_b AS b\n", + " WHERE\n", + " a.chrom = b.chrom AND a.chrom = 'chr1' AND a.start < 500000\n", + ")\n", "SELECT\n", - " chromosome,\n", + " chrom,\n", " a_start,\n", " a_end,\n", " b_start,\n", " b_end,\n", " signed_distance,\n", " CASE\n", - " WHEN signed_distance < 0 THEN 'upstream'\n", - " WHEN signed_distance > 0 THEN 'downstream'\n", - " ELSE 'overlap'\n", + " WHEN signed_distance < 0\n", + " THEN 'upstream'\n", + " WHEN signed_distance > 0\n", + " THEN 'downstream'\n", + " ELSE 'overlap'\n", " END AS direction\n", "FROM distances\n", - "WHERE rank = 1\n", + "WHERE\n", + " rank = 1\n", "ORDER BY\n", - " chromosome,\n", + " chrom,\n", " a_start\n", "\n", "================================================================================\n", @@ -2343,7 +2584,7 @@ " \n", " \n", " \n", - " chromosome\n", + " chrom\n", " a_start\n", " a_end\n", " b_start\n", @@ -2388,10 +2629,10 @@ "" ], "text/plain": [ - " chromosome a_start a_end b_start b_end signed_distance direction\n", - "0 chr1 181368 181564 186654 186964 5090 downstream\n", - "1 chr1 186650 186846 186654 186964 0 overlap\n", - "2 chr1 267909 268105 267979 268128 0 overlap" + " chrom a_start a_end b_start b_end signed_distance direction\n", + "0 chr1 181368 181564 186654 186964 5090 downstream\n", + "1 chr1 186650 186846 186654 186964 0 overlap\n", + "2 chr1 267909 268105 267979 268128 0 overlap" ] }, "execution_count": 13, @@ -2400,58 +2641,48 @@ } ], "source": [ - "# Define GIQL query with signed distance\n", - "# Signed distance (signed=true) returns negative values for upstream features and positive for downstream\n", - "# This is similar to bedtools closest -D ref\n", "giql_query = \"\"\"\n", " WITH distances AS (\n", " SELECT\n", - " a.chromosome,\n", - " a.start_pos AS a_start,\n", - " a.end_pos AS a_end,\n", - " 
b.start_pos AS b_start,\n", - " b.end_pos AS b_end,\n", + " a.chrom,\n", + " a.start AS a_start,\n", + " a.\"end\" AS a_end,\n", + " b.start AS b_start,\n", + " b.\"end\" AS b_end,\n", " DISTANCE(a.interval, b.interval, signed=true) AS signed_distance,\n", " ROW_NUMBER() OVER (\n", - " PARTITION BY a.chromosome, a.start_pos, a.end_pos\n", + " PARTITION BY a.chrom, a.start, a.\"end\"\n", " ORDER BY ABS(DISTANCE(a.interval, b.interval, signed=true))\n", " ) AS rank\n", " FROM features_a a\n", " CROSS JOIN features_b b\n", - " WHERE a.chromosome = b.chromosome\n", - " AND a.chromosome = 'chr1'\n", - " AND a.start_pos < 500000\n", + " WHERE a.chrom = b.chrom\n", + " AND a.chrom = 'chr1'\n", + " AND a.start < 500000\n", " )\n", - " SELECT \n", - " chromosome,\n", + " SELECT\n", + " chrom,\n", " a_start,\n", " a_end,\n", " b_start,\n", " b_end,\n", " signed_distance,\n", - " CASE \n", + " CASE\n", " WHEN signed_distance < 0 THEN 'upstream'\n", " WHEN signed_distance > 0 THEN 'downstream'\n", " ELSE 'overlap'\n", " END AS direction\n", " FROM distances\n", " WHERE rank = 1\n", - " ORDER BY chromosome, a_start\n", + " ORDER BY chrom, a_start\n", "\"\"\"\n", "\n", - "# Transpile to SQL\n", - "sql = engine.transpile(giql_query)\n", + "sql = transpile(giql_query, tables=[\"features_a\", \"features_b\"])\n", "print(\"Transpiled SQL:\")\n", - "print(\n", - " sqlparse.format(\n", - " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", - " )\n", - ")\n", + "print(sqlglot.transpile(sql, pretty=True)[0])\n", "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", "\n", - "# Execute with DuckDB via GIQL engine\n", - "cursor = engine.conn.execute(sql)\n", - "result = cursor.df() # Get result as pandas DataFrame\n", + "result = conn.execute(sql).fetchdf()\n", "print(f\"Result: {len(result)} features with directional distances\")\n", "print(f\"Upstream features: {(result['signed_distance'] < 0).sum()}\")\n", "print(f\"Downstream features: {(result['signed_distance'] 
> 0).sum()}\")\n", @@ -2463,9 +2694,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Signed Distance (Directional)\n", + "---\n", "\n", - "Calculate **directional distances** where negative values indicate upstream features and positive values indicate downstream features, similar to `bedtools closest -D ref`." + "## 7. NEAREST operator\n", + "\n", + "**Find the k-nearest genomic features** using the NEAREST operator.\n", + "\n", + "Similar to `bedtools closest`, this finds the closest features to query intervals. The NEAREST operator eliminates the need to write complex window functions for k-nearest neighbor queries.\n", + "\n", + "**Key Features**:\n", + "- Find k-nearest features (k=1 for closest, k>1 for multiple neighbors)\n", + "- Directional queries with `signed=true` (upstream vs downstream)\n", + "- Distance-constrained queries with `max_distance`\n", + "- Strand-specific queries with `stranded=true`\n", + "- Implicit reference resolution in LATERAL joins (automatically uses outer table's interval)\n", + "\n", + "**Database Support**:\n", + "- **DuckDB**: Full support (LATERAL joins)\n", + "- **SQLite**: Standalone mode only (no LATERAL support)\n", + "\n", + "**GIQL Query**: `SELECT * FROM peaks CROSS JOIN LATERAL NEAREST(genes, k=3)`" ] }, { @@ -2479,28 +2727,33 @@ "text": [ "Transpiled SQL:\n", "SELECT\n", - " a.chromosome,\n", - " a.start_pos AS a_start,\n", - " a.end_pos AS a_end,\n", - " b.start_pos AS b_start,\n", - " b.end_pos AS b_end,\n", + " a.chrom,\n", + " a.start AS a_start,\n", + " a.\"end\" AS a_end,\n", + " b.start AS b_start,\n", + " b.\"end\" AS b_end,\n", " CASE\n", - " WHEN a.\"chromosome\" != b.\"chromosome\" THEN NULL\n", - " WHEN a.\"start_pos\" < b.\"end_pos\"\n", - " AND a.\"end_pos\" > b.\"start_pos\" THEN 0\n", - " WHEN a.\"end_pos\" <= b.\"start_pos\" THEN (b.\"start_pos\" - a.\"end_pos\")\n", - " ELSE (a.\"start_pos\" - b.\"end_pos\")\n", + " WHEN a.\"chrom\" <> b.\"chrom\"\n", + " THEN NULL\n", + " WHEN 
a.\"start\" < b.\"end\" AND a.\"end\" > b.\"start\"\n", + " THEN 0\n", + " WHEN a.\"end\" <= b.\"start\"\n", + " THEN (\n", + " b.\"start\" - a.\"end\"\n", + " )\n", + " ELSE (\n", + " a.\"start\" - b.\"end\"\n", + " )\n", " END AS distance\n", - "FROM\n", - " features_a AS a,\n", - " features_b AS b\n", - "WHERE a.chromosome = b.chromosome\n", - " AND a.chromosome = 'chr1'\n", - " AND a.start_pos BETWEEN 180000 AND 190000\n", - " AND b.start_pos BETWEEN 180000 AND 200000\n", + "FROM features_a AS a, features_b AS b\n", + "WHERE\n", + " a.chrom = b.chrom\n", + " AND a.chrom = 'chr1'\n", + " AND a.start BETWEEN 180000 AND 190000\n", + " AND b.start BETWEEN 180000 AND 200000\n", "ORDER BY\n", - " a.start_pos,\n", - " b.start_pos\n", + " a.start,\n", + " b.start\n", "\n", "================================================================================\n", "\n", @@ -2530,7 +2783,7 @@ " \n", " \n", " \n", - " chromosome\n", + " chrom\n", " a_start\n", " a_end\n", " b_start\n", @@ -2562,9 +2815,9 @@ "" ], "text/plain": [ - " chromosome a_start a_end b_start b_end distance\n", - "0 chr1 181368 181564 186654 186964 5090\n", - "1 chr1 186650 186846 186654 186964 0" + " chrom a_start a_end b_start b_end distance\n", + "0 chr1 181368 181564 186654 186964 5090\n", + "1 chr1 186650 186846 186654 186964 0" ] }, "execution_count": 14, @@ -2573,38 +2826,28 @@ } ], "source": [ - "# Define GIQL query - basic distance calculation\n", - "# DISTANCE() returns 0 for overlapping intervals, positive integers for gaps between intervals\n", - "# Filters to a small region on chr1 for demonstration\n", "giql_query = \"\"\"\n", - " SELECT \n", - " a.chromosome,\n", - " a.start_pos AS a_start,\n", - " a.end_pos AS a_end,\n", - " b.start_pos AS b_start,\n", - " b.end_pos AS b_end,\n", + " SELECT\n", + " a.chrom,\n", + " a.start AS a_start,\n", + " a.\"end\" AS a_end,\n", + " b.start AS b_start,\n", + " b.\"end\" AS b_end,\n", " DISTANCE(a.interval, b.interval) AS distance\n", " FROM features_a 
a, features_b b\n", - " WHERE a.chromosome = b.chromosome \n", - " AND a.chromosome = 'chr1'\n", - " AND a.start_pos BETWEEN 180000 AND 190000\n", - " AND b.start_pos BETWEEN 180000 AND 200000\n", - " ORDER BY a.start_pos, b.start_pos\n", + " WHERE a.chrom = b.chrom\n", + " AND a.chrom = 'chr1'\n", + " AND a.start BETWEEN 180000 AND 190000\n", + " AND b.start BETWEEN 180000 AND 200000\n", + " ORDER BY a.start, b.start\n", "\"\"\"\n", "\n", - "# Transpile to SQL\n", - "sql = engine.transpile(giql_query)\n", + "sql = transpile(giql_query, tables=[\"features_a\", \"features_b\"])\n", "print(\"Transpiled SQL:\")\n", - "print(\n", - " sqlparse.format(\n", - " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", - " )\n", - ")\n", + "print(sqlglot.transpile(sql, pretty=True)[0])\n", "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", "\n", - "# Execute with DuckDB via GIQL engine\n", - "cursor = engine.conn.execute(sql)\n", - "result = cursor.df() # Get result as pandas DataFrame\n", + "result = conn.execute(sql).fetchdf()\n", "print(f\"Result: {len(result)} distance calculations\")\n", "print(\"\\nExample distances (0 = overlap, positive = gap in base pairs):\")\n", "result.head(10)" @@ -2614,26 +2857,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "---\n", - "\n", - "## 7. NEAREST operator\n", - "\n", - "**Find the k-nearest genomic features** using the NEAREST operator.\n", - "\n", - "Similar to `bedtools closest`, this finds the closest features to query intervals. 
The NEAREST operator eliminates the need to write complex window functions for k-nearest neighbor queries.\n", - "\n", - "**Key Features**:\n", - "- Find k-nearest features (k=1 for closest, k>1 for multiple neighbors)\n", - "- Directional queries with `signed=true` (upstream vs downstream)\n", - "- Distance-constrained queries with `max_distance`\n", - "- Strand-specific queries with `stranded=true`\n", - "- Implicit reference resolution in LATERAL joins (automatically uses outer table's interval)\n", - "\n", - "**Database Support**:\n", - "- **DuckDB**: Full support (LATERAL joins)\n", - "- **SQLite**: Standalone mode only (no LATERAL support)\n", + "### Finding the Nearest Feature\n", "\n", - "**GIQL Query**: `SELECT * FROM peaks CROSS JOIN LATERAL NEAREST(genes, k=3)`" + "Find the **closest feature** in B for each feature in A using `k=1`." ] }, { @@ -2647,41 +2873,56 @@ "output_type": "stream", "text": [ "Transpiled SQL:\n", "SELECT\n", - " a.chromosome,\n", - " a.start_pos AS a_start,\n", - " a.end_pos AS a_end,\n", + " a.chrom,\n", + " a.start AS a_start,\n", + " a.\"end\" AS a_end,\n", " a.signal_value AS a_signal,\n", - " b.start_pos AS b_start,\n", - " b.end_pos AS b_end,\n", + " b.start AS b_start,\n", + " b.\"end\" AS b_end,\n", " b.signal_value AS b_signal,\n", " distance\n", - "FROM\n", - " features_a AS a,\n", - "\n", - " (SELECT\n", - " features_b.*,\n", - " CASE\n", - " WHEN a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n", - " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n", - " AND 
a.\"end_pos\" > features_b.\"start_pos\" THEN 0\n", - " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n", - " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n", - " END)\n", - " LIMIT 1) AS b\n", - "WHERE a.chromosome = 'chr1'\n", - " AND a.start_pos < 1000000\n", + "FROM features_a AS a, (\n", + " SELECT\n", + " features_b.*,\n", + " CASE\n", + " WHEN a.\"chrom\" <> features_b.\"chrom\"\n", + " THEN NULL\n", + " WHEN a.\"start\" < features_b.\"end\" AND a.\"end\" > features_b.\"start\"\n", + " THEN 0\n", + " WHEN a.\"end\" <= features_b.\"start\"\n", + " THEN (\n", + " features_b.\"start\" - a.\"end\"\n", + " )\n", + " ELSE (\n", + " a.\"start\" - features_b.\"end\"\n", + " )\n", + " END AS distance\n", + " FROM features_b\n", + " WHERE\n", + " a.\"chrom\" = features_b.\"chrom\"\n", + " ORDER BY\n", + " ABS(\n", + " CASE\n", + " WHEN a.\"chrom\" <> features_b.\"chrom\"\n", + " THEN NULL\n", + " WHEN a.\"start\" < features_b.\"end\" AND a.\"end\" > features_b.\"start\"\n", + " THEN 0\n", + " WHEN a.\"end\" <= features_b.\"start\"\n", + " THEN (\n", + " features_b.\"start\" - a.\"end\"\n", + " )\n", + " ELSE (\n", + " a.\"start\" - features_b.\"end\"\n", + " )\n", + " END\n", + " )\n", + " LIMIT 1\n", + ") AS b\n", + "WHERE\n", + " a.chrom = 'chr1' AND a.start < 1000000\n", "ORDER BY\n", - " a.chromosome,\n", - " a.start_pos\n", + " a.chrom,\n", + " a.start\n", "\n", "================================================================================\n", "\n", @@ -2711,7 +2952,7 @@ " \n", " \n", " \n", - " chromosome\n", + " chrom\n", " a_start\n", " a_end\n", " a_signal\n", @@ -2837,17 +3078,17 @@ "" ], "text/plain": [ - " chromosome a_start a_end a_signal b_start b_end b_signal distance\n", - "0 chr1 181368 181564 27.54796 186654 186964 14.61908 5090\n", - "1 chr1 186650 186846 19.93098 186654 186964 14.61908 0\n", - "2 chr1 267909 268105 88.00863 267979 268128 49.39366 0\n", - "3 chr1 586106 586302 35.02027 850424 
850734 15.07029 264122\n", - "4 chr1 729261 729457 23.29415 850424 850734 15.07029 120967\n", - "5 chr1 778812 779008 56.97663 850424 850734 15.07029 71416\n", - "6 chr1 850473 850669 27.08243 850424 850734 15.07029 0\n", - "7 chr1 858056 858252 15.55869 850424 850734 15.07029 7322\n", - "8 chr1 869860 869991 168.87294 869711 870021 12.42156 0\n", - "9 chr1 904689 904883 167.56897 912864 913174 31.07865 7981" + " chrom a_start a_end a_signal b_start b_end b_signal distance\n", + "0 chr1 181368 181564 27.54796 186654 186964 14.61908 5090\n", + "1 chr1 186650 186846 19.93098 186654 186964 14.61908 0\n", + "2 chr1 267909 268105 88.00863 267979 268128 49.39366 0\n", + "3 chr1 586106 586302 35.02027 850424 850734 15.07029 264122\n", + "4 chr1 729261 729457 23.29415 850424 850734 15.07029 120967\n", + "5 chr1 778812 779008 56.97663 850424 850734 15.07029 71416\n", + "6 chr1 850473 850669 27.08243 850424 850734 15.07029 0\n", + "7 chr1 858056 858252 15.55869 850424 850734 15.07029 7322\n", + "8 chr1 869860 869991 168.87294 869711 870021 12.42156 0\n", + "9 chr1 904689 904883 167.56897 912864 913174 31.07865 7981" ] }, "execution_count": 15, @@ -2856,38 +3097,28 @@ } ], "source": [ - "# Define GIQL query - find closest feature in B for each feature in A\n", - "# This replicates bedtools closest -d functionality using NEAREST(target, k=1)\n", - "# The reference interval is automatically inferred from the outer table (features_a)\n", "giql_query = \"\"\"\n", - " SELECT \n", - " a.chromosome,\n", - " a.start_pos AS a_start,\n", - " a.end_pos AS a_end,\n", + " SELECT\n", + " a.chrom,\n", + " a.start AS a_start,\n", + " a.\"end\" AS a_end,\n", " a.signal_value AS a_signal,\n", - " b.start_pos AS b_start,\n", - " b.end_pos AS b_end,\n", + " b.start AS b_start,\n", + " b.\"end\" AS b_end,\n", " b.signal_value AS b_signal,\n", " distance\n", " FROM features_a a, NEAREST(features_b, k=1) b\n", - " WHERE a.chromosome = 'chr1'\n", - " AND a.start_pos < 1000000\n", - " ORDER BY 
a.chromosome, a.start_pos\n", + " WHERE a.chrom = 'chr1'\n", + " AND a.start < 1000000\n", + " ORDER BY a.chrom, a.start\n", "\"\"\"\n", "\n", - "# Transpile to SQL\n", - "sql = engine.transpile(giql_query)\n", + "sql = transpile(giql_query, tables=[\"features_a\", \"features_b\"])\n", "print(\"Transpiled SQL:\")\n", - "print(\n", - " sqlparse.format(\n", - " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", - " )\n", - ")\n", + "print(sqlglot.transpile(sql, pretty=True)[0])\n", "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", "\n", - "# Execute with DuckDB via GIQL engine\n", - "cursor = engine.conn.execute(sql)\n", - "result = cursor.df() # Get result as pandas DataFrame\n", + "result = conn.execute(sql).fetchdf()\n", "print(f\"Result: {len(result)} features from A with their closest feature in B\")\n", "print(f\"Average distance to closest feature: {result['distance'].mean():.1f} bp\")\n", "print(f\"Features overlapping with closest: {(result['distance'] == 0).sum()}\")\n", @@ -2898,9 +3129,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Finding k-Nearest Features\n", + "### Finding k-Nearest Features\n", "\n", - "Find the **3 closest features** in B for each feature in A using `k=3`." + "Find the **3 closest features** in B for each feature in A using `k=3`.\n", + "\n", + "Use `k` to control how many neighbors are returned for each query interval."
] }, { @@ -2914,41 +3147,56 @@ "text": [ "Transpiled SQL:\n", "SELECT\n", - " a.chromosome,\n", - " a.start_pos AS a_start,\n", - " a.end_pos AS a_end,\n", + " a.chrom,\n", + " a.start AS a_start,\n", + " a.\"end\" AS a_end,\n", " a.signal_value AS a_signal,\n", - " b.start_pos AS b_start,\n", - " b.end_pos AS b_end,\n", + " b.start AS b_start,\n", + " b.\"end\" AS b_end,\n", " b.signal_value AS b_signal,\n", " distance\n", - "FROM\n", - " features_a AS a,\n", - "\n", - " (SELECT\n", - " features_b.*,\n", - " CASE\n", - " WHEN a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n", - " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n", - " AND a.\"end_pos\" > features_b.\"start_pos\" THEN 0\n", - " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n", - " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n", - " END AS distance\n", - " FROM features_b\n", - " WHERE a.\"chromosome\" = features_b.\"chromosome\"\n", - " ORDER BY ABS(CASE\n", - " WHEN a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n", - " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n", - " AND a.\"end_pos\" > features_b.\"start_pos\" THEN 0\n", - " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n", - " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n", - " END)\n", - " LIMIT 3) AS b\n", - "WHERE a.chromosome = 'chr1'\n", - " AND a.start_pos < 500000\n", + "FROM features_a AS a, (\n", + " SELECT\n", + " features_b.*,\n", + " CASE\n", + " WHEN a.\"chrom\" <> features_b.\"chrom\"\n", + " THEN NULL\n", + " WHEN a.\"start\" < features_b.\"end\" AND a.\"end\" > features_b.\"start\"\n", + " THEN 0\n", + " WHEN a.\"end\" <= features_b.\"start\"\n", + " THEN (\n", + " features_b.\"start\" - a.\"end\"\n", + " )\n", + " ELSE (\n", + " a.\"start\" - features_b.\"end\"\n", + " )\n", + " END AS distance\n", + " FROM features_b\n", + " WHERE\n", + " a.\"chrom\" = features_b.\"chrom\"\n", + " ORDER BY\n", + " 
ABS(\n", + " CASE\n", + " WHEN a.\"chrom\" <> features_b.\"chrom\"\n", + " THEN NULL\n", + " WHEN a.\"start\" < features_b.\"end\" AND a.\"end\" > features_b.\"start\"\n", + " THEN 0\n", + " WHEN a.\"end\" <= features_b.\"start\"\n", + " THEN (\n", + " features_b.\"start\" - a.\"end\"\n", + " )\n", + " ELSE (\n", + " a.\"start\" - features_b.\"end\"\n", + " )\n", + " END\n", + " )\n", + " LIMIT 3\n", + ") AS b\n", + "WHERE\n", + " a.chrom = 'chr1' AND a.start < 500000\n", "ORDER BY\n", - " a.chromosome,\n", - " a.start_pos,\n", + " a.chrom,\n", + " a.start,\n", " distance\n", "\n", "================================================================================\n", @@ -2979,7 +3227,7 @@ " \n", " \n", " \n", - " chromosome\n", + " chrom\n", " a_start\n", " a_end\n", " a_signal\n", @@ -3094,16 +3342,16 @@ "" ], "text/plain": [ - " chromosome a_start a_end a_signal b_start b_end b_signal distance\n", - "0 chr1 181368 181564 27.54796 186654 186964 14.61908 5090\n", - "1 chr1 181368 181564 27.54796 267979 268128 49.39366 86415\n", - "2 chr1 181368 181564 27.54796 850424 850734 15.07029 668860\n", - "3 chr1 186650 186846 19.93098 186654 186964 14.61908 0\n", - "4 chr1 186650 186846 19.93098 267979 268128 49.39366 81133\n", - "5 chr1 186650 186846 19.93098 850424 850734 15.07029 663578\n", - "6 chr1 267909 268105 88.00863 267979 268128 49.39366 0\n", - "7 chr1 267909 268105 88.00863 186654 186964 14.61908 80945\n", - "8 chr1 267909 268105 88.00863 850424 850734 15.07029 582319" + " chrom a_start a_end a_signal b_start b_end b_signal distance\n", + "0 chr1 181368 181564 27.54796 186654 186964 14.61908 5090\n", + "1 chr1 181368 181564 27.54796 267979 268128 49.39366 86415\n", + "2 chr1 181368 181564 27.54796 850424 850734 15.07029 668860\n", + "3 chr1 186650 186846 19.93098 186654 186964 14.61908 0\n", + "4 chr1 186650 186846 19.93098 267979 268128 49.39366 81133\n", + "5 chr1 186650 186846 19.93098 850424 850734 15.07029 663578\n", + "6 chr1 267909 268105 88.00863 267979 
268128 49.39366 0\n", + "7 chr1 267909 268105 88.00863 186654 186964 14.61908 80945\n", + "8 chr1 267909 268105 88.00863 850424 850734 15.07029 582319" ] }, "execution_count": 16, @@ -3112,54 +3360,45 @@ } ], "source": [ - "# Define GIQL query - find 3 nearest features in B for each feature in A\n", "giql_query = \"\"\"\n", - " SELECT \n", - " a.chromosome,\n", - " a.start_pos AS a_start,\n", - " a.end_pos AS a_end,\n", + " SELECT\n", + " a.chrom,\n", + " a.start AS a_start,\n", + " a.\"end\" AS a_end,\n", " a.signal_value AS a_signal,\n", - " b.start_pos AS b_start,\n", - " b.end_pos AS b_end,\n", + " b.start AS b_start,\n", + " b.\"end\" AS b_end,\n", " b.signal_value AS b_signal,\n", " distance\n", " FROM features_a a, NEAREST(features_b, k=3) b\n", - " WHERE a.chromosome = 'chr1'\n", - " AND a.start_pos < 500000\n", - " ORDER BY a.chromosome, a.start_pos, distance\n", + " WHERE a.chrom = 'chr1'\n", + " AND a.start < 500000\n", + " ORDER BY a.chrom, a.start, distance\n", "\"\"\"\n", "\n", - "# Transpile to SQL\n", - "sql = engine.transpile(giql_query)\n", + "sql = transpile(giql_query, tables=[\"features_a\", \"features_b\"])\n", "print(\"Transpiled SQL:\")\n", - "print(\n", - " sqlparse.format(\n", - " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", - " )\n", - ")\n", + "print(sqlglot.transpile(sql, pretty=True)[0])\n", "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", "\n", - "# Execute with DuckDB via GIQL engine\n", - "cursor = engine.conn.execute(sql)\n", - "result = cursor.df() # Get result as pandas DataFrame\n", + "result = conn.execute(sql).fetchdf()\n", "\n", - "# Group by feature A to show k neighbors per feature\n", "grouped = result.groupby([\"a_start\", \"a_end\"]).size()\n", "print(f\"Result: {len(result)} total rows (up to 3 neighbors per feature in A)\")\n", "print(f\"Number of features from A: {len(grouped)}\")\n", "print(f\"Average neighbors per feature: {grouped.mean():.1f}\")\n", - "result.head(15) # Show first 5 
features with their 3 neighbors each" + "result.head(15)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Directional Queries with signed=true\n", + "### Directional Queries with signed=true\n", "\n", - "Find the **3 nearest upstream or downstream features** using `signed=true`.\n", + "Find the **3 nearest upstream or downstream features** using `signed=true`.\n", "\n", - "Negative distances indicate upstream features (B is before A), positive distances indicate downstream features (B is after A)." + "Negative distances indicate upstream features (B is before A), positive distances indicate downstream features (B is after A)." ] }, { @@ -3173,51 +3412,67 @@ "output_type": "stream", "text": [ "Transpiled SQL:\n", "SELECT\n", - " a.chromosome,\n", - " a.start_pos AS a_start,\n", - " a.end_pos AS a_end,\n", - " b.start_pos AS b_start,\n", - " b.end_pos AS b_end,\n", + " a.chrom,\n", + " a.start AS a_start,\n", + " a.\"end\" AS a_end,\n", + " b.start AS b_start,\n", + " b.\"end\" AS b_end,\n", " distance,\n", " CASE\n", - " WHEN distance < 0 THEN 'upstream'\n", - " WHEN distance > 0 THEN 'downstream'\n", - " ELSE 'overlap'\n", + " WHEN distance < 0\n", + " THEN 'upstream'\n", + " WHEN distance > 0\n", + " THEN 'downstream'\n", + " ELSE 'overlap'\n", " END AS direction\n", - "FROM\n", - " features_a AS a,\n", - "\n", - " (SELECT\n", - " features_b.*,\n", - " CASE\n", - " WHEN a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n", - " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n", - " AND a.\"end_pos\" > 
features_b.\"start_pos\" THEN 0\n", - " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n", - " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n", - " END)\n", - " LIMIT 3) AS b\n", - "WHERE a.chromosome = 'chr1'\n", - " AND a.start_pos < 500000\n", - " AND direction = 'upstream'\n", + "FROM features_a AS a, (\n", + " SELECT\n", + " features_b.*,\n", + " CASE\n", + " WHEN a.\"chrom\" <> features_b.\"chrom\"\n", + " THEN NULL\n", + " WHEN a.\"start\" < features_b.\"end\" AND a.\"end\" > features_b.\"start\"\n", + " THEN 0\n", + " WHEN a.\"end\" <= features_b.\"start\"\n", + " THEN (\n", + " features_b.\"start\" - a.\"end\"\n", + " )\n", + " ELSE -(\n", + " a.\"start\" - features_b.\"end\"\n", + " )\n", + " END AS distance\n", + " FROM features_b\n", + " WHERE\n", + " a.\"chrom\" = features_b.\"chrom\"\n", + " ORDER BY\n", + " ABS(\n", + " CASE\n", + " WHEN a.\"chrom\" <> features_b.\"chrom\"\n", + " THEN NULL\n", + " WHEN a.\"start\" < features_b.\"end\" AND a.\"end\" > features_b.\"start\"\n", + " THEN 0\n", + " WHEN a.\"end\" <= features_b.\"start\"\n", + " THEN (\n", + " features_b.\"start\" - a.\"end\"\n", + " )\n", + " ELSE -(\n", + " a.\"start\" - features_b.\"end\"\n", + " )\n", + " END\n", + " )\n", + " LIMIT 3\n", + ") AS b\n", + "WHERE\n", + " a.chrom = 'chr1' AND a.start < 500000 AND direction = 'upstream'\n", "ORDER BY\n", - " a.chromosome,\n", - " a.start_pos,\n", + " a.chrom,\n", + " a.start,\n", " ABS(distance)\n", "\n", "================================================================================\n", "\n", - "Result: 0 total nearest features\n", - "Upstream features (distance < 0): 0\n", + "Result: 1 total nearest features\n", + "Upstream features (distance < 0): 1\n", "Downstream features (distance > 0): 0\n", "Overlapping features (distance = 0): 0\n" ] @@ -3243,7 +3498,7 @@ " \n", " \n", " \n", - " chromosome\n", + " chrom\n", " a_start\n", " a_end\n", " b_start\n", @@ -3253,14 +3508,23 @@ " \n", 
" \n", " \n", + " \n", + " 0\n", + " chr1\n", + " 267909\n", + " 268105\n", + " 186654\n", + " 186964\n", + " -80945\n", + " upstream\n", + " \n", " \n", "\n", "" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [chromosome, a_start, a_end, b_start, b_end, distance, direction]\n", - "Index: []" + " chrom a_start a_end b_start b_end distance direction\n", + "0 chr1 267909 268105 186654 186964 -80945 upstream" ] }, "execution_count": 17, @@ -3269,15 +3533,13 @@ } ], "source": [ - "# Define GIQL query - find 3 nearest features with directional distances\n", - "# signed=true returns negative for upstream (B before A) and positive for downstream (B after A)\n", "giql_query = \"\"\"\n", - " SELECT \n", - " a.chromosome,\n", - " a.start_pos AS a_start,\n", - " a.end_pos AS a_end,\n", - " b.start_pos AS b_start,\n", - " b.end_pos AS b_end,\n", + " SELECT\n", + " a.chrom,\n", + " a.start AS a_start,\n", + " a.\"end\" AS a_end,\n", + " b.start AS b_start,\n", + " b.\"end\" AS b_end,\n", " distance,\n", " CASE\n", " WHEN distance < 0 THEN 'upstream'\n", @@ -3285,27 +3547,19 @@ " ELSE 'overlap'\n", " END AS direction\n", " FROM features_a a, NEAREST(features_b, k=3, signed=true) b\n", - " WHERE a.chromosome = 'chr1'\n", - " AND a.start_pos < 500000\n", + " WHERE a.chrom = 'chr1'\n", + " AND a.start < 500000\n", " AND direction = 'upstream'\n", - " ORDER BY a.chromosome, a.start_pos, ABS(distance)\n", + " ORDER BY a.chrom, a.start, ABS(distance)\n", "\"\"\"\n", "\n", - "# Transpile to SQL\n", - "sql = engine.transpile(giql_query)\n", + "sql = transpile(giql_query, tables=[\"features_a\", \"features_b\"])\n", "print(\"Transpiled SQL:\")\n", - "print(\n", - " sqlparse.format(\n", - " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", - " )\n", - ")\n", + "print(sqlglot.transpile(sql, pretty=True)[0])\n", "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", "\n", - "# Execute with DuckDB via GIQL engine\n", - "cursor = engine.conn.execute(sql)\n", - 
"result = cursor.df() # Get result as pandas DataFrame\n", + "result = conn.execute(sql).fetchdf()\n", "\n", - "# Analyze directionality\n", "upstream = (result[\"distance\"] < 0).sum()\n", "downstream = (result[\"distance\"] > 0).sum()\n", "overlap = (result[\"distance\"] == 0).sum()\n", @@ -3317,315 +3571,6 @@ "result.head(15)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Distance-Constrained Queries with max_distance\n", - "\n", - "Find up to **5 nearest features within 50kb** using `max_distance=50000`.\n", - "\n", - "This is useful for finding nearby regulatory elements or associated genes within a biologically relevant distance." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Transpiled SQL:\n", - "SELECT\n", - " a.chromosome,\n", - " a.start_pos AS a_start,\n", - " a.end_pos AS a_end,\n", - " b.start_pos AS b_start,\n", - " b.end_pos AS b_end,\n", - " distance\n", - "FROM\n", - " features_a AS a,\n", - "\n", - " (SELECT\n", - " features_b.*,\n", - " CASE\n", - " WHEN a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n", - " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n", - " AND a.\"end_pos\" > features_b.\"start_pos\" THEN 0\n", - " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n", - " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n", - " END AS distance\n", - " FROM features_b\n", - " WHERE a.\"chromosome\" = features_b.\"chromosome\"\n", - " AND (ABS(CASE\n", - " WHEN a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n", - " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n", - " AND a.\"end_pos\" > features_b.\"start_pos\" THEN 0\n", - " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n", - " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n", - " END)) <= 1000000\n", - " ORDER BY ABS(CASE\n", - " WHEN 
a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n", - " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n", - " AND a.\"end_pos\" > features_b.\"start_pos\" THEN 0\n", - " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n", - " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n", - " END)\n", - " LIMIT 5) AS b\n", - "WHERE a.chromosome = 'chr1'\n", - " AND a.start_pos < 500000\n", - "ORDER BY\n", - " a.chromosome,\n", - " a.start_pos,\n", - " distance\n", - "\n", - "================================================================================\n", - "\n", - "Result: 15 total rows (up to 5 neighbors within 50kb per feature in A)\n", - "Number of features from A: 3\n", - "Average neighbors per feature: 5.0\n", - "Max distance found: 731300 bp (should be <= 50000)\n", - "Features with at least one neighbor: 3\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
chromosomea_starta_endb_startb_enddistance
0chr11813681815641866541869645090
1chr118136818156426797926812886415
2chr1181368181564850424850734668860
3chr1181368181564869711870021688147
4chr1181368181564912864913174731300
5chr11866501868461866541869640
6chr118665018684626797926812881133
7chr1186650186846850424850734663578
8chr1186650186846869711870021682865
9chr1186650186846912864913174726018
10chr12679092681052679792681280
11chr126790926810518665418696480945
12chr1267909268105850424850734582319
13chr1267909268105869711870021601606
14chr1267909268105912864913174644759
\n", - "
" - ], - "text/plain": [ - " chromosome a_start a_end b_start b_end distance\n", - "0 chr1 181368 181564 186654 186964 5090\n", - "1 chr1 181368 181564 267979 268128 86415\n", - "2 chr1 181368 181564 850424 850734 668860\n", - "3 chr1 181368 181564 869711 870021 688147\n", - "4 chr1 181368 181564 912864 913174 731300\n", - "5 chr1 186650 186846 186654 186964 0\n", - "6 chr1 186650 186846 267979 268128 81133\n", - "7 chr1 186650 186846 850424 850734 663578\n", - "8 chr1 186650 186846 869711 870021 682865\n", - "9 chr1 186650 186846 912864 913174 726018\n", - "10 chr1 267909 268105 267979 268128 0\n", - "11 chr1 267909 268105 186654 186964 80945\n", - "12 chr1 267909 268105 850424 850734 582319\n", - "13 chr1 267909 268105 869711 870021 601606\n", - "14 chr1 267909 268105 912864 913174 644759" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Define GIQL query - find up to 5 nearest features within 50kb\n", - "# max_distance=1000000 filters out features farther than 50kb\n", - "giql_query = \"\"\"\n", - " SELECT \n", - " a.chromosome,\n", - " a.start_pos AS a_start,\n", - " a.end_pos AS a_end,\n", - " b.start_pos AS b_start,\n", - " b.end_pos AS b_end,\n", - " distance\n", - " FROM features_a a, NEAREST(features_b, k=5, max_distance=1000000) b\n", - " WHERE a.chromosome = 'chr1'\n", - " AND a.start_pos < 500000\n", - " ORDER BY a.chromosome, a.start_pos, distance\n", - "\"\"\"\n", - "\n", - "# Transpile to SQL\n", - "sql = engine.transpile(giql_query)\n", - "print(\"Transpiled SQL:\")\n", - "print(\n", - " sqlparse.format(\n", - " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", - " )\n", - ")\n", - "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", - "\n", - "# Execute with DuckDB via GIQL engine\n", - "cursor = engine.conn.execute(sql)\n", - "result = cursor.df() # Get result as pandas DataFrame\n", - "\n", - "# Analyze distance constraints\n", - "grouped = 
result.groupby([\"a_start\", \"a_end\"]).size()\n", - "print(\n", - " f\"Result: {len(result)} total rows (up to 5 neighbors within 50kb per feature in A)\"\n", - ")\n", - "print(f\"Number of features from A: {len(grouped)}\")\n", - "print(f\"Average neighbors per feature: {grouped.mean():.1f}\")\n", - "print(f\"Max distance found: {result['distance'].max()} bp (should be <= 50000)\")\n", - "print(f\"Features with at least one neighbor: {len(grouped)}\")\n", - "result.head(15)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -3651,79 +3596,6 @@ "\n", "This demonstrates how GIQL provides a high-level, intuitive syntax for genomic interval operations while maintaining compatibility with standard SQL engines through transpilation." ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "# Clean up\n", - "# engine.close()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "9q2y97xvfyj", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Pandas dtypes:\n", - "chromosome str\n", - "start_pos int64\n", - "end_pos int64\n", - "name str\n", - "score int64\n", - "strand str\n", - "signal_value float64\n", - "p_value float64\n", - "q_value float64\n", - "peak int64\n", - "dtype: object\n", - "\n", - "First few rows:\n", - " chromosome start_pos end_pos name score strand signal_value p_value \\\n", - "0 chr17 27381032 27381306 . 1000 . 461.10586 -1.0 \n", - "1 chr7 25565932 25566199 . 1000 . 
457.60096 -1.0 \n", - "\n", - " q_value peak \n", - "0 5.08726 136 \n", - "1 5.08726 133 \n" - ] - } - ], - "source": [ - "import polars as pl\n", - "\n", - "# Load first BED file to check dtypes\n", - "bed_schema = {\n", - " \"chromosome\": pl.Utf8,\n", - " \"start_pos\": pl.Int64,\n", - " \"end_pos\": pl.Int64,\n", - " \"name\": pl.Utf8,\n", - " \"score\": pl.Int64,\n", - " \"strand\": pl.Utf8,\n", - " \"signal_value\": pl.Float64,\n", - " \"p_value\": pl.Float64,\n", - " \"q_value\": pl.Float64,\n", - " \"peak\": pl.Int64,\n", - "}\n", - "\n", - "features_a = pl.read_csv(\n", - " \".ENCFF199YFA.bed\", separator=\"\\t\", has_header=False, schema=bed_schema\n", - ")\n", - "\n", - "# Check pandas conversion\n", - "adf = features_a.to_pandas()\n", - "print(\"Pandas dtypes:\")\n", - "print(adf.dtypes)\n", - "print(\"\\nFirst few rows:\")\n", - "print(adf.head(2))" - ] } ], "metadata": { @@ -3742,7 +3614,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.3" + "version": "3.13.2" } }, "nbformat": 4, From d894f9de82229c358099e6e18cb2b3afd9e6e7db Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Fri, 13 Feb 2026 00:02:57 -0500 Subject: [PATCH 3/3] Add repo button to docs --- docs/conf.py | 8 ++++++++ docs/index.rst | 6 +++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 9a28ad8..5e28805 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -72,3 +72,11 @@ html_theme = "sphinx_book_theme" # html_static_path = ['_static'] # Uncomment when you have custom static files +html_theme_options = { + # "logo": { + # "image_light": "_static/logo-light.svg", + # "image_dark": "_static/logo-dark.svg", + # }, + "repository_url": "https://github.com/abdenlab/giql", + "use_repository_button": True, +} \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 83b2834..91c98d9 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -4,16 +4,16 @@ Genomic Interval Query 
Language (GIQL) .. toctree:: :hidden: - Home + GIQL quickstart -**GIQL** is an extended SQL dialect that allows you to declaratively express genomic interval operations. +**GIQL** (pronounced "JEE-quel", rhymes with "sequel") is an extended SQL dialect that allows you to declaratively express genomic interval operations. See the :doc:`quickstart` to get started. Dialect ------- -GIQL extends the SQL query language with dedicated constructs for these +GIQL extends the SQL query language with dedicated constructs for common tasks, allowing you to declare *what* you want to compute rather than how. Whether you're filtering variants by genomic region, finding overlapping features, or calculating distances between intervals, GIQL