diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml
new file mode 100644
index 0000000..99dea36
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.yaml
@@ -0,0 +1,25 @@
+name: Bug Report
+description: Report something that isn't working correctly.
+labels: ["bug"]
+body:
+  - type: textarea
+    id: summary
+    attributes:
+      label: Summary
+      description: What is the bug? Include steps to reproduce if applicable.
+    validations:
+      required: true
+  - type: textarea
+    id: root-cause
+    attributes:
+      label: Root cause
+      description: What is causing the bug? Include relevant code snippets.
+    validations:
+      required: true
+  - type: textarea
+    id: affected-code
+    attributes:
+      label: Affected code
+      description: Which files, modules, or components are affected?
+    validations:
+      required: false
diff --git a/.github/ISSUE_TEMPLATE/build.yaml b/.github/ISSUE_TEMPLATE/build.yaml
new file mode 100644
index 0000000..6b6490a
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/build.yaml
@@ -0,0 +1,25 @@
+name: Build
+description: Propose a build system or dependency change.
+labels: ["build"]
+body:
+  - type: textarea
+    id: summary
+    attributes:
+      label: Summary
+      description: What build system or dependency change is needed?
+    validations:
+      required: true
+  - type: textarea
+    id: motivation
+    attributes:
+      label: Motivation
+      description: Why is this change needed?
+    validations:
+      required: true
+  - type: textarea
+    id: affected-code
+    attributes:
+      label: Affected code
+      description: Which build files, configs, or dependencies are affected?
+    validations:
+      required: false
diff --git a/.github/ISSUE_TEMPLATE/cicd.yaml b/.github/ISSUE_TEMPLATE/cicd.yaml
new file mode 100644
index 0000000..277c550
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/cicd.yaml
@@ -0,0 +1,25 @@
+name: CI/CD
+description: Propose a CI/CD pipeline change.
+labels: ["cicd"]
+body:
+  - type: textarea
+    id: summary
+    attributes:
+      label: Summary
+      description: What CI/CD change is needed?
+    validations:
+      required: true
+  - type: textarea
+    id: motivation
+    attributes:
+      label: Motivation
+      description: Why is this change needed? What does it improve?
+    validations:
+      required: true
+  - type: textarea
+    id: affected-code
+    attributes:
+      label: Affected code
+      description: Which workflows, pipelines, or config files are affected?
+    validations:
+      required: false
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000..3ba13e0
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1 @@
+blank_issues_enabled: false
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yaml b/.github/ISSUE_TEMPLATE/feature_request.yaml
new file mode 100644
index 0000000..d444640
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.yaml
@@ -0,0 +1,25 @@
+name: Feature Request
+description: Propose a new feature or capability.
+labels: ["feature"]
+body:
+  - type: textarea
+    id: summary
+    attributes:
+      label: Summary
+      description: What is the feature? Describe the desired behavior.
+    validations:
+      required: true
+  - type: textarea
+    id: motivation
+    attributes:
+      label: Motivation
+      description: Why is this feature needed? What problem does it solve?
+    validations:
+      required: true
+  - type: textarea
+    id: affected-code
+    attributes:
+      label: Affected code
+      description: Which files, modules, or components would be affected?
+    validations:
+      required: false
diff --git a/.github/ISSUE_TEMPLATE/refactor.yaml b/.github/ISSUE_TEMPLATE/refactor.yaml
new file mode 100644
index 0000000..3ab2bfc
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/refactor.yaml
@@ -0,0 +1,25 @@
+name: Refactor
+description: Propose a code restructuring without behavior change.
+labels: ["refactor"]
+body:
+  - type: textarea
+    id: summary
+    attributes:
+      label: Summary
+      description: What should be restructured and what does the end state look like?
+    validations:
+      required: true
+  - type: textarea
+    id: motivation
+    attributes:
+      label: Motivation
+      description: Why is this restructuring needed?
+    validations:
+      required: true
+  - type: textarea
+    id: affected-code
+    attributes:
+      label: Affected code
+      description: Which files, modules, or components are affected?
+    validations:
+      required: false
diff --git a/.github/ISSUE_TEMPLATE/test.yaml b/.github/ISSUE_TEMPLATE/test.yaml
new file mode 100644
index 0000000..c417347
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/test.yaml
@@ -0,0 +1,25 @@
+name: Test
+description: Add or improve test coverage or test infrastructure.
+labels: ["test"]
+body:
+  - type: textarea
+    id: summary
+    attributes:
+      label: Summary
+      description: What needs to be tested or what test infrastructure is needed?
+    validations:
+      required: true
+  - type: textarea
+    id: motivation
+    attributes:
+      label: Motivation
+      description: Why is this test work needed? What gap does it fill?
+    validations:
+      required: true
+  - type: textarea
+    id: affected-code
+    attributes:
+      label: Affected code
+      description: Which files, modules, or components are affected?
+    validations:
+      required: false
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 80eac40..a2caa3d 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -6,30 +6,20 @@ version: 2
# Set the OS, Python version and other tools
build:
-  os: "ubuntu-22.04"
+  os: "ubuntu-24.04"
  tools:
    python: "3.11"
+  jobs:
+    post_install:
+      - python -m pip install --group docs
# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/conf.py
-formats:
- - epub
-
-# Specify the Python requirements file
+# Install the project
python:
  install:
-    - requirements: docs/requirements.txt
    - method: pip
      path: .
-# Version management:
-# Documentation is immutable per release. Each git tag (v0.1.0, v0.2.0, etc.) produces
-# a frozen snapshot of the docs as they were at that release.
-#
-# - Tags (v0.1.0, v0.2.0, ...): Immutable doc snapshots
-# - main/master branches: Development version docs (always latest)
-#
-# This ensures historical accuracy: users viewing old version docs see them exactly
-# as they were released, not retrospectively updated.
diff --git a/README.md b/README.md
index 3d84ec7..b51586d 100644
--- a/README.md
+++ b/README.md
@@ -1,154 +1,113 @@
-# GIQL - Genomic Interval Query Language
+# GIQL
-A SQL dialect for genomic range queries with multi-database support.
+
Genomic Interval Query Language (GIQL)
+/JEE-quel/
+
+docs | syntax | transpiler
+
-## Overview
+GIQL is an extended SQL dialect that allows you to declaratively express genomic interval operations.
-GIQL extends SQL with spatial operators for genomic interval queries. It transpiles to standard SQL that works across multiple database backends including DuckDB and SQLite.
+The `giql` Python package transpiles GIQL queries into standard SQL syntax for execution on any database or analytics engine.
-GIQL provides a familiar SQL syntax for bioinformatics workflows, allowing you to express complex genomic range operations without writing intricate SQL expressions. Whether you're filtering variants by genomic region, finding overlapping features, or calculating distances between intervals, GIQL makes these operations intuitive and portable across databases.
-
-## Features
-
-- **SQL-based**: Familiar SQL syntax with genomic extensions
-- **Multi-backend**: Works with DuckDB, SQLite, and more
-- **Spatial operators**: INTERSECTS, CONTAINS, WITHIN for range relationships
-- **Distance operators**: DISTANCE, NEAREST for proximity queries
-- **Aggregation operators**: CLUSTER, MERGE for combining intervals
-- **Set quantifiers**: ANY, ALL for multi-range queries
-- **Transpilation**: Convert GIQL to standard SQL for debugging or external use
+> **Note:** This project is in active development — APIs, syntax, and behavior may change.
## Installation
-### From PyPI
-
-Install the latest stable release:
+To install the transpiler:
```bash
pip install giql
```
-Or the latest release candidate:
-
-```bash
-pip install --pre giql
-```
-
-### From Source
+## Usage (transpilation)
-Clone the repository and install locally:
+The `giql` package transpiles GIQL queries to standard SQL.
-```bash
-# Clone the repository
-git clone https://github.com/abdenlab/giql.git
-cd giql
-
-# Install in development mode
-pip install -e .
+```python
+from giql import transpile
-# Or with development dependencies
-pip install -e ".[dev]"
+sql = transpile(
+ "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'",
+ tables=["peaks"],
+)
+print(sql)
```
-### Building Documentation
-
-To build the documentation locally:
-
-```bash
-cd docs
-
-# Install documentation dependencies
-pip install -r requirements.txt
-
-# Build HTML documentation
-make html
-
-# View the documentation
-# The built docs will be in docs/_build/html/
-# Open docs/_build/html/index.html in your browser
+```sql
+SELECT
+  *
+FROM peaks
+WHERE
+  (
+    "chrom" = 'chr1' AND "start" < 2000 AND "end" > 1000
+  )
```
-## Quick Start
+Each table referenced in a GIQL query exposes a logical genomic "pseudo-column" that maps to its underlying chromosome, start, end, and strand columns. You can customize the column mappings.
```python
-from giql import GIQLEngine
-
-# Create engine with DuckDB backend
-with GIQLEngine(target_dialect="duckdb") as engine:
- # Load genomic data
- engine.load_csv("variants", "variants.csv")
- engine.register_table_schema(
- "variants",
- {
- "id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- },
- genomic_column="interval",
- )
-
- # Query with genomic operators (returns cursor for streaming)
- cursor = engine.execute("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
-
- # Process results lazily
- for row in cursor:
- print(row)
-
- # Or just transpile to SQL without executing
- sql = engine.transpile("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
- print(sql) # See the generated SQL
+from giql import Table, transpile
+
+sql = transpile(
+ "SELECT * FROM variants WHERE position INTERSECTS 'chr1:1000-2000'",
+ tables=[
+ Table(
+ "variants",
+ genomic_col="position",
+ chrom_col="chromosome",
+ start_col="start_pos",
+ end_col="end_pos",
+ )
+ ],
+)
+print(sql)
```
-## Operators at a Glance
-
-### Spatial Relationships
-
-| Operator | Description |
-|----------|-------------|
-| `INTERSECTS` | Returns true when ranges overlap by at least one base pair |
-| `CONTAINS` | Returns true when one range fully contains another |
-| `WITHIN` | Returns true when one range is fully within another |
+The transpiled SQL can be executed by any fast, genome-unaware database or in-memory analytics engine, such as DuckDB.
-### Distance and Proximity
+You can also use [oxbow](https://oxbow.readthedocs.io) to efficiently stream specialized genomics formats into DuckDB.
-| Operator | Description |
-|----------|-------------|
-| `DISTANCE` | Calculate genomic distance between two intervals |
-| `NEAREST` | Find k-nearest genomic features |
-
-### Aggregation
+```python
+import duckdb
+import oxbow as ox
+from giql import transpile
-| Operator | Description |
-|----------|-------------|
-| `CLUSTER` | Assign cluster IDs to overlapping intervals |
-| `MERGE` | Combine overlapping intervals into unified regions |
+conn = duckdb.connect()
-### Set Quantifiers
+# Load a streaming data source as a DuckDB relation
+peaks = ox.from_bed("peaks.bed", bed_schema="bed6+4").to_duckdb(conn)
-| Quantifier | Description |
-|------------|-------------|
-| `ANY` | Match if condition holds for any of the specified ranges |
-| `ALL` | Match if condition holds for all of the specified ranges |
+sql = transpile(
+ "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'",
+ tables=["peaks"],
+)
-## Documentation
+# Execute and return the output as a dataframe
+df = conn.execute(sql).fetchdf()
+```
-For complete documentation, build the docs locally (see above) or visit the hosted documentation.
+## Development
-The documentation includes:
+```bash
+git clone https://github.com/abdenlab/giql.git
+cd giql
+uv sync
+```
-- **Operator Reference**: Detailed documentation for each operator with examples
-- **Recipes**: Common query patterns for intersections, distance calculations, and clustering
-- **Bedtools Migration Guide**: How to replicate bedtools operations with GIQL
-- **Guides**: Performance optimization, multi-backend configuration, and schema mapping
+To build the documentation locally:
-## Development
+```bash
+uv run --group docs sphinx-build docs docs/_build
+# The built docs will be in docs/_build/
+```
-This project is in active development.
+To serve the docs locally with automatic rebuild:
+```bash
+uv run --group docs sphinx-autobuild docs docs/_build
+```
diff --git a/docs/api/index.rst b/docs/api/index.rst
deleted file mode 100644
index a17dc9e..0000000
--- a/docs/api/index.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-API Reference
-=============
-
-This section documents the GIQL Python API.
-
-.. toctree::
- :maxdepth: 2
-
-.. automodule:: giql
- :members:
- :undoc-members:
- :show-inheritance:
diff --git a/docs/conf.py b/docs/conf.py
index 1d38676..9a28ad8 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -25,6 +25,7 @@
"sphinx.ext.viewcode",
"sphinx.ext.intersphinx",
"sphinx.ext.autosummary",
+ "sphinx_design",
]
# Napoleon settings
@@ -69,5 +70,5 @@
# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
-html_theme = "sphinx_rtd_theme"
+html_theme = "sphinx_book_theme"
# html_static_path = ['_static'] # Uncomment when you have custom static files
diff --git a/docs/operators/aggregation-operators.rst b/docs/dialect/aggregation-operators.rst
similarity index 63%
rename from docs/operators/aggregation-operators.rst
rename to docs/dialect/aggregation-operators.rst
index 50d10da..cc3d5ec 100644
--- a/docs/operators/aggregation-operators.rst
+++ b/docs/dialect/aggregation-operators.rst
@@ -1,5 +1,5 @@
-Aggregation Operators
-=====================
+Aggregation
+===========
Aggregation operators combine and cluster genomic intervals. These operators are
essential for reducing complex interval data into summarized regions, such as
@@ -7,7 +7,7 @@ merging overlapping peaks or identifying clusters of related features.
.. contents::
:local:
- :depth: 2
+ :depth: 1
.. _cluster-operator:
@@ -51,7 +51,7 @@ Parameters
~~~~~~~~~~
**interval**
- A genomic column registered with the engine.
+ A genomic column.
**distance** *(optional)*
Maximum gap between intervals to consider them part of the same cluster.
@@ -73,91 +73,81 @@ Examples
Assign cluster IDs to overlapping intervals:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- *,
- CLUSTER(interval) AS cluster_id
- FROM features
- ORDER BY chromosome, start_pos
- """)
+ SELECT
+ *,
+ CLUSTER(interval) AS cluster_id
+ FROM features
+ ORDER BY chrom, start
**Distance-Based Clustering:**
Cluster intervals within 1000bp of each other:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- *,
- CLUSTER(interval, 1000) AS cluster_id
- FROM features
- ORDER BY chromosome, start_pos
- """)
+ SELECT
+ *,
+ CLUSTER(interval, 1000) AS cluster_id
+ FROM features
+ ORDER BY chrom, start
**Strand-Specific Clustering:**
Cluster intervals separately by strand:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- *,
- CLUSTER(interval, stranded=true) AS cluster_id
- FROM features
- ORDER BY chromosome, strand, start_pos
- """)
+ SELECT
+ *,
+ CLUSTER(interval, stranded=true) AS cluster_id
+ FROM features
+ ORDER BY chrom, strand, start
**Analyze Cluster Statistics:**
Count features per cluster:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- WITH clustered AS (
- SELECT
- *,
- CLUSTER(interval) AS cluster_id
- FROM features
- )
+ WITH clustered AS (
SELECT
- chromosome,
- cluster_id,
- COUNT(*) AS feature_count,
- MIN(start_pos) AS cluster_start,
- MAX(end_pos) AS cluster_end
- FROM clustered
- GROUP BY chromosome, cluster_id
- ORDER BY chromosome, cluster_start
- """)
+ *,
+ CLUSTER(interval) AS cluster_id
+ FROM features
+ )
+ SELECT
+ chrom,
+ cluster_id,
+ COUNT(*) AS feature_count,
+ MIN(start) AS cluster_start,
+ MAX(end) AS cluster_end
+ FROM clustered
+ GROUP BY chrom, cluster_id
+ ORDER BY chrom, cluster_start
**Filter by Cluster Size:**
Find regions with multiple overlapping features:
-.. code-block:: python
-
- cursor = engine.execute("""
- WITH clustered AS (
- SELECT
- *,
- CLUSTER(interval) AS cluster_id
- FROM features
- ),
- cluster_sizes AS (
- SELECT cluster_id, COUNT(*) AS size
- FROM clustered
- GROUP BY cluster_id
- )
- SELECT c.*
- FROM clustered c
- INNER JOIN cluster_sizes s ON c.cluster_id = s.cluster_id
- WHERE s.size >= 3
- """)
+.. code-block:: sql
+
+ WITH clustered AS (
+ SELECT
+ *,
+ CLUSTER(interval) AS cluster_id
+ FROM features
+ ),
+ cluster_sizes AS (
+ SELECT cluster_id, COUNT(*) AS size
+ FROM clustered
+ GROUP BY cluster_id
+ )
+ SELECT c.*
+ FROM clustered c
+ INNER JOIN cluster_sizes s ON c.cluster_id = s.cluster_id
+ WHERE s.size >= 3
Backend Compatibility
~~~~~~~~~~~~~~~~~~~~~
@@ -239,7 +229,7 @@ Parameters
~~~~~~~~~~
**interval**
- A genomic column registered with the engine.
+ A genomic column.
**distance** *(optional)*
Maximum gap between intervals to merge. Default: ``0`` (only overlapping
@@ -253,9 +243,9 @@ Return Value
Returns merged interval coordinates:
-- ``chromosome`` - Chromosome of the merged region
-- ``start_pos`` - Start position of the merged region
-- ``end_pos`` - End position of the merged region
+- ``chrom`` - Chromosome of the merged region
+- ``start`` - Start position of the merged region
+- ``end`` - End position of the merged region
- ``strand`` - Strand (if ``stranded=true``)
Examples
@@ -265,108 +255,92 @@ Examples
Merge all overlapping intervals:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT MERGE(interval)
- FROM features
- """)
+ SELECT MERGE(interval)
+ FROM features
- # Returns: chromosome, start_pos, end_pos for each merged region
+ -- Returns: chrom, start, end for each merged region
**Distance-Based Merge:**
Merge intervals within 1000bp of each other:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT MERGE(interval, 1000)
- FROM features
- """)
+ SELECT MERGE(interval, 1000)
+ FROM features
**Strand-Specific Merge:**
Merge intervals separately by strand:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT MERGE(interval, stranded=true)
- FROM features
- """)
+ SELECT MERGE(interval, stranded=true)
+ FROM features
**Merge with Feature Count:**
Count how many features were merged into each region:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- MERGE(interval),
- COUNT(*) AS feature_count
- FROM features
- """)
+ SELECT
+ MERGE(interval),
+ COUNT(*) AS feature_count
+ FROM features
**Merge with Aggregations:**
Calculate statistics for merged regions:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- MERGE(interval),
- COUNT(*) AS feature_count,
- AVG(score) AS avg_score,
- MAX(score) AS max_score
- FROM features
- """)
+ SELECT
+ MERGE(interval),
+ COUNT(*) AS feature_count,
+ AVG(score) AS avg_score,
+ MAX(score) AS max_score
+ FROM features
**Collect Merged Feature Names:**
List the names of features that were merged:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- MERGE(interval),
- STRING_AGG(name, ',') AS feature_names
- FROM features
- """)
+ SELECT
+ MERGE(interval),
+ STRING_AGG(name, ',') AS feature_names
+ FROM features
**Merge by Chromosome:**
Process each chromosome separately (explicit grouping):
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- chromosome,
- MERGE(interval),
- COUNT(*) AS feature_count
- FROM features
- GROUP BY chromosome
- ORDER BY chromosome
- """)
+ SELECT
+ chrom,
+ MERGE(interval),
+ COUNT(*) AS feature_count
+ FROM features
+ GROUP BY chrom
+ ORDER BY chrom
**Calculate Total Coverage:**
Calculate the total base pairs covered after merging:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- WITH merged AS (
- SELECT MERGE(interval) AS merged_pos
- FROM features
- )
- SELECT SUM(end_pos - start_pos) AS total_coverage
- FROM merged
- """)
+ WITH merged AS (
+ SELECT MERGE(interval) AS merged_pos
+ FROM features
+ )
+ SELECT SUM(end - start) AS total_coverage
+ FROM merged
Backend Compatibility
~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/operators/distance-operators.rst b/docs/dialect/distance-operators.rst
similarity index 55%
rename from docs/operators/distance-operators.rst
rename to docs/dialect/distance-operators.rst
index 7ceccf3..216bdcb 100644
--- a/docs/operators/distance-operators.rst
+++ b/docs/dialect/distance-operators.rst
@@ -1,5 +1,5 @@
-Distance and Proximity Operators
-================================
+Distance and Proximity
+======================
Distance and proximity operators calculate genomic distances and find nearest features.
These operators are essential for proximity analysis, such as finding genes near
@@ -7,7 +7,7 @@ regulatory elements or variants near transcription start sites.
.. contents::
:local:
- :depth: 2
+ :depth: 1
.. _distance-operator:
@@ -37,7 +37,7 @@ Parameters
~~~~~~~~~~
**interval_a**
- A genomic column registered with the engine.
+ A genomic column.
**interval_b**
Another genomic column to measure distance to.
@@ -56,52 +56,46 @@ Examples
Calculate distance between peaks and genes:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- p.name AS peak,
- g.name AS gene,
- DISTANCE(p.interval, g.interval) AS distance
- FROM peaks p
- CROSS JOIN genes g
- WHERE p.chromosome = g.chromosome
- ORDER BY p.name, distance
- """)
+ SELECT
+ p.name AS peak,
+ g.name AS gene,
+ DISTANCE(p.interval, g.interval) AS distance
+ FROM peaks p
+ CROSS JOIN genes g
+ WHERE p.chrom = g.chrom
+ ORDER BY p.name, distance
**Filter by Distance:**
Find features within 10kb of each other:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.name, b.name, DISTANCE(a.interval, b.interval) AS dist
- FROM features_a a
- CROSS JOIN features_b b
- WHERE a.chromosome = b.chromosome
- AND DISTANCE(a.interval, b.interval) <= 10000
- """)
+ SELECT a.name, b.name, DISTANCE(a.interval, b.interval) AS dist
+ FROM features_a a
+ CROSS JOIN features_b b
+ WHERE a.chrom = b.chrom
+ AND DISTANCE(a.interval, b.interval) <= 10000
**Identify Overlapping vs. Proximal:**
Distinguish between overlapping and nearby features:
-.. code-block:: python
-
- cursor = engine.execute("""
- SELECT
- p.name,
- g.name,
- CASE
- WHEN DISTANCE(p.interval, g.interval) = 0 THEN 'overlapping'
- WHEN DISTANCE(p.interval, g.interval) <= 1000 THEN 'proximal'
- ELSE 'distant'
- END AS relationship
- FROM peaks p
- CROSS JOIN genes g
- WHERE p.chromosome = g.chromosome
- """)
+.. code-block:: sql
+
+ SELECT
+ p.name,
+ g.name,
+ CASE
+ WHEN DISTANCE(p.interval, g.interval) = 0 THEN 'overlapping'
+ WHEN DISTANCE(p.interval, g.interval) <= 1000 THEN 'proximal'
+ ELSE 'distant'
+ END AS relationship
+ FROM peaks p
+ CROSS JOIN genes g
+ WHERE p.chrom = g.chrom
Backend Compatibility
~~~~~~~~~~~~~~~~~~~~~
@@ -126,7 +120,7 @@ Backend Compatibility
Performance Notes
~~~~~~~~~~~~~~~~~
-- Always include ``WHERE a.chromosome = b.chromosome`` to avoid unnecessary
+- Always include ``WHERE a.chrom = b.chrom`` to avoid unnecessary
cross-chromosome comparisons
- For large datasets, consider pre-filtering by region before calculating distances
- Create indexes on chromosome and position columns for better performance
@@ -219,136 +213,124 @@ Examples
Find the 3 nearest genes for each peak:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- peaks.name AS peak,
- nearest.name AS gene,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest
- ORDER BY peaks.name, nearest.distance
- """)
+ SELECT
+ peaks.name AS peak,
+ nearest.name AS gene,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest
+ ORDER BY peaks.name, nearest.distance
**Standalone Query:**
Find 5 nearest genes to a specific genomic location:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT gene_name, distance
- FROM NEAREST(genes, reference='chr1:1000000-1001000', k=5)
- ORDER BY distance
- """)
+ SELECT gene_name, distance
+ FROM NEAREST(genes, reference='chr1:1000000-1001000', k=5)
+ ORDER BY distance
**Distance-Constrained Search:**
Find nearest features within 100kb:
-.. code-block:: python
-
- cursor = engine.execute("""
- SELECT
- peaks.name,
- nearest.name AS gene,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(
- genes,
- reference=peaks.interval,
- k=5,
- max_distance=100000
- ) AS nearest
- ORDER BY peaks.name, nearest.distance
- """)
+.. code-block:: sql
+
+ SELECT
+ peaks.name,
+ nearest.name AS gene,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(
+ genes,
+ reference=peaks.interval,
+ k=5,
+ max_distance=100000
+ ) AS nearest
+ ORDER BY peaks.name, nearest.distance
**Strand-Specific Nearest Neighbors:**
Find nearest same-strand features:
-.. code-block:: python
-
- cursor = engine.execute("""
- SELECT
- peaks.name,
- nearest.name AS gene,
- nearest.strand,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(
- genes,
- reference=peaks.interval,
- k=3,
- stranded=true
- ) AS nearest
- ORDER BY peaks.name, nearest.distance
- """)
+.. code-block:: sql
+
+ SELECT
+ peaks.name,
+ nearest.name AS gene,
+ nearest.strand,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(
+ genes,
+ reference=peaks.interval,
+ k=3,
+ stranded=true
+ ) AS nearest
+ ORDER BY peaks.name, nearest.distance
**Directional (Upstream/Downstream) Queries:**
Find upstream features using signed distances:
-.. code-block:: python
-
- # Upstream features have negative distances
- cursor = engine.execute("""
- SELECT
- peaks.name,
- nearest.name AS gene,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(
- genes,
- reference=peaks.interval,
- k=10,
- signed=true
- ) AS nearest
- WHERE nearest.distance < 0
- ORDER BY peaks.name, nearest.distance DESC
- """)
-
- # Downstream features have positive distances
- cursor = engine.execute("""
- SELECT
- peaks.name,
- nearest.name AS gene,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(
- genes,
- reference=peaks.interval,
- k=10,
- signed=true
- ) AS nearest
- WHERE nearest.distance > 0
- ORDER BY peaks.name, nearest.distance
- """)
+.. code-block:: sql
+
+ -- Upstream features have negative distances
+ SELECT
+ peaks.name,
+ nearest.name AS gene,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(
+ genes,
+ reference=peaks.interval,
+ k=10,
+ signed=true
+ ) AS nearest
+ WHERE nearest.distance < 0
+ ORDER BY peaks.name, nearest.distance DESC
+
+.. code-block:: sql
+
+ -- Downstream features have positive distances
+ SELECT
+ peaks.name,
+ nearest.name AS gene,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(
+ genes,
+ reference=peaks.interval,
+ k=10,
+ signed=true
+ ) AS nearest
+ WHERE nearest.distance > 0
+ ORDER BY peaks.name, nearest.distance
**Combined Parameters:**
Find nearby same-strand features within distance constraints:
-.. code-block:: python
-
- cursor = engine.execute("""
- SELECT
- peaks.name,
- nearest.name AS gene,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(
- genes,
- reference=peaks.interval,
- k=5,
- max_distance=50000,
- stranded=true,
- signed=true
- ) AS nearest
- WHERE nearest.distance BETWEEN -10000 AND 10000
- ORDER BY peaks.name, ABS(nearest.distance)
- """)
+.. code-block:: sql
+
+ SELECT
+ peaks.name,
+ nearest.name AS gene,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(
+ genes,
+ reference=peaks.interval,
+ k=5,
+ max_distance=50000,
+ stranded=true,
+ signed=true
+ ) AS nearest
+ WHERE nearest.distance BETWEEN -10000 AND 10000
+ ORDER BY peaks.name, ABS(nearest.distance)
Backend Compatibility
~~~~~~~~~~~~~~~~~~~~~
@@ -376,15 +358,13 @@ Performance Notes
- **Chromosome pre-filtering**: NEAREST automatically filters by chromosome for efficiency
- **Use max_distance**: Specifying a maximum distance reduces the search space significantly
- **Limit k**: Only request as many neighbors as you actually need
-- **Create indexes**: Add indexes on ``(chromosome, start_pos, end_pos)`` for better performance
+- **Create indexes**: Add indexes on ``(chrom, start, "end")`` for better performance
-.. code-block:: python
+.. code-block:: sql
- # Create indexes for better NEAREST performance
- engine.conn.execute("""
- CREATE INDEX idx_genes_position
- ON genes (chromosome, start_pos, end_pos)
- """)
+ -- Create indexes for better NEAREST performance
+ CREATE INDEX idx_genes_position
+ ON genes (chrom, start, "end")
Related Operators
~~~~~~~~~~~~~~~~~
diff --git a/docs/operators/index.rst b/docs/dialect/index.rst
similarity index 89%
rename from docs/operators/index.rst
rename to docs/dialect/index.rst
index ce24f17..48e7bb2 100644
--- a/docs/operators/index.rst
+++ b/docs/dialect/index.rst
@@ -1,15 +1,13 @@
-GIQL Operators
-==============
+Operators
+=========
GIQL extends SQL with operators specifically designed for genomic interval queries.
These operators enable powerful spatial reasoning over genomic coordinates without
requiring complex SQL expressions.
-Operators are organized by functionality:
-
-.. contents::
- :local:
- :depth: 1
+Operators are organized by functionality. All operators work across supported
+database backends (DuckDB, SQLite, with PostgreSQL planned). Each operator page
+includes a compatibility table showing backend support status.
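+
+As a quick illustration, a single GIQL predicate stands in for hand-written
+range comparisons (a minimal sketch using the ``transpile`` API shown in the
+README; the generated SQL may differ slightly by dialect):
+
+.. code-block:: python
+
+   from giql import transpile
+
+   # One INTERSECTS predicate expands into plain column comparisons
+   sql = transpile(
+       "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'",
+       tables=["variants"],
+   )
+   print(sql)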
Spatial Relationship Operators
------------------------------
@@ -98,11 +96,6 @@ Apply operators to multiple ranges simultaneously.
See :doc:`quantifiers` for detailed documentation.
-Operator Compatibility
-----------------------
-
-All operators work across supported database backends (DuckDB, SQLite, with PostgreSQL planned).
-Each operator page includes a compatibility table showing backend support status.
.. toctree::
:maxdepth: 2
diff --git a/docs/operators/quantifiers.rst b/docs/dialect/quantifiers.rst
similarity index 61%
rename from docs/operators/quantifiers.rst
rename to docs/dialect/quantifiers.rst
index cffb71d..b10a38b 100644
--- a/docs/operators/quantifiers.rst
+++ b/docs/dialect/quantifiers.rst
@@ -7,7 +7,7 @@ specified ranges in a single query.
.. contents::
:local:
- :depth: 2
+ :depth: 1
.. _any-quantifier:
@@ -47,7 +47,7 @@ Parameters
~~~~~~~~~~
**interval**
- A genomic column registered with the engine.
+ A genomic column.
**ranges**
A comma-separated list of genomic range literals.
@@ -65,60 +65,52 @@ Examples
Find variants in any of several regions of interest:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM variants
- WHERE interval INTERSECTS ANY(
- 'chr1:1000-2000',
- 'chr1:5000-6000',
- 'chr2:1000-3000'
- )
- """)
+ SELECT * FROM variants
+ WHERE interval INTERSECTS ANY(
+ 'chr1:1000-2000',
+ 'chr1:5000-6000',
+ 'chr2:1000-3000'
+ )
**Check Against Gene Promoters:**
Find features overlapping any of a set of promoter regions:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM peaks
- WHERE interval INTERSECTS ANY(
- 'chr1:11869-12869', -- Gene A promoter
- 'chr1:29554-30554', -- Gene B promoter
- 'chr1:69091-70091' -- Gene C promoter
- )
- """)
+ SELECT * FROM peaks
+ WHERE interval INTERSECTS ANY(
+ 'chr1:11869-12869', -- Gene A promoter
+ 'chr1:29554-30554', -- Gene B promoter
+ 'chr1:69091-70091' -- Gene C promoter
+ )
**Combine with Other Filters:**
Filter by multiple regions and additional criteria:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM variants
- WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000')
- AND quality >= 30
- AND filter = 'PASS'
- """)
+ SELECT * FROM variants
+ WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000')
+ AND quality >= 30
+ AND filter = 'PASS'
**Multi-Chromosome Query:**
Query across different chromosomes efficiently:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM features
- WHERE interval INTERSECTS ANY(
- 'chr1:100000-200000',
- 'chr2:100000-200000',
- 'chr3:100000-200000',
- 'chrX:100000-200000'
- )
- """)
+ SELECT * FROM features
+ WHERE interval INTERSECTS ANY(
+ 'chr1:100000-200000',
+ 'chr2:100000-200000',
+ 'chr3:100000-200000',
+ 'chrX:100000-200000'
+ )
Backend Compatibility
~~~~~~~~~~~~~~~~~~~~~
@@ -190,7 +182,7 @@ Parameters
~~~~~~~~~~
**interval**
- A genomic column registered with the engine.
+ A genomic column.
**ranges**
A comma-separated list of genomic range literals.
@@ -208,49 +200,43 @@ Examples
Find genes that contain all specified SNP positions:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM genes
- WHERE interval CONTAINS ALL(
- 'chr1:1500',
- 'chr1:1600',
- 'chr1:1700'
- )
- """)
+ SELECT * FROM genes
+ WHERE interval CONTAINS ALL(
+ 'chr1:1500',
+ 'chr1:1600',
+ 'chr1:1700'
+ )
**Ensure Complete Coverage:**
Find intervals that span a set of required positions:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM features
- WHERE interval CONTAINS ALL(
- 'chr1:10000',
- 'chr1:15000',
- 'chr1:20000'
- )
- """)
+ SELECT * FROM features
+ WHERE interval CONTAINS ALL(
+ 'chr1:10000',
+ 'chr1:15000',
+ 'chr1:20000'
+ )
**Find Overlapping Regions:**
Find features that overlap with all specified windows (useful for finding
features in the intersection of multiple regions):
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM features
- WHERE interval INTERSECTS ALL(
- 'chr1:1000-2000',
- 'chr1:1500-2500'
- )
- """)
+ SELECT * FROM features
+ WHERE interval INTERSECTS ALL(
+ 'chr1:1000-2000',
+ 'chr1:1500-2500'
+ )
- # This finds features that overlap BOTH ranges
- # (i.e., features in the intersection: chr1:1500-2000)
+ -- This finds features that overlap BOTH ranges
+ -- (i.e., features in the intersection: chr1:1500-2000)
Backend Compatibility
~~~~~~~~~~~~~~~~~~~~~
@@ -285,22 +271,6 @@ Related
- :ref:`ANY <any-quantifier>` - Match any range (logical OR)
- :ref:`CONTAINS <contains-operator>` - Base containment operator
-Choosing Between ANY and ALL
-----------------------------
-
-Use **ANY** when you want to find features that match at least one of several criteria:
-
-.. code-block:: python
-
- # Find variants in gene A OR gene B OR gene C
- WHERE interval INTERSECTS ANY('gene_a_region', 'gene_b_region', 'gene_c_region')
-
-Use **ALL** when you want to find features that satisfy all criteria simultaneously:
-
-.. code-block:: python
-
- # Find features that contain ALL of these positions
- WHERE interval CONTAINS ALL('pos1', 'pos2', 'pos3')
Common Patterns
---------------
@@ -309,24 +279,20 @@ Common Patterns
Find features that don't overlap any blacklisted region:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM peaks
- WHERE NOT interval INTERSECTS ANY(
- 'chr1:1000000-2000000', -- Centromere
- 'chr1:5000000-5500000' -- Known artifact region
- )
- """)
+ SELECT * FROM peaks
+ WHERE NOT interval INTERSECTS ANY(
+ 'chr1:1000000-2000000', -- Centromere
+ 'chr1:5000000-5500000' -- Known artifact region
+ )
**Combining ANY and ALL:**
Complex queries can combine both quantifiers:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM features
- WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000')
- AND interval CONTAINS ALL('chr1:1100', 'chr1:1200')
- """)
+ SELECT * FROM features
+ WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000')
+ AND interval CONTAINS ALL('chr1:1100', 'chr1:1200')
diff --git a/docs/operators/spatial-operators.rst b/docs/dialect/spatial-operators.rst
similarity index 75%
rename from docs/operators/spatial-operators.rst
rename to docs/dialect/spatial-operators.rst
index 6b48001..fa1c7be 100644
--- a/docs/operators/spatial-operators.rst
+++ b/docs/dialect/spatial-operators.rst
@@ -1,5 +1,5 @@
-Spatial Relationship Operators
-==============================
+Spatial Relationships
+=====================
Spatial relationship operators test positional relationships between genomic ranges.
These are the core operators for determining whether genomic intervals overlap,
@@ -7,7 +7,7 @@ contain, or are contained within other intervals.
.. contents::
:local:
- :depth: 2
+ :depth: 1
.. _intersects-operator:
@@ -46,7 +46,7 @@ Parameters
~~~~~~~~~~
**interval**
- A genomic column registered with the engine via ``register_table_schema()``.
+ A genomic column from a registered table.
**literal_range**
A string literal specifying a genomic range in the format ``'chromosome:start-end'``.
@@ -66,50 +66,42 @@ Examples
Find all variants that overlap a specific genomic region:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
+ SELECT * FROM variants
+ WHERE interval INTERSECTS 'chr1:1000-2000'
**Column-to-Column Joins:**
Find variants that overlap with any gene:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT v.*, g.name AS gene_name
- FROM variants v
- INNER JOIN genes g ON v.interval INTERSECTS g.interval
- """)
+ SELECT v.*, g.name AS gene_name
+ FROM variants v
+ INNER JOIN genes g ON v.interval INTERSECTS g.interval
**With WHERE Clause:**
Find overlapping features with additional filtering:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT v.*, g.name
- FROM variants v
- INNER JOIN genes g ON v.interval INTERSECTS g.interval
- WHERE v.quality >= 30
- AND g.biotype = 'protein_coding'
- """)
+ SELECT v.*, g.name
+ FROM variants v
+ INNER JOIN genes g ON v.interval INTERSECTS g.interval
+ WHERE v.quality >= 30
+ AND g.biotype = 'protein_coding'
**Left Outer Join:**
Find all variants, with gene information where available:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT v.*, g.name AS gene_name
- FROM variants v
- LEFT JOIN genes g ON v.interval INTERSECTS g.interval
- """)
+ SELECT v.*, g.name AS gene_name
+ FROM variants v
+ LEFT JOIN genes g ON v.interval INTERSECTS g.interval
Backend Compatibility
~~~~~~~~~~~~~~~~~~~~~
@@ -134,7 +126,7 @@ Backend Compatibility
Performance Notes
~~~~~~~~~~~~~~~~~
-- Create indexes on ``(chromosome, start_pos, end_pos)`` for better join performance
+- Create indexes on ``(chrom, start, "end")`` for better join performance
- When joining large tables, consider filtering by chromosome first
- The generated SQL uses efficient range comparison predicates
@@ -183,7 +175,7 @@ Parameters
~~~~~~~~~~
**interval**
- A genomic column registered with the engine.
+ A genomic column.
**literal_range**
A string literal specifying a genomic point or range.
@@ -203,36 +195,30 @@ Examples
Find genes that contain a specific position:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM genes
- WHERE interval CONTAINS 'chr1:1500'
- """)
+ SELECT * FROM genes
+ WHERE interval CONTAINS 'chr1:1500'
**Range Containment:**
Find large features that fully contain smaller features:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT g.name AS gene_name, e.name AS exon_name
- FROM genes g
- INNER JOIN exons e ON g.interval CONTAINS e.interval
- """)
+ SELECT g.name AS gene_name, e.name AS exon_name
+ FROM genes g
+ INNER JOIN exons e ON g.interval CONTAINS e.interval
**Filtering Fully Contained Variants:**
Find variants that are completely within gene boundaries:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT v.*
- FROM variants v
- INNER JOIN genes g ON g.interval CONTAINS v.interval
- """)
+ SELECT v.*
+ FROM variants v
+ INNER JOIN genes g ON g.interval CONTAINS v.interval
Backend Compatibility
~~~~~~~~~~~~~~~~~~~~~
@@ -295,7 +281,7 @@ Parameters
~~~~~~~~~~
**interval**
- A genomic column registered with the engine.
+ A genomic column.
**literal_range**
A string literal specifying the containing range.
@@ -315,24 +301,20 @@ Examples
Find all features within a specific genomic window:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM features
- WHERE interval WITHIN 'chr1:1000000-2000000'
- """)
+ SELECT * FROM features
+ WHERE interval WITHIN 'chr1:1000000-2000000'
**Find Nested Features:**
Find exons that are completely within their parent gene:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT e.*, g.name AS gene_name
- FROM exons e
- INNER JOIN genes g ON e.interval WITHIN g.interval
- """)
+ SELECT e.*, g.name AS gene_name
+ FROM exons e
+ INNER JOIN genes g ON e.interval WITHIN g.interval
Backend Compatibility
~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/reference/syntax-reference.rst b/docs/dialect/syntax-reference.rst
similarity index 78%
rename from docs/reference/syntax-reference.rst
rename to docs/dialect/syntax-reference.rst
index 48cfb14..0082e26 100644
--- a/docs/reference/syntax-reference.rst
+++ b/docs/dialect/syntax-reference.rst
@@ -5,7 +5,7 @@ Quick reference for GIQL syntax and operators.
.. contents::
:local:
- :depth: 2
+ :depth: 1
Genomic Range Literals
----------------------
@@ -238,7 +238,7 @@ Exclusion (NOT IN)
SELECT a.*
FROM table_a a
LEFT JOIN table_b b ON a.interval INTERSECTS b.interval
- WHERE b.chromosome IS NULL
+ WHERE b.chrom IS NULL
Count Overlaps
~~~~~~~~~~~~~~
@@ -248,7 +248,7 @@ Count Overlaps
SELECT a.*, COUNT(b.name) AS overlap_count
FROM table_a a
LEFT JOIN table_b b ON a.interval INTERSECTS b.interval
- GROUP BY a.chromosome, a.start_pos, a.end_pos, ...
+ GROUP BY a.chrom, a.start, a."end", ...
K-Nearest Neighbors
~~~~~~~~~~~~~~~~~~~
@@ -266,7 +266,7 @@ Clustering
SELECT *, CLUSTER(interval) AS cluster_id
FROM table
- ORDER BY chromosome, start_pos
+ ORDER BY chrom, start
Merging
~~~~~~~
@@ -275,55 +275,3 @@ Merging
SELECT MERGE(interval), COUNT(*) AS count
FROM table
-
-Engine Methods
---------------
-
-execute()
-~~~~~~~~~
-
-Execute a GIQL query and return a cursor.
-
-.. code-block:: python
-
- cursor = engine.execute("SELECT * FROM table WHERE interval INTERSECTS 'chr1:1000-2000'")
-
-transpile()
-~~~~~~~~~~~
-
-Convert GIQL to SQL without executing.
-
-.. code-block:: python
-
- sql = engine.transpile("SELECT * FROM table WHERE interval INTERSECTS 'chr1:1000-2000'")
-
-register_table_schema()
-~~~~~~~~~~~~~~~~~~~~~~~
-
-Register a table's schema for genomic operations.
-
-.. code-block:: python
-
- engine.register_table_schema(
- "table_name",
- {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "name": "VARCHAR",
- },
- genomic_column="interval",
- chromosome_column="chromosome", # optional, default: "chromosome"
- start_column="start_pos", # optional, default: "start_pos"
- end_column="end_pos", # optional, default: "end_pos"
- )
-
-load_csv()
-~~~~~~~~~~
-
-Load a CSV file into a table.
-
-.. code-block:: python
-
- engine.load_csv("table_name", "file.csv")
- engine.load_csv("table_name", "file.tsv", delimiter="\t")
diff --git a/docs/guides/index.rst b/docs/guides/index.rst
index c3265be..b7644d1 100644
--- a/docs/guides/index.rst
+++ b/docs/guides/index.rst
@@ -6,27 +6,21 @@ and best practices for using GIQL effectively.
.. toctree::
:maxdepth: 2
+ :hidden:
schema-mapping
- multi-backend
+ engine
performance
- transpilation
-
-Guide Overview
---------------
:doc:`schema-mapping`
Learn how to configure GIQL to work with your genomic data, including
- registering table schemas and mapping logical genomic columns.
+ table configuration and mapping logical genomic columns.
-:doc:`multi-backend`
- Understand GIQL's multi-database support and how to work with different
- backends like DuckDB, SQLite, and PostgreSQL.
+:doc:`engine`
+ Understand how to use GIQL's transpiled SQL with different
+ execution engines like DuckDB, SQLite, and PostgreSQL.
:doc:`performance`
Optimize your GIQL queries for better performance with indexing strategies,
query patterns, and backend-specific tips.
-:doc:`transpilation`
- Understand how GIQL translates queries to SQL, debug query generation,
- and integrate transpiled SQL with external tools.
diff --git a/docs/guides/multi-backend.rst b/docs/guides/multi-backend.rst
deleted file mode 100644
index ecc3799..0000000
--- a/docs/guides/multi-backend.rst
+++ /dev/null
@@ -1,367 +0,0 @@
-Multi-Backend Guide
-===================
-
-GIQL supports multiple database backends, allowing you to run the same genomic
-queries against different database systems. This guide covers backend selection,
-configuration, and backend-specific considerations.
-
-.. contents::
- :local:
- :depth: 2
-
-Supported Backends
-------------------
-
-GIQL currently supports the following database backends:
-
-.. list-table::
- :header-rows: 1
- :widths: 20 20 60
-
- * - Backend
- - Status
- - Best For
- * - DuckDB
- - Full Support
- - Analytics, large datasets, in-memory processing
- * - SQLite
- - Full Support
- - Lightweight, embedded, portable databases
- * - PostgreSQL
- - Planned
- - Production deployments, shared databases
-
-Selecting a Backend
--------------------
-
-DuckDB (Recommended)
-~~~~~~~~~~~~~~~~~~~~
-
-DuckDB is the recommended backend for most use cases. It provides excellent
-performance for analytical queries and handles large genomic datasets efficiently.
-
-.. code-block:: python
-
- from giql import GIQLEngine
-
- # In-memory DuckDB (default)
- with GIQLEngine(target_dialect="duckdb") as engine:
- engine.load_csv("features", "features.bed")
- # ... register schemas and query
-
- # Persistent DuckDB database
- with GIQLEngine(target_dialect="duckdb", db_path="my_data.duckdb") as engine:
- # Data persists between sessions
- pass
-
-**Advantages:**
-
-- Fast analytical query performance
-- Efficient columnar storage
-- Good support for large datasets
-- Rich SQL feature set
-- In-memory and persistent options
-
-**Best for:**
-
-- Interactive analysis
-- Large BED/VCF files
-- Complex aggregations
-- One-time analysis pipelines
-
-SQLite
-~~~~~~
-
-SQLite is a lightweight, embedded database suitable for smaller datasets or
-when portability is important.
-
-.. code-block:: python
-
- # In-memory SQLite
- with GIQLEngine(target_dialect="sqlite") as engine:
- pass
-
- # Persistent SQLite database
- with GIQLEngine(target_dialect="sqlite", db_path="my_data.db") as engine:
- pass
-
-**Advantages:**
-
-- Zero configuration
-- Single-file database
-- Widely compatible
-- Small memory footprint
-
-**Best for:**
-
-- Small to medium datasets
-- Portable analysis
-- Embedded applications
-- Simple workflows
-
-Backend Configuration
----------------------
-
-In-Memory vs Persistent
-~~~~~~~~~~~~~~~~~~~~~~~
-
-Both DuckDB and SQLite support in-memory and persistent modes:
-
-.. code-block:: python
-
- # In-memory (data lost when engine closes)
- with GIQLEngine(target_dialect="duckdb") as engine:
- engine.load_csv("features", "features.bed")
- # Data exists only during this session
-
- # Persistent (data saved to disk)
- with GIQLEngine(target_dialect="duckdb", db_path="analysis.duckdb") as engine:
- engine.load_csv("features", "features.bed")
- # Data persists after engine closes
-
- # Reopen persistent database
- with GIQLEngine(target_dialect="duckdb", db_path="analysis.duckdb") as engine:
- # Previous data is available
- cursor = engine.execute("SELECT * FROM features LIMIT 5")
-
-Connection Options
-~~~~~~~~~~~~~~~~~~
-
-Pass additional connection options to the underlying database:
-
-.. code-block:: python
-
- # DuckDB with custom settings
- with GIQLEngine(
- target_dialect="duckdb",
- db_path="analysis.duckdb",
- read_only=False,
- ) as engine:
- pass
-
-Writing Portable Queries
-------------------------
-
-Query Compatibility
-~~~~~~~~~~~~~~~~~~~
-
-GIQL queries are portable across backends. The same query works on any
-supported database:
-
-.. code-block:: python
-
- query = """
- SELECT a.*, b.name AS gene
- FROM variants a
- JOIN genes b ON a.interval INTERSECTS b.interval
- WHERE a.quality >= 30
- """
-
- # Works on DuckDB
- with GIQLEngine(target_dialect="duckdb") as engine:
- # ... setup ...
- cursor = engine.execute(query)
-
- # Same query works on SQLite
- with GIQLEngine(target_dialect="sqlite") as engine:
- # ... setup ...
- cursor = engine.execute(query)
-
-SQL Dialect Differences
-~~~~~~~~~~~~~~~~~~~~~~~
-
-While GIQL queries are portable, the generated SQL differs between backends.
-Use ``transpile()`` to see the backend-specific SQL:
-
-.. code-block:: python
-
- query = "SELECT * FROM features WHERE interval INTERSECTS 'chr1:1000-2000'"
-
- # DuckDB SQL
- with GIQLEngine(target_dialect="duckdb") as engine:
- engine.register_table_schema("features", {...}, genomic_column="interval")
- print(engine.transpile(query))
-
- # SQLite SQL (may differ slightly)
- with GIQLEngine(target_dialect="sqlite") as engine:
- engine.register_table_schema("features", {...}, genomic_column="interval")
- print(engine.transpile(query))
-
-Backend-Specific Features
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Some SQL features may only be available on certain backends:
-
-.. list-table::
- :header-rows: 1
- :widths: 40 20 20 20
-
- * - Feature
- - DuckDB
- - SQLite
- - Notes
- * - Window functions
- - Yes
- - Yes
- - Full support
- * - CTEs (WITH clause)
- - Yes
- - Yes
- - Full support
- * - LATERAL joins
- - Yes
- - Limited
- - Used by NEAREST
- * - STRING_AGG
- - Yes
- - GROUP_CONCAT
- - Different function names
-
-Migrating Between Backends
---------------------------
-
-Exporting Data
-~~~~~~~~~~~~~~
-
-Export data from one backend for import into another:
-
-.. code-block:: python
-
- # Export from DuckDB
- with GIQLEngine(target_dialect="duckdb", db_path="source.duckdb") as engine:
- cursor = engine.execute("SELECT * FROM features")
- import pandas as pd
- df = pd.DataFrame(cursor.fetchall(),
- columns=[desc[0] for desc in cursor.description])
- df.to_csv("features_export.csv", index=False)
-
- # Import to SQLite
- with GIQLEngine(target_dialect="sqlite", db_path="target.db") as engine:
- engine.load_csv("features", "features_export.csv")
- engine.register_table_schema("features", {...}, genomic_column="interval")
-
-Schema Compatibility
-~~~~~~~~~~~~~~~~~~~~
-
-Ensure schema definitions work across backends:
-
-.. code-block:: python
-
- # Use portable type names
- schema = {
- "chromosome": "VARCHAR", # Works on all backends
- "start_pos": "BIGINT", # Maps to appropriate integer type
- "end_pos": "BIGINT",
- "name": "VARCHAR",
- "score": "FLOAT", # Maps to appropriate float type
- }
-
- # Same schema works on both backends
- for dialect in ["duckdb", "sqlite"]:
- with GIQLEngine(target_dialect=dialect) as engine:
- engine.load_csv("features", "features.csv")
- engine.register_table_schema("features", schema, genomic_column="interval")
-
-Performance Comparison
-----------------------
-
-Backend Performance Characteristics
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. list-table::
- :header-rows: 1
- :widths: 30 35 35
-
- * - Operation
- - DuckDB
- - SQLite
- * - Large table scans
- - Excellent (columnar)
- - Good
- * - Complex joins
- - Excellent
- - Good
- * - Aggregations
- - Excellent
- - Good
- * - Small queries
- - Good
- - Excellent
- * - Memory usage
- - Higher
- - Lower
- * - Startup time
- - Faster
- - Fast
-
-Choosing the Right Backend
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-**Choose DuckDB when:**
-
-- Working with large datasets (millions of features)
-- Running complex analytical queries
-- Performing heavy aggregations
-- Memory is not constrained
-
-**Choose SQLite when:**
-
-- Working with smaller datasets
-- Need maximum portability
-- Memory is constrained
-- Simple query patterns
-
-Using External Connections
---------------------------
-
-Connecting to Existing Databases
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Connect to databases created outside of GIQL:
-
-.. code-block:: python
-
- # Connect to existing DuckDB database
- with GIQLEngine(target_dialect="duckdb", db_path="existing.duckdb") as engine:
- # Register schemas for existing tables
- engine.register_table_schema(
- "my_existing_table",
- {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "name": "VARCHAR",
- },
- genomic_column="interval",
- )
-
- # Query existing data with GIQL operators
- cursor = engine.execute("""
- SELECT * FROM my_existing_table
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
-
-Using Transpiled SQL Externally
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Generate SQL for use with external database connections:
-
-.. code-block:: python
-
- import duckdb
-
- # Get transpiled SQL from GIQL
- with GIQLEngine(target_dialect="duckdb") as engine:
- engine.register_table_schema("features", {...}, genomic_column="interval")
- sql = engine.transpile("""
- SELECT * FROM features
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
-
- # Execute with external connection
- conn = duckdb.connect("my_database.duckdb")
- result = conn.execute(sql).fetchall()
- conn.close()
-
-This is useful when integrating GIQL with existing database workflows or
-when you need more control over the database connection.
diff --git a/docs/guides/performance.rst b/docs/guides/performance.rst
index c0c4e51..019416e 100644
--- a/docs/guides/performance.rst
+++ b/docs/guides/performance.rst
@@ -6,7 +6,7 @@ indexing, query patterns, and backend-specific optimizations.
.. contents::
:local:
- :depth: 2
+ :depth: 1
Understanding Query Performance
-------------------------------
@@ -14,11 +14,11 @@ Understanding Query Performance
How GIQL Queries Execute
~~~~~~~~~~~~~~~~~~~~~~~~
-When you execute a GIQL query:
+When you use GIQL:
1. GIQL parses the query and identifies genomic operators
-2. Operators are expanded into standard SQL predicates
-3. The SQL is sent to the database backend
+2. Operators are expanded into SQL predicates
+3. You execute the SQL on your database backend
4. The database executes the query using its optimizer
Performance depends on both the generated SQL and how the database executes it.
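+
+A minimal sketch of this flow, assuming the ``transpile`` API shown in the
+README and a ``features`` table already loaded into DuckDB:
+
+.. code-block:: python
+
+   import duckdb
+
+   from giql import transpile
+
+   # Steps 1-2: parse the GIQL query and expand operators into SQL predicates
+   sql = transpile(
+       "SELECT * FROM features WHERE interval INTERSECTS 'chr1:1000-2000'",
+       tables=["features"],
+   )
+
+   # Steps 3-4: hand the generated SQL to the backend, which optimizes it
+   conn = duckdb.connect()
+   rows = conn.execute(sql).fetchall()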
@@ -39,19 +39,11 @@ Creating Indexes
Create indexes on genomic columns for faster queries:
-.. code-block:: python
-
- # DuckDB
- engine.conn.execute("""
- CREATE INDEX idx_features_position
- ON features (chromosome, start_pos, end_pos)
- """)
+.. code-block:: sql
- # SQLite
- engine.conn.execute("""
- CREATE INDEX idx_features_position
- ON features (chromosome, start_pos, end_pos)
- """)
+ -- DuckDB or SQLite
+ CREATE INDEX idx_features_position
+ ON features (chrom, start, "end")
Recommended Index Patterns
~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -60,21 +52,21 @@ Recommended Index Patterns
.. code-block:: sql
- CREATE INDEX idx_table_position ON table_name (chromosome, start_pos, end_pos)
+ CREATE INDEX idx_table_position ON table_name (chrom, start, "end")
**For join queries:**
.. code-block:: sql
-- Index both tables involved in joins
- CREATE INDEX idx_variants_position ON variants (chromosome, start_pos, end_pos)
- CREATE INDEX idx_genes_position ON genes (chromosome, start_pos, end_pos)
+ CREATE INDEX idx_variants_position ON variants (chrom, start, "end")
+ CREATE INDEX idx_genes_position ON genes (chrom, start, "end")
**For strand-specific queries:**
.. code-block:: sql
- CREATE INDEX idx_features_strand ON features (chromosome, strand, start_pos, end_pos)
+ CREATE INDEX idx_features_strand ON features (chrom, strand, start, "end")
When to Create Indexes
~~~~~~~~~~~~~~~~~~~~~~
@@ -100,88 +92,55 @@ Pre-filter by Chromosome
Always include chromosome filtering when joining tables:
-.. code-block:: python
+.. code-block:: sql
- # Good: Explicit chromosome filter
- cursor = engine.execute("""
- SELECT a.*, b.name
- FROM features_a a
- JOIN features_b b ON a.interval INTERSECTS b.interval
- WHERE a.chromosome = 'chr1'
- """)
-
- # Also good: Cross-chromosome join with implicit filtering
- # GIQL handles this, but explicit is clearer
- cursor = engine.execute("""
- SELECT a.*, b.name
- FROM features_a a
- JOIN features_b b ON a.interval INTERSECTS b.interval
- AND a.chromosome = b.chromosome
- """)
+ -- Good: Explicit chromosome filter
+ SELECT a.*, b.name
+ FROM features_a a
+ JOIN features_b b ON a.interval INTERSECTS b.interval
+ WHERE a.chrom = 'chr1'
Use Selective Filters Early
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Apply selective filters before joins:
-.. code-block:: python
+.. code-block:: sql
- # Good: Filter before joining
- cursor = engine.execute("""
- WITH filtered_variants AS (
- SELECT * FROM variants
- WHERE quality >= 30 AND filter = 'PASS'
- )
- SELECT f.*, g.name
- FROM filtered_variants f
- JOIN genes g ON f.interval INTERSECTS g.interval
- """)
-
- # Less efficient: Filter after joining
- cursor = engine.execute("""
- SELECT v.*, g.name
- FROM variants v
- JOIN genes g ON v.interval INTERSECTS g.interval
- WHERE v.quality >= 30 AND v.filter = 'PASS'
- """)
+ -- Good: Filter before joining
+ WITH filtered_variants AS (
+ SELECT * FROM variants
+ WHERE quality >= 30 AND filter = 'PASS'
+ )
+ SELECT f.*, g.name
+ FROM filtered_variants f
+ JOIN genes g ON f.interval INTERSECTS g.interval
Limit Result Sets
~~~~~~~~~~~~~~~~~
Use LIMIT for exploratory queries:
-.. code-block:: python
+.. code-block:: sql
- # Good: Limit results during exploration
- cursor = engine.execute("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000000-2000000'
- LIMIT 100
- """)
+ SELECT * FROM variants
+ WHERE interval INTERSECTS 'chr1:1000000-2000000'
+ LIMIT 100
Use DISTINCT Wisely
~~~~~~~~~~~~~~~~~~~
DISTINCT can be expensive. Only use when necessary:
-.. code-block:: python
+.. code-block:: sql
- # Only use DISTINCT when you actually need unique rows
- cursor = engine.execute("""
- SELECT DISTINCT a.*
- FROM features_a a
- JOIN features_b b ON a.interval INTERSECTS b.interval
- """)
-
- # If you just need to check existence, use EXISTS instead
- cursor = engine.execute("""
- SELECT a.*
- FROM features_a a
- WHERE EXISTS (
- SELECT 1 FROM features_b b
- WHERE a.interval INTERSECTS b.interval
- )
- """)
+ -- If you just need to check existence, use EXISTS instead
+ SELECT a.*
+ FROM features_a a
+ WHERE EXISTS (
+ SELECT 1 FROM features_b b
+ WHERE a.interval INTERSECTS b.interval
+ )
NEAREST Query Optimization
--------------------------
@@ -193,35 +152,32 @@ The NEAREST operator can be expensive for large datasets. Optimize with:
**1. Use max_distance to limit search space:**
-.. code-block:: python
+.. code-block:: sql
- # Good: Constrained search
- cursor = engine.execute("""
- SELECT peaks.name, nearest.name, nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(
- genes,
- reference=peaks.interval,
- k=5,
- max_distance=100000 -- Only search within 100kb
- ) AS nearest
- """)
+ SELECT peaks.name, nearest.name, nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(
+ genes,
+ reference=peaks.interval,
+ k=5,
+ max_distance=100000 -- Only search within 100kb
+ ) AS nearest
**2. Request only the k you need:**
-.. code-block:: python
+.. code-block:: sql
- # Good: Request exactly what you need
+ -- Good: Request exactly what you need
NEAREST(genes, reference=peaks.interval, k=3)
- # Wasteful: Request more than needed
+ -- Wasteful: Request more than needed
NEAREST(genes, reference=peaks.interval, k=100)
**3. Index the target table:**
.. code-block:: sql
- CREATE INDEX idx_genes_position ON genes (chromosome, start_pos, end_pos)
+ CREATE INDEX idx_genes_position ON genes (chrom, start, "end")
Merge and Cluster Optimization
------------------------------
@@ -231,34 +187,28 @@ Efficient Clustering
For large datasets, consider pre-sorting:
-.. code-block:: python
+.. code-block:: sql
- # Pre-sort data for clustering
- cursor = engine.execute("""
- WITH sorted AS (
- SELECT * FROM features
- ORDER BY chromosome, start_pos
- )
- SELECT *, CLUSTER(interval) AS cluster_id
- FROM sorted
- """)
+ WITH sorted AS (
+ SELECT * FROM features
+ ORDER BY chrom, start
+ )
+ SELECT *, CLUSTER(interval) AS cluster_id
+ FROM sorted
Efficient Merging
~~~~~~~~~~~~~~~~~
Filter before merging to reduce data volume:
-.. code-block:: python
+.. code-block:: sql
- # Good: Filter first, then merge
- cursor = engine.execute("""
- WITH filtered AS (
- SELECT * FROM features
- WHERE score >= 10
- )
- SELECT MERGE(interval), COUNT(*) AS count
- FROM filtered
- """)
+ WITH filtered AS (
+ SELECT * FROM features
+ WHERE score >= 10
+ )
+ SELECT MERGE(interval), COUNT(*) AS count
+ FROM filtered
Analyzing Query Performance
---------------------------
@@ -266,43 +216,24 @@ Analyzing Query Performance
Using EXPLAIN
~~~~~~~~~~~~~
-Analyze query execution plans:
+Analyze query execution plans by running EXPLAIN on the transpiled SQL:
.. code-block:: python
- # Get the transpiled SQL
- sql = engine.transpile("""
+ from giql import transpile
+
+ sql = transpile(
+ """
SELECT a.*, b.name
FROM variants a
JOIN genes b ON a.interval INTERSECTS b.interval
- """)
-
- # Analyze the execution plan
- cursor = engine.execute(f"EXPLAIN {sql}")
- for row in cursor:
- print(row)
+ """,
+ tables=["variants", "genes"],
+ )
+ # Run EXPLAIN on your database connection
+ # conn.execute(f"EXPLAIN {sql}")
# DuckDB also supports EXPLAIN ANALYZE for actual timing
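+ # conn.execute(f"EXPLAIN ANALYZE {sql}")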
- cursor = engine.execute(f"EXPLAIN ANALYZE {sql}")
-
-Timing Queries
-~~~~~~~~~~~~~~
-
-Measure query execution time:
-
-.. code-block:: python
-
- import time
-
- start = time.time()
- cursor = engine.execute("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000000-2000000'
- """)
- results = cursor.fetchall()
- elapsed = time.time() - start
-
- print(f"Query returned {len(results)} rows in {elapsed:.2f} seconds")
Backend-Specific Tips
---------------------
@@ -314,21 +245,12 @@ DuckDB Optimizations
DuckDB is columnar, so queries that select few columns are faster:
-.. code-block:: python
-
- # Faster: Select only needed columns
- cursor = engine.execute("""
- SELECT chromosome, start_pos, end_pos, name
- FROM features
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
+.. code-block:: sql
- # Slower: Select all columns
- cursor = engine.execute("""
- SELECT *
- FROM features
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
+ -- Faster: Select only needed columns
+ SELECT chrom, start, "end", name
+ FROM features
+ WHERE interval INTERSECTS 'chr1:1000-2000'
**Parallel execution:**
@@ -344,50 +266,13 @@ SQLite Optimizations
-- Include commonly selected columns in the index
CREATE INDEX idx_features_covering
- ON features (chromosome, start_pos, end_pos, name, score)
+ ON features (chrom, start, "end", name, score)
**Analyze tables:**
-.. code-block:: python
-
- # Help SQLite's query planner
- engine.conn.execute("ANALYZE features")
-
-Memory Management
------------------
-
-Streaming Results
-~~~~~~~~~~~~~~~~~
-
-For large result sets, iterate instead of fetching all:
-
-.. code-block:: python
-
- # Good: Stream results
- cursor = engine.execute("SELECT * FROM large_table")
- for row in cursor:
- process(row)
-
- # Memory-intensive: Fetch all at once
- cursor = engine.execute("SELECT * FROM large_table")
- all_rows = cursor.fetchall() # Loads everything into memory
-
-Batch Processing
-~~~~~~~~~~~~~~~~
-
-Process large datasets in batches:
-
-.. code-block:: python
-
- chromosomes = ['chr1', 'chr2', 'chr3', ...] # All chromosomes
+.. code-block:: sql
- for chrom in chromosomes:
- cursor = engine.execute(f"""
- SELECT * FROM features
- WHERE chromosome = '{chrom}'
- AND interval INTERSECTS '{chrom}:1-1000000'
- """)
- process_chromosome(cursor)
+ ANALYZE features
Performance Checklist
---------------------
@@ -396,13 +281,13 @@ Before running large queries, check:
.. code-block:: text
- □ Indexes created on genomic columns
- □ Chromosome filtering included in joins
- □ Selective filters applied early
- □ LIMIT used for exploration
- □ Only necessary columns selected
- □ NEAREST queries use max_distance
- □ Results streamed instead of fetched all at once
+ - Indexes created on genomic columns
+ - Chromosome filtering included in joins
+ - Selective filters applied early
+ - LIMIT used for exploration
+ - Only necessary columns selected
+ - NEAREST queries use max_distance
+ - Results streamed instead of fetched all at once
Quick Wins
~~~~~~~~~~
diff --git a/docs/guides/quickstart.rst b/docs/guides/quickstart.rst
new file mode 100644
index 0000000..ef7c3ae
--- /dev/null
+++ b/docs/guides/quickstart.rst
@@ -0,0 +1,175 @@
+Quick Start
+===========
+
+GIQL provides a familiar SQL syntax for bioinformatics workflows, allowing
+you to express complex genomic range operations without writing intricate
+SQL expressions. GIQL queries read naturally, making your analysis code
+easier to review and share. GIQL operators follow established conventions
+around genomic spatial relationships, so the semantics are familiar and
+predictable.
+
+- **Spatial operators**: INTERSECTS, CONTAINS, WITHIN for range relationships
+- **Distance operators**: DISTANCE, NEAREST for proximity queries
+- **Aggregation operators**: CLUSTER, MERGE for combining intervals
+- **Set quantifiers**: ANY, ALL for multi-range queries
+- **Range parsing**: Understands genomic range strings and coordinate systems
+- **Transpilation**: Converts GIQL to standard SQL-92 compatible output for execution on any backend
+
+Installation
+------------
+
+Install GIQL using pip:
+
+.. code-block:: bash
+
+ pip install giql
+
+Basic Usage
+-----------
+
+Table Configuration
+~~~~~~~~~~~~~~~~~~~
+
+GIQL works with genomic data stored in tables with separate columns for chromosome,
+start position, and end position. The default column names are:
+
+* **chrom**: Chromosome identifier (e.g., 'chr1', 'chr2', 'chrX')
+* **start**: Start position of the genomic interval (0-based, inclusive)
+* **end**: End position of the genomic interval (0-based, exclusive, half-open)
+* **strand** (optional): Strand orientation ('+', '-', or '.')
+
+If your table uses the default column names, you can pass just the table name
+as a string. For custom column names, use a ``Table`` object:
+
+.. code-block:: python
+
+ from giql import Table, transpile
+
+ # Default column names (chrom, start, end, strand)
+ sql = transpile(query, tables=["peaks"])
+
+ # Custom column names
+ sql = transpile(
+ query,
+ tables=[
+ Table(
+ "variants",
+ genomic_col="interval",
+ chrom_col="chromosome",
+ start_col="start_pos",
+ end_col="end_pos",
+ )
+ ],
+ )
+
+After configuration, you can use the genomic pseudo-column (default: ``interval``)
+in your GIQL queries, and the transpiler will automatically expand it into
+comparisons on the physical columns.
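+
+For example (a sketch; the exact formatting of the generated SQL may vary):
+
+.. code-block:: sql
+
+ -- GIQL
+ SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'
+
+ -- Generated SQL
+ SELECT * FROM variants
+ WHERE "chrom" = 'chr1' AND "start" < 2000 AND "end" > 1000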
+
+Query with DuckDB
+~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+ import duckdb
+ from giql import transpile
+
+ sql = transpile(
+ """
+ SELECT * FROM variants
+ WHERE interval INTERSECTS 'chr1:1000-2000'
+ """,
+ tables=["variants"],
+ )
+
+ conn = duckdb.connect()
+ conn.execute("CREATE TABLE variants AS SELECT * FROM read_csv('variants.csv')")
+ df = conn.execute(sql).fetchdf()
+
+Query with SQLite
+~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+ import sqlite3
+ from giql import transpile
+
+ sql = transpile(
+ """
+ SELECT * FROM variants
+ WHERE interval INTERSECTS 'chr1:1000-2000'
+ """,
+ tables=["variants"],
+ )
+
+ conn = sqlite3.connect("data.db")
+ cursor = conn.execute(sql)
+ for row in cursor:
+ print(row)
+
+Spatial Operators
+-----------------
+
+INTERSECTS
+~~~~~~~~~~
+
+Check if genomic ranges overlap:
+
+.. code-block:: sql
+
+ SELECT * FROM variants
+ WHERE interval INTERSECTS 'chr1:1000-2000'
+
+CONTAINS
+~~~~~~~~
+
+Check if a range contains a point or another range:
+
+.. code-block:: sql
+
+ SELECT * FROM variants
+ WHERE interval CONTAINS 'chr1:1500'
+
+WITHIN
+~~~~~~
+
+Check if a range is within another range:
+
+.. code-block:: sql
+
+ SELECT * FROM variants
+ WHERE interval WITHIN 'chr1:1000-5000'
+
+Set Quantifiers
+---------------
+
+ANY
+~~~
+
+Match any of the specified ranges:
+
+.. code-block:: sql
+
+ SELECT * FROM variants
+ WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000')
+
+ALL
+~~~
+
+Match all of the specified ranges:
+
+.. code-block:: sql
+
+ SELECT * FROM variants
+ WHERE interval CONTAINS ALL('chr1:1500', 'chr1:1600')
+
+Column-to-Column Joins
+----------------------
+
+Join tables on genomic position:
+
+.. code-block:: sql
+
+ SELECT v.*, g.name
+ FROM variants v
+ INNER JOIN genes g ON v.interval INTERSECTS g.interval
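+
+When transpiling a join, pass every table that appears in the query:
+
+.. code-block:: python
+
+ from giql import transpile
+
+ sql = transpile(
+ """
+ SELECT v.*, g.name
+ FROM variants v
+ INNER JOIN genes g ON v.interval INTERSECTS g.interval
+ """,
+ tables=["variants", "genes"],
+ )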
diff --git a/docs/guides/schema-mapping.rst b/docs/guides/schema-mapping.rst
index f515695..43c580c 100644
--- a/docs/guides/schema-mapping.rst
+++ b/docs/guides/schema-mapping.rst
@@ -1,20 +1,20 @@
-Schema Mapping Guide
-====================
+Schema Mapping
+==============
This guide explains how to configure GIQL to work with your genomic data by
-registering table schemas and mapping logical genomic columns.
+defining table configurations that map logical genomic columns to physical columns.
.. contents::
:local:
- :depth: 2
+ :depth: 1
Understanding Schema Mapping
----------------------------
GIQL needs to know how your genomic data is structured in order to translate
-genomic operators into SQL. This is done through schema registration, which
-maps a logical "genomic column" (used in your queries) to the physical columns
-in your database tables.
+genomic operators into SQL. This is done through ``Table`` objects, which
+map a logical "genomic column" (used in your queries) to the physical columns
+in your files, data frames, or database tables.
The Core Concept
~~~~~~~~~~~~~~~~
@@ -30,188 +30,126 @@ Behind the scenes, GIQL expands this to actual column comparisons:
.. code-block:: sql
SELECT * FROM variants
- WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000
+ WHERE "chrom" = 'chr1' AND "start" < 2000 AND "end" > 1000
-Schema registration tells GIQL which physical columns (``chromosome``,
-``start_pos``, ``end_pos``) correspond to the logical ``interval`` column.
+The ``Table`` configuration tells GIQL which physical columns (``chrom``,
+``start``, ``end``) correspond to the logical ``interval`` column.
-Registering Table Schemas
--------------------------
+Configuring Tables
+------------------
-Basic Registration
-~~~~~~~~~~~~~~~~~~
+Basic Configuration
+~~~~~~~~~~~~~~~~~~~
-Register a table schema using ``register_table_schema()``:
+For tables that use the default column names (``chrom``, ``start``, ``end``,
+``strand``), pass the table name as a string:
.. code-block:: python
- from giql import GIQLEngine
-
- with GIQLEngine(target_dialect="duckdb") as engine:
- # Load data
- engine.load_csv("variants", "variants.csv")
-
- # Register schema
- engine.register_table_schema(
- "variants", # Table name
- {
- "id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "name": "VARCHAR",
- "quality": "FLOAT",
- },
- genomic_column="interval", # Logical column name for queries
- )
-
- # Now you can use 'interval' in queries
- cursor = engine.execute("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
-
-Required Columns
-~~~~~~~~~~~~~~~~
-
-For schema registration, your table must have columns that map to:
+ from giql import transpile
-- **chromosome**: The chromosome/contig identifier (e.g., 'chr1', 'chrX')
-- **start_pos**: The start position of the genomic interval (0-based, inclusive)
-- **end_pos**: The end position of the genomic interval (0-based, exclusive)
-
-GIQL looks for these column names by default. If your columns have different
-names, see :ref:`custom-column-names`.
-
-Optional Strand Column
-~~~~~~~~~~~~~~~~~~~~~~
+ sql = transpile(
+ """
+ SELECT * FROM variants
+ WHERE interval INTERSECTS 'chr1:1000-2000'
+ """,
+ tables=["variants"],
+ )
-If your data includes strand information, include it in the schema:
+Default Columns
+~~~~~~~~~~~~~~~
-.. code-block:: python
+GIQL uses these default column names:
- engine.register_table_schema(
- "features",
- {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "strand": "VARCHAR", # '+', '-', or '.'
- "name": "VARCHAR",
- },
- genomic_column="interval",
- )
+- **chrom**: The chromosome/contig identifier (e.g., 'chr1', 'chrX')
+- **start**: The start position of the genomic interval (0-based, inclusive)
+- **end**: The end position of the genomic interval (0-based, exclusive)
+- **strand**: Strand orientation ('+', '-', or '.'), optional
-The strand column enables strand-specific operations in operators like
-CLUSTER and NEAREST.
+The default genomic pseudo-column name is ``interval``.
.. _custom-column-names:
Custom Column Names
~~~~~~~~~~~~~~~~~~~
-If your table uses different column names for genomic coordinates, specify
-the mapping explicitly:
+If your table uses different column names, create a ``Table`` object with
+the mapping:
.. code-block:: python
- engine.register_table_schema(
- "my_table",
- {
- "chrom": "VARCHAR", # Your chromosome column
- "chromStart": "BIGINT", # Your start column (UCSC-style)
- "chromEnd": "BIGINT", # Your end column
- "name": "VARCHAR",
- },
- genomic_column="interval",
- chromosome_column="chrom", # Map to your column name
- start_column="chromStart", # Map to your column name
- end_column="chromEnd", # Map to your column name
+ from giql import Table, transpile
+
+ sql = transpile(
+ """
+ SELECT * FROM my_table
+ WHERE interval INTERSECTS 'chr1:1000-2000'
+ """,
+ tables=[
+ Table(
+ "my_table",
+ chrom_col="chrom", # Your chromosome column
+ start_col="chromStart", # Your start column (UCSC-style)
+ end_col="chromEnd", # Your end column
+ )
+ ],
)
Multiple Tables
---------------
-Register Multiple Tables
-~~~~~~~~~~~~~~~~~~~~~~~~
+Configuring Multiple Tables
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Register all tables that will participate in genomic queries:
+Pass all tables that participate in genomic queries:
.. code-block:: python
- with GIQLEngine(target_dialect="duckdb") as engine:
- # Load data files
- engine.load_csv("variants", "variants.bed")
- engine.load_csv("genes", "genes.bed")
- engine.load_csv("regulatory", "regulatory.bed")
-
- # Define common schema
- bed_schema = {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "name": "VARCHAR",
- "score": "FLOAT",
- "strand": "VARCHAR",
- }
-
- # Register each table
- for table in ["variants", "genes", "regulatory"]:
- engine.register_table_schema(
- table,
- bed_schema,
- genomic_column="interval",
- )
+ from giql import transpile
- # Now you can join tables using genomic operators
- cursor = engine.execute("""
- SELECT v.*, g.name AS gene_name
- FROM variants v
- JOIN genes g ON v.interval INTERSECTS g.interval
- """)
+ # Tables with default column names
+ sql = transpile(
+ """
+ SELECT v.*, g.name AS gene_name
+ FROM variants v
+ JOIN genes g ON v.interval INTERSECTS g.interval
+ """,
+ tables=["variants", "genes"],
+ )
Different Schemas Per Table
~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Tables can have different schemas and even different genomic column names:
+Tables can have different column names and even different genomic column names.
+Mix strings (for default columns) with ``Table`` objects (for custom columns):
.. code-block:: python
- # Variants table with VCF-style columns
- engine.register_table_schema(
- "variants",
- {
- "CHROM": "VARCHAR",
- "POS": "BIGINT",
- "END": "BIGINT",
- "ID": "VARCHAR",
- "QUAL": "FLOAT",
- },
- genomic_column="var_interval",
- chromosome_column="CHROM",
- start_column="POS",
- end_column="END",
- )
-
- # Genes table with BED-style columns
- engine.register_table_schema(
- "genes",
- {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "gene_name": "VARCHAR",
- "strand": "VARCHAR",
- },
- genomic_column="gene_interval",
- )
+ from giql import Table, transpile
- # Query using different genomic column names
- cursor = engine.execute("""
+ sql = transpile(
+ """
SELECT v.ID, g.gene_name
FROM variants v
JOIN genes g ON v.var_interval INTERSECTS g.gene_interval
- """)
+ """,
+ tables=[
+ # VCF-style columns
+ Table(
+ "variants",
+ genomic_col="var_interval",
+ chrom_col="CHROM",
+ start_col="POS",
+ end_col="END",
+ strand_col=None,
+ ),
+ # BED-style columns (defaults)
+ Table(
+ "genes",
+ genomic_col="gene_interval",
+ ),
+ ],
+ )
Coordinate Systems
------------------
@@ -219,7 +157,7 @@ Coordinate Systems
Understanding BED Coordinates
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-GIQL uses the BED coordinate convention:
+GIQL uses the BED coordinate convention by default:
- **0-based start**: The first base of a chromosome is position 0
- **Half-open intervals**: Start is inclusive, end is exclusive
@@ -227,35 +165,25 @@ GIQL uses the BED coordinate convention:
Example: An interval ``chr1:100-200`` covers bases 100 through 199 (100 bases total).
-Converting from 1-Based Coordinates
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Working with 1-Based Coordinates
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-If your data uses 1-based coordinates (like VCF or GFF), convert when loading:
+If your data uses 1-based coordinates (like VCF or GFF), configure the
+``Table`` accordingly:
.. code-block:: python
- import pandas as pd
-
- # Load 1-based data
- df = pd.read_csv("variants.vcf", sep="\t")
-
- # Convert to 0-based
- df['start_pos'] = df['POS'] - 1 # Convert 1-based to 0-based
- df['end_pos'] = df['POS'] # For SNPs, end = start + 1
+ from giql import Table, transpile
- # Load into engine
- engine.conn.execute("CREATE TABLE variants AS SELECT * FROM df")
-
- # Register schema
- engine.register_table_schema(
- "variants",
- {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- # ... other columns
- },
- genomic_column="interval",
+ sql = transpile(
+ query,
+ tables=[
+ Table(
+ "variants",
+ coordinate_system="1based",
+ interval_type="closed",
+ )
+ ],
)
Working with Point Features
@@ -267,144 +195,9 @@ For point features (like SNPs), create an interval of length 1:
# For a SNP at position 1000 (1-based)
# 0-based interval: [999, 1000)
- start_pos = 999
- end_pos = 1000
-
-Data Types
-----------
-
-Recommended Column Types
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-For optimal performance, use appropriate data types:
-
-.. list-table::
- :header-rows: 1
- :widths: 25 25 50
-
- * - Column
- - Recommended Type
- - Notes
- * - chromosome
- - VARCHAR
- - String type for chromosome names
- * - start_pos
- - BIGINT
- - 64-bit integer for large genomes
- * - end_pos
- - BIGINT
- - 64-bit integer for large genomes
- * - strand
- - VARCHAR(1) or CHAR(1)
- - Single character: '+', '-', '.'
- * - score
- - FLOAT or DOUBLE
- - Numeric scores
- * - name
- - VARCHAR
- - Feature identifiers
-
-Type Compatibility
-~~~~~~~~~~~~~~~~~~
-
-GIQL schemas use SQL type names. Common mappings:
-
-.. list-table::
- :header-rows: 1
- :widths: 30 35 35
-
- * - GIQL Schema Type
- - DuckDB Type
- - SQLite Type
- * - INTEGER
- - INTEGER
- - INTEGER
- * - BIGINT
- - BIGINT
- - INTEGER
- * - VARCHAR
- - VARCHAR
- - TEXT
- * - FLOAT
- - FLOAT
- - REAL
- * - DOUBLE
- - DOUBLE
- - REAL
-
-Loading Data
-------------
-
-From CSV Files
-~~~~~~~~~~~~~~
-
-Load CSV files directly:
-
-.. code-block:: python
-
- engine.load_csv("features", "features.csv")
-
- # With custom options
- engine.load_csv(
- "features",
- "features.tsv",
- delimiter="\t",
- header=True,
- )
-
-From Pandas DataFrames
-~~~~~~~~~~~~~~~~~~~~~~
-
-Load data from pandas:
-
-.. code-block:: python
-
- import pandas as pd
-
- df = pd.read_csv("features.bed", sep="\t", header=None,
- names=["chromosome", "start_pos", "end_pos", "name"])
-
- # Register the DataFrame as a table
- engine.conn.execute("CREATE TABLE features AS SELECT * FROM df")
-
- # Then register the schema
- engine.register_table_schema(
- "features",
- {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "name": "VARCHAR",
- },
- genomic_column="interval",
- )
+ start = 999
+ end = 1000
-From Existing Database Tables
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-If tables already exist in your database, just register their schemas:
-
-.. code-block:: python
-
- # Connect to existing database
- with GIQLEngine(target_dialect="duckdb", db_path="my_database.duckdb") as engine:
- # Register schemas for existing tables
- engine.register_table_schema(
- "existing_table",
- {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "name": "VARCHAR",
- },
- genomic_column="interval",
- )
-
- # Query existing data
- cursor = engine.execute("""
- SELECT * FROM existing_table
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
Troubleshooting
---------------
@@ -414,32 +207,17 @@ Common Issues
**"Unknown column" errors:**
-- Ensure the table schema is registered before querying
-- Check that the genomic column name in your query matches the registered name
-- Verify column names in the schema match actual table columns
+- Ensure the table is included in the ``tables`` parameter
+- Check that the genomic column name in your query matches the configured name
+- Verify column names in the ``Table`` object match actual table columns
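+
+A quick way to check the configuration is to print the transpiled SQL and
+confirm the operator was expanded (reusing the ``transpile`` API shown above):
+
+.. code-block:: python
+
+ from giql import transpile
+
+ sql = transpile(
+ "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'",
+ tables=["variants"],
+ )
+ print(sql)
+ # Should show comparisons on chrom, start, and "end"
+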
**Incorrect results:**
- Verify your coordinate system (0-based vs 1-based)
-- Check that start_pos < end_pos for all intervals
+- Check that start < end for all intervals
- Ensure chromosome names match between tables (e.g., 'chr1' vs '1')
**Performance issues:**
- See the :doc:`performance` guide for optimization tips
-- Consider adding indexes on genomic columns
-
-Verifying Schema Registration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Check that schemas are registered correctly:
-
-.. code-block:: python
-
- # After registration, test with a simple query
- sql = engine.transpile("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
- print(sql)
- # Should show expanded SQL with chromosome, start_pos, end_pos comparisons
+- Consider adding indexes on genomic columns
\ No newline at end of file
diff --git a/docs/guides/transpilation.rst b/docs/guides/transpilation.rst
deleted file mode 100644
index bd4c24a..0000000
--- a/docs/guides/transpilation.rst
+++ /dev/null
@@ -1,417 +0,0 @@
-Transpilation Guide
-===================
-
-GIQL works by transpiling genomic queries into standard SQL. This guide explains
-how transpilation works, how to debug query generation, and how to use transpiled
-SQL with external tools.
-
-.. contents::
- :local:
- :depth: 2
-
-How Transpilation Works
------------------------
-
-The Transpilation Process
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-When you write a GIQL query:
-
-.. code-block:: sql
-
- SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'
-
-GIQL performs these steps:
-
-1. **Parse**: Parse the SQL to identify GIQL-specific operators
-2. **Expand**: Replace genomic operators with standard SQL predicates
-3. **Generate**: Produce SQL for the target database dialect
-
-The result is standard SQL:
-
-.. code-block:: sql
-
- SELECT * FROM variants
- WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000
-
-Operator Expansion
-~~~~~~~~~~~~~~~~~~
-
-Each GIQL operator expands to specific SQL patterns:
-
-**INTERSECTS** expands to range overlap checks:
-
-.. code-block:: sql
-
- -- GIQL
- a.interval INTERSECTS b.interval
-
- -- SQL (same chromosome, overlapping ranges)
- a.chromosome = b.chromosome
- AND a.start_pos < b.end_pos
- AND a.end_pos > b.start_pos
-
-**CONTAINS** expands to containment checks:
-
-.. code-block:: sql
-
- -- GIQL
- a.interval CONTAINS b.interval
-
- -- SQL
- a.chromosome = b.chromosome
- AND a.start_pos <= b.start_pos
- AND a.end_pos >= b.end_pos
-
-**DISTANCE** expands to gap calculations:
-
-.. code-block:: sql
-
- -- GIQL
- DISTANCE(a.interval, b.interval)
-
- -- SQL (simplified)
- CASE
- WHEN a.chromosome != b.chromosome THEN NULL
- WHEN a.end_pos <= b.start_pos THEN b.start_pos - a.end_pos
- WHEN b.end_pos <= a.start_pos THEN a.start_pos - b.end_pos
- ELSE 0
- END
-
-Using the Transpile Method
---------------------------
-
-Basic Transpilation
-~~~~~~~~~~~~~~~~~~~
-
-Use ``transpile()`` to see generated SQL without executing:
-
-.. code-block:: python
-
- from giql import GIQLEngine
-
- with GIQLEngine(target_dialect="duckdb") as engine:
- engine.register_table_schema(
- "variants",
- {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- },
- genomic_column="interval",
- )
-
- sql = engine.transpile("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
-
- print(sql)
- # Output: SELECT * FROM variants
- # WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000
-
-Transpiling Complex Queries
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Transpilation works with all GIQL features:
-
-.. code-block:: python
-
- # Join query
- sql = engine.transpile("""
- SELECT v.*, g.name AS gene_name
- FROM variants v
- JOIN genes g ON v.interval INTERSECTS g.interval
- WHERE v.quality >= 30
- """)
- print(sql)
-
- # NEAREST query
- sql = engine.transpile("""
- SELECT peaks.name, nearest.name, nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=5) AS nearest
- """)
- print(sql)
-
- # Aggregation query
- sql = engine.transpile("""
- SELECT MERGE(interval), COUNT(*) AS count
- FROM features
- """)
- print(sql)
-
-Debugging with Transpilation
-----------------------------
-
-Understanding Query Expansion
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Use transpilation to understand what GIQL does:
-
-.. code-block:: python
-
- # See how ANY quantifier expands
- sql = engine.transpile("""
- SELECT * FROM variants
- WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000')
- """)
- print(sql)
- # Shows the OR conditions for each range
-
- # See how join conditions expand
- sql = engine.transpile("""
- SELECT a.*, b.name
- FROM features_a a
- JOIN features_b b ON a.interval INTERSECTS b.interval
- """)
- print(sql)
- # Shows the full range comparison predicates
-
-Verbose Mode
-~~~~~~~~~~~~
-
-Enable verbose mode for detailed transpilation information:
-
-.. code-block:: python
-
- with GIQLEngine(target_dialect="duckdb", verbose=True) as engine:
- engine.register_table_schema("variants", {...}, genomic_column="interval")
-
- # Transpilation will print detailed information
- sql = engine.transpile("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
-
- # Execution also shows transpilation details
- cursor = engine.execute("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
-
-Troubleshooting Transpilation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-**Query not expanding correctly:**
-
-.. code-block:: python
-
- # Check that schema is registered
- sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'")
- if "interval INTERSECTS" in sql:
- print("Schema not registered for 'variants' table")
-
-**Wrong column names in output:**
-
-.. code-block:: python
-
- # Verify column mapping
- engine.register_table_schema(
- "variants",
- {...},
- genomic_column="interval",
- chromosome_column="chrom", # Check these match your table
- start_column="start",
- end_column="end",
- )
-
-Comparing Dialects
-------------------
-
-Same Query, Different SQL
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-See how the same query translates for different backends:
-
-.. code-block:: python
-
- query = """
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """
-
- schema = {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- }
-
- # DuckDB
- with GIQLEngine(target_dialect="duckdb") as engine:
- engine.register_table_schema("variants", schema, genomic_column="interval")
- print("DuckDB SQL:")
- print(engine.transpile(query))
- print()
-
- # SQLite
- with GIQLEngine(target_dialect="sqlite") as engine:
- engine.register_table_schema("variants", schema, genomic_column="interval")
- print("SQLite SQL:")
- print(engine.transpile(query))
-
-Dialect-Specific Differences
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Some queries may generate different SQL for different dialects:
-
-- String functions may use different names
-- Type casting syntax may vary
-- Window function support may differ
-
-GIQL handles these differences automatically, but understanding them helps
-when debugging or integrating with external tools.
-
-Using Transpiled SQL Externally
--------------------------------
-
-With External Database Connections
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Use transpiled SQL with your own database connections:
-
-.. code-block:: python
-
- import duckdb
-
- # Generate SQL using GIQL
- with GIQLEngine(target_dialect="duckdb") as giql_engine:
- giql_engine.register_table_schema("variants", {...}, genomic_column="interval")
- sql = giql_engine.transpile("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
-
- # Execute with external connection
- conn = duckdb.connect("my_database.duckdb")
- result = conn.execute(sql).fetchall()
- conn.close()
-
-With ORMs and Query Builders
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Integrate transpiled SQL with SQLAlchemy or other ORMs:
-
-.. code-block:: python
-
- from sqlalchemy import create_engine, text
-
- # Generate SQL
- with GIQLEngine(target_dialect="duckdb") as giql_engine:
- giql_engine.register_table_schema("variants", {...}, genomic_column="interval")
- sql = giql_engine.transpile("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
-
- # Execute with SQLAlchemy
- sa_engine = create_engine("duckdb:///my_database.duckdb")
- with sa_engine.connect() as conn:
- result = conn.execute(text(sql))
- for row in result:
- print(row)
-
-Building SQL Pipelines
-~~~~~~~~~~~~~~~~~~~~~~
-
-Use transpilation in data pipelines:
-
-.. code-block:: python
-
- def build_intersection_query(table_a, table_b, region):
- """Generate SQL for intersection query."""
- with GIQLEngine(target_dialect="duckdb") as engine:
- engine.register_table_schema(table_a, {...}, genomic_column="interval")
- engine.register_table_schema(table_b, {...}, genomic_column="interval")
-
- return engine.transpile(f"""
- SELECT a.*, b.name
- FROM {table_a} a
- JOIN {table_b} b ON a.interval INTERSECTS b.interval
- WHERE a.interval INTERSECTS '{region}'
- """)
-
- # Use in pipeline
- sql = build_intersection_query("variants", "genes", "chr1:1000000-2000000")
- # Execute sql with your preferred method
-
-Saving Queries
-~~~~~~~~~~~~~~
-
-Save transpiled SQL for documentation or reuse:
-
-.. code-block:: python
-
- # Generate and save SQL
- with GIQLEngine(target_dialect="duckdb") as engine:
- engine.register_table_schema("variants", {...}, genomic_column="interval")
-
- sql = engine.transpile("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
-
- with open("query.sql", "w") as f:
- f.write(sql)
-
- # Later, execute saved SQL
- with open("query.sql") as f:
- sql = f.read()
-
- conn = duckdb.connect("database.duckdb")
- result = conn.execute(sql).fetchall()
-
-Advanced Transpilation
-----------------------
-
-Parameterized Queries
-~~~~~~~~~~~~~~~~~~~~~
-
-Build queries with parameters:
-
-.. code-block:: python
-
- def query_region(engine, chrom, start, end):
- """Query a parameterized region."""
- region = f"{chrom}:{start}-{end}"
- return engine.execute(f"""
- SELECT * FROM variants
- WHERE interval INTERSECTS '{region}'
- """)
-
- # Use with different regions
- cursor = query_region(engine, "chr1", 1000000, 2000000)
- cursor = query_region(engine, "chr2", 5000000, 6000000)
-
-Dynamic Query Building
-~~~~~~~~~~~~~~~~~~~~~~
-
-Build queries programmatically:
-
-.. code-block:: python
-
- def build_multi_table_query(tables, target_region):
- """Build a query that unions results from multiple tables."""
- union_parts = []
- for table in tables:
- union_parts.append(f"""
- SELECT *, '{table}' AS source FROM {table}
- WHERE interval INTERSECTS '{target_region}'
- """)
-
- query = " UNION ALL ".join(union_parts)
- return engine.transpile(query)
-
-Inspecting the AST
-~~~~~~~~~~~~~~~~~~
-
-For advanced debugging, you can inspect the parsed query:
-
-.. code-block:: python
-
- # GIQL uses sqlglot internally
- # The transpiled SQL shows the final result
- sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'")
-
- # For deep debugging, examine the generated SQL structure
- print(sql)
diff --git a/docs/index.rst b/docs/index.rst
index 9918a00..417faad 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,140 +1,56 @@
-GIQL - Genomic Interval Query Language
+Genomic Interval Query Language (GIQL)
======================================
-**GIQL** is a SQL dialect for genomic range queries with multi-database support.
-
-Genomic analysis often requires repetitive, complex SQL patterns to express simple operations like finding overlapping intervals or merging features. GIQL extends SQL with dedicated operators for these common tasks, so you can declaratively express *what* you want to compute without getting lost in SQL boilerplate. GIQL queries read naturally, even without SQL expertise - this clarity makes your analysis code easier to review and share. Best of all, GIQL queries work across DuckDB, SQLite, PostgreSQL, and other databases, so you're never locked into a specific engine and can choose the tool that fits your use case. Finally, GIQL operators follow established conventions from tools like bedtools, so the semantics are familiar and predictable.
-
.. toctree::
- :maxdepth: 2
- :caption: Getting Started
+ :hidden:
- quickstart
+ Home <self>
+ guides/quickstart
-.. toctree::
- :maxdepth: 2
- :caption: Operator Reference
+**GIQL** is an extended SQL dialect that allows you to declaratively express genomic interval operations.
- operators/index
+See the :doc:`guides/quickstart` to get started.
+
+Dialect
+-------
+GIQL extends the SQL query language with dedicated constructs for common
+genomic interval operations, allowing you to declare *what* you want to
+compute rather than *how*. Whether you're filtering variants by genomic region, finding
+overlapping features, or calculating distances between intervals, GIQL
+makes these operations intuitive and portable.
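+
+For example, finding variants that overlap a region is a single declarative
+filter:
+
+.. code-block:: sql
+
+ SELECT * FROM variants
+ WHERE interval INTERSECTS 'chr1:1000-2000'
+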
.. toctree::
- :maxdepth: 2
- :caption: Guides
+ :maxdepth: 1
+ :caption: Dialect
- guides/index
+ dialect/index
+ dialect/syntax-reference
-.. toctree::
- :maxdepth: 2
- :caption: Recipes
-
- recipes/index
+Transpilation
+-------------
+The ``giql`` package *transpiles* queries written in GIQL to regular SQL
+for use in existing database systems and analytics engines.
.. toctree::
- :maxdepth: 2
- :caption: Reference
-
- reference/syntax-reference
- api/index
-
-Quick Start
------------
-
-Install GIQL:
-
-.. code-block:: bash
-
- pip install giql
-
-Basic usage:
-
-.. code-block:: python
-
- from giql import GIQLEngine
-
- # Create engine with DuckDB backend
- with GIQLEngine(target_dialect="duckdb") as engine:
- # Load genomic data
- engine.load_csv("variants", "variants.csv")
- engine.register_table_schema(
- "variants",
- {
- "id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- },
- genomic_column="interval",
- )
-
- # Query with genomic operators (returns cursor for streaming)
- cursor = engine.execute("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
-
- # Process results
- for row in cursor:
- print(row)
+ :maxdepth: 1
+ :caption: Transpilation
- # Or just transpile to SQL without executing
- sql = engine.transpile("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
- print(sql) # See the generated SQL
+ transpilation/index
+ transpilation/execution
+ transpilation/api-reference
-Features
---------
-* **SQL-based**: Familiar SQL syntax with genomic extensions
-* **Multi-backend**: Works with DuckDB, SQLite, and more
-* **Spatial operators**: INTERSECTS, CONTAINS, WITHIN, DISTANCE, NEAREST
-* **Aggregation operators**: CLUSTER, MERGE for combining intervals
-* **Set quantifiers**: ANY, ALL for multi-range queries
-* **Column-to-column joins**: Join tables on genomic position
-* **Transpilation**: Convert GIQL to standard SQL for debugging or external use
+Learn more
+----------
+See the following guides to learn how to use GIQL effectively:
-Operators at a Glance
----------------------
-
-**Spatial Relationships:**
-
-.. code-block:: sql
-
- -- Find overlapping features
- WHERE interval INTERSECTS 'chr1:1000-2000'
-
- -- Find containing/contained features
- WHERE gene.interval CONTAINS variant.interval
-
-**Distance and Proximity:**
-
-.. code-block:: sql
-
- -- Calculate distance between intervals
- SELECT DISTANCE(a.interval, b.interval) AS dist
-
- -- Find k-nearest neighbors
- FROM peaks CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=5)
-
-**Aggregation:**
-
-.. code-block:: sql
-
- -- Cluster overlapping intervals
- SELECT *, CLUSTER(interval) AS cluster_id FROM features
-
- -- Merge overlapping intervals
- SELECT MERGE(interval) FROM features
-
-**Set Quantifiers:**
-
-.. code-block:: sql
+.. toctree::
+ :maxdepth: 1
+ :caption: Guides and Recipes
- -- Match any of multiple regions
- WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000')
+ guides/index
+ recipes/index
-See :doc:`operators/index` for complete operator documentation.
Indices and tables
==================
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
deleted file mode 100644
index 9560c34..0000000
--- a/docs/quickstart.rst
+++ /dev/null
@@ -1,228 +0,0 @@
-Quick Start
-===========
-
-Installation
-------------
-
-Install GIQL using pip:
-
-.. code-block:: bash
-
- pip install giql
-
-Or with optional dependencies:
-
-.. code-block:: bash
-
- pip install giql[duckdb] # For DuckDB support
-
-Basic Usage
------------
-
-Expected Schema
-~~~~~~~~~~~~~~~
-
-GIQL works with genomic data stored in tables with separate columns for chromosome,
-start position, and end position. The typical schema includes:
-
-* **chromosome**: Chromosome identifier (e.g., 'chr1', 'chr2', 'chrX')
-* **start_pos**: Start position of the genomic interval (0-based, inclusive)
-* **end_pos**: End position of the genomic interval (0-based, exclusive, half-open)
-* **strand** (optional): Strand orientation ('+', '-', or '.')
-
-You must register the table schema with GIQL, mapping the logical genomic column
-(used in queries) to the physical columns in your table:
-
-.. code-block:: python
-
- engine.register_table_schema(
- "table_name",
- {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "strand": "VARCHAR", # Optional
- # ... other columns ...
- },
- genomic_column="interval", # Logical name used in queries
- )
-
-After registration, you can use ``interval`` in your GIQL queries, and the engine
-will automatically map it to the ``chromosome``, ``start_pos``, and ``end_pos``
-columns.
-
-Query with DuckDB
-~~~~~~~~~~~~~~~~~
-
-.. code-block:: python
-
- from giql import GIQLEngine
-
- with GIQLEngine(target_dialect="duckdb") as engine:
- # Load CSV file into database
- engine.load_csv("variants", "variants.csv")
-
- # Register schema mapping
- engine.register_table_schema(
- "variants",
- {
- "id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- },
- genomic_column="interval",
- )
-
- # Query using the logical 'interval' column (returns cursor for streaming)
- cursor = engine.execute("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
-
- # Process results lazily
- for row in cursor:
- print(row)
-
- # Or materialize to pandas DataFrame
- import pandas as pd
- cursor = engine.execute("SELECT ...")
- df = pd.DataFrame(cursor.fetchall(), columns=[desc[0] for desc in cursor.description])
-
-Query with SQLite
-~~~~~~~~~~~~~~~~~
-
-.. code-block:: python
-
- from giql import GIQLEngine
-
- with GIQLEngine(target_dialect="sqlite", db_path="data.db") as engine:
- cursor = engine.execute("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
-
- # Iterate results
- for row in cursor:
- print(row)
-
-Spatial Operators
------------------
-
-INTERSECTS
-~~~~~~~~~~
-
-Check if genomic ranges overlap:
-
-.. code-block:: sql
-
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
-
-CONTAINS
-~~~~~~~~
-
-Check if a range contains a point or another range:
-
-.. code-block:: sql
-
- SELECT * FROM variants
- WHERE interval CONTAINS 'chr1:1500'
-
-WITHIN
-~~~~~~
-
-Check if a range is within another range:
-
-.. code-block:: sql
-
- SELECT * FROM variants
- WHERE interval WITHIN 'chr1:1000-5000'
-
-Set Quantifiers
----------------
-
-ANY
-~~~
-
-Match any of the specified ranges:
-
-.. code-block:: sql
-
- SELECT * FROM variants
- WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000')
-
-ALL
-~~~
-
-Match all of the specified ranges:
-
-.. code-block:: sql
-
- SELECT * FROM variants
- WHERE interval CONTAINS ALL('chr1:1500', 'chr1:1600')
-
-Column-to-Column Joins
-----------------------
-
-Join tables on genomic position:
-
-.. code-block:: sql
-
- SELECT v.*, g.name
- FROM variants v
- INNER JOIN genes g ON v.interval INTERSECTS g.interval
-
-Transpiling to SQL
-------------------
-
-The ``transpile()`` method converts GIQL queries to standard SQL without executing them.
-This is useful for debugging, understanding the generated SQL, or integrating with external tools:
-
-.. code-block:: python
-
- from giql import GIQLEngine
-
- with GIQLEngine(target_dialect="duckdb") as engine:
- # Register table schema
- engine.register_table_schema(
- "variants",
- {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- },
- genomic_column="interval",
- )
-
- # Transpile GIQL to SQL
- sql = engine.transpile("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
-
- print(sql)
- # Output: SELECT * FROM variants WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000
-
-Different target dialects generate different SQL:
-
-.. code-block:: python
-
- # DuckDB dialect
- with GIQLEngine(target_dialect="duckdb") as engine:
- sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'")
- # Generates DuckDB-optimized SQL
-
- # SQLite dialect
- with GIQLEngine(target_dialect="sqlite") as engine:
- sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'")
- # Generates SQLite-compatible SQL
-
-The transpiled SQL can be executed directly on your database or used with other tools.
-Use ``verbose=True`` when creating the engine to see detailed transpilation information:
-
-.. code-block:: python
-
- with GIQLEngine(target_dialect="duckdb", verbose=True) as engine:
- sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'")
- # Prints detailed information about the transpilation process
diff --git a/docs/recipes/advanced-queries.rst b/docs/recipes/advanced-queries.rst
index 2aaf944..62147f6 100644
--- a/docs/recipes/advanced-queries.rst
+++ b/docs/recipes/advanced-queries.rst
@@ -16,16 +16,14 @@ Match Any of Multiple Regions
Find features overlapping any of several regions of interest:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM variants
- WHERE interval INTERSECTS ANY(
- 'chr1:1000000-2000000',
- 'chr1:5000000-6000000',
- 'chr2:1000000-3000000'
- )
- """)
+ SELECT * FROM variants
+ WHERE interval INTERSECTS ANY(
+ 'chr1:1000000-2000000',
+ 'chr1:5000000-6000000',
+ 'chr2:1000000-3000000'
+ )
**Use case:** Query multiple regions of interest in a single statement.
@@ -34,16 +32,14 @@ Match All of Multiple Points
Find features containing all specified positions:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM genes
- WHERE interval CONTAINS ALL(
- 'chr1:1500',
- 'chr1:1600',
- 'chr1:1700'
- )
- """)
+ SELECT * FROM genes
+ WHERE interval CONTAINS ALL(
+ 'chr1:1500',
+ 'chr1:1600',
+ 'chr1:1700'
+ )
**Use case:** Find genes spanning a set of SNP positions.
@@ -52,16 +48,14 @@ Exclude Multiple Regions
Find features that don't overlap any blacklisted region:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM peaks
- WHERE NOT interval INTERSECTS ANY(
- 'chr1:120000000-125000000', -- Centromere region
- 'chr1:140000000-142000000', -- Known artifact
- 'chrM:1-16569' -- Mitochondrial
- )
- """)
+ SELECT * FROM peaks
+ WHERE NOT interval INTERSECTS ANY(
+ 'chr1:120000000-125000000', -- Centromere region
+ 'chr1:140000000-142000000', -- Known artifact
+ 'chrM:1-16569' -- Mitochondrial
+ )
**Use case:** Filter out features in problematic genomic regions.
@@ -70,13 +64,11 @@ Combine ANY and ALL
Complex multi-range logic:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM features
- WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000')
- AND interval CONTAINS ALL('chr1:1100', 'chr1:1200')
- """)
+ SELECT * FROM features
+ WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000')
+ AND interval CONTAINS ALL('chr1:1100', 'chr1:1200')
**Use case:** Find features matching complex spatial criteria.
@@ -88,18 +80,16 @@ Multi-Attribute Filtering
Combine spatial and attribute filters:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT v.*, g.name AS gene_name, g.biotype
- FROM variants v
- INNER JOIN genes g ON v.interval INTERSECTS g.interval
- WHERE v.quality >= 30
- AND v.filter = 'PASS'
- AND v.allele_frequency > 0.01
- AND g.biotype = 'protein_coding'
- ORDER BY v.chromosome, v.start_pos
- """)
+ SELECT v.*, g.name AS gene_name, g.biotype
+ FROM variants v
+ INNER JOIN genes g ON v.interval INTERSECTS g.interval
+ WHERE v.quality >= 30
+ AND v.filter = 'PASS'
+ AND v.allele_frequency > 0.01
+ AND g.biotype = 'protein_coding'
+ ORDER BY v.chrom, v.start
**Use case:** Extract high-quality variants in protein-coding genes.
@@ -108,18 +98,16 @@ Target Gene Lists
Filter to specific genes of interest:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT v.*, g.name AS gene_name
- FROM variants v
- INNER JOIN genes g ON v.interval INTERSECTS g.interval
- WHERE g.name IN (
- 'BRCA1', 'BRCA2', 'TP53', 'EGFR', 'KRAS',
- 'BRAF', 'PIK3CA', 'PTEN', 'APC', 'ATM'
- )
- ORDER BY g.name, v.start_pos
- """)
+ SELECT v.*, g.name AS gene_name
+ FROM variants v
+ INNER JOIN genes g ON v.interval INTERSECTS g.interval
+ WHERE g.name IN (
+ 'BRCA1', 'BRCA2', 'TP53', 'EGFR', 'KRAS',
+ 'BRAF', 'PIK3CA', 'PTEN', 'APC', 'ATM'
+ )
+ ORDER BY g.name, v.start
**Use case:** Extract variants in clinically actionable genes.
@@ -128,22 +116,20 @@ Conditional Logic
Apply different criteria based on feature type:
-.. code-block:: python
-
- cursor = engine.execute("""
- SELECT v.*, g.name, g.biotype,
- CASE
- WHEN g.biotype = 'protein_coding' THEN 'coding'
- WHEN g.biotype LIKE '%RNA%' THEN 'noncoding_RNA'
- ELSE 'other'
- END AS gene_category
- FROM variants v
- INNER JOIN genes g ON v.interval INTERSECTS g.interval
- WHERE CASE
- WHEN g.biotype = 'protein_coding' THEN v.quality >= 30
- ELSE v.quality >= 20
- END
- """)
+.. code-block:: sql
+
+ SELECT v.*, g.name, g.biotype,
+ CASE
+ WHEN g.biotype = 'protein_coding' THEN 'coding'
+ WHEN g.biotype LIKE '%RNA%' THEN 'noncoding_RNA'
+ ELSE 'other'
+ END AS gene_category
+ FROM variants v
+ INNER JOIN genes g ON v.interval INTERSECTS g.interval
+ WHERE CASE
+ WHEN g.biotype = 'protein_coding' THEN v.quality >= 30
+ ELSE v.quality >= 20
+ END
**Use case:** Apply different quality thresholds based on genomic context.
@@ -155,19 +141,17 @@ Per-Chromosome Statistics
Calculate summary statistics by chromosome:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- a.chromosome,
- COUNT(DISTINCT a.name) AS total_features,
- COUNT(b.name) AS total_overlaps,
- COUNT(DISTINCT CASE WHEN b.name IS NOT NULL THEN a.name END) AS features_with_overlap
- FROM features_a a
- LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
- GROUP BY a.chromosome
- ORDER BY a.chromosome
- """)
+ SELECT
+ a.chrom,
+ COUNT(DISTINCT a.name) AS total_features,
+ COUNT(b.name) AS total_overlaps,
+ COUNT(DISTINCT CASE WHEN b.name IS NOT NULL THEN a.name END) AS features_with_overlap
+ FROM features_a a
+ LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
+ GROUP BY a.chrom
+ ORDER BY a.chrom
**Use case:** Compare feature distribution across chromosomes.
@@ -176,19 +160,17 @@ Overlap Statistics
Calculate overlap metrics:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- a.chromosome,
- COUNT(*) AS overlap_count,
- AVG(LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS avg_overlap_bp,
- SUM(LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS total_overlap_bp
- FROM features_a a
- INNER JOIN features_b b ON a.interval INTERSECTS b.interval
- GROUP BY a.chromosome
- ORDER BY a.chromosome
- """)
+ SELECT
+ a.chrom,
+ COUNT(*) AS overlap_count,
+ AVG(LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS avg_overlap_bp,
+ SUM(LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS total_overlap_bp
+ FROM features_a a
+ INNER JOIN features_b b ON a.interval INTERSECTS b.interval
+ GROUP BY a.chrom
+ ORDER BY a.chrom
**Use case:** Quantify overlap patterns across the genome.
@@ -197,19 +179,17 @@ Feature Size Distribution
Analyze feature sizes by category:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- biotype,
- COUNT(*) AS count,
- AVG(end_pos - start_pos) AS avg_length,
- MIN(end_pos - start_pos) AS min_length,
- MAX(end_pos - start_pos) AS max_length
- FROM genes
- GROUP BY biotype
- ORDER BY count DESC
- """)
+ SELECT
+ biotype,
+ COUNT(*) AS count,
+ AVG("end" - start) AS avg_length,
+ MIN("end" - start) AS min_length,
+ MAX("end" - start) AS max_length
+ FROM genes
+ GROUP BY biotype
+ ORDER BY count DESC
**Use case:** Compare size distributions across feature types.
@@ -221,14 +201,12 @@ Three-Way Intersection
Find features overlapping in all three tables:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT DISTINCT a.*
- FROM features_a a
- INNER JOIN features_b b ON a.interval INTERSECTS b.interval
- INNER JOIN features_c c ON a.interval INTERSECTS c.interval
- """)
+ SELECT DISTINCT a.*
+ FROM features_a a
+ INNER JOIN features_b b ON a.interval INTERSECTS b.interval
+ INNER JOIN features_c c ON a.interval INTERSECTS c.interval
**Use case:** Find consensus regions across multiple datasets.
@@ -237,19 +215,17 @@ Hierarchical Annotations
Join multiple annotation levels:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- v.name AS variant,
- e.name AS exon,
- t.name AS transcript,
- g.name AS gene
- FROM variants v
- INNER JOIN exons e ON v.interval INTERSECTS e.interval
- INNER JOIN transcripts t ON e.interval WITHIN t.interval
- INNER JOIN genes g ON t.interval WITHIN g.interval
- """)
+ SELECT
+ v.name AS variant,
+ e.name AS exon,
+ t.name AS transcript,
+ g.name AS gene
+ FROM variants v
+ INNER JOIN exons e ON v.interval INTERSECTS e.interval
+ INNER JOIN transcripts t ON e.interval WITHIN t.interval
+ INNER JOIN genes g ON t.interval WITHIN g.interval
**Use case:** Build hierarchical annotations for variants.
@@ -258,26 +234,24 @@ Union with Deduplication
Combine features from multiple sources:
-.. code-block:: python
-
- cursor = engine.execute("""
- WITH all_peaks AS (
- SELECT *, 'chip_seq' AS source FROM chip_peaks
- UNION ALL
- SELECT *, 'atac_seq' AS source FROM atac_peaks
- UNION ALL
- SELECT *, 'dnase_seq' AS source FROM dnase_peaks
- )
- SELECT
- chromosome,
- start_pos,
- end_pos,
- STRING_AGG(DISTINCT source, ',') AS sources,
- COUNT(DISTINCT source) AS source_count
- FROM all_peaks
- GROUP BY chromosome, start_pos, end_pos
- HAVING COUNT(DISTINCT source) >= 2
- """)
+.. code-block:: sql
+
+ WITH all_peaks AS (
+ SELECT *, 'chip_seq' AS source FROM chip_peaks
+ UNION ALL
+ SELECT *, 'atac_seq' AS source FROM atac_peaks
+ UNION ALL
+ SELECT *, 'dnase_seq' AS source FROM dnase_peaks
+ )
+ SELECT
+ chrom,
+ start,
+ "end",
+ STRING_AGG(DISTINCT source, ',') AS sources,
+ COUNT(DISTINCT source) AS source_count
+ FROM all_peaks
+ GROUP BY chrom, start, "end"
+ HAVING COUNT(DISTINCT source) >= 2
**Use case:** Find regulatory regions supported by multiple assays.
@@ -289,15 +263,13 @@ Filtered Subquery
Use subqueries to pre-filter data:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT v.*
- FROM variants v
- WHERE v.interval INTERSECTS ANY(
- SELECT position FROM genes WHERE biotype = 'protein_coding'
- )
- """)
+ SELECT v.*
+ FROM variants v
+ WHERE v.interval INTERSECTS ANY(
+ SELECT position FROM genes WHERE biotype = 'protein_coding'
+ )
**Use case:** Intersect with dynamically filtered reference data.
@@ -310,35 +282,33 @@ Chained CTEs
Build complex analyses with Common Table Expressions:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- WITH
- -- Step 1: Find high-quality variants
- hq_variants AS (
- SELECT * FROM variants
- WHERE quality >= 30 AND filter = 'PASS'
- ),
- -- Step 2: Annotate with genes
- annotated AS (
- SELECT v.*, g.name AS gene_name, g.biotype
- FROM hq_variants v
- LEFT JOIN genes g ON v.interval INTERSECTS g.interval
- ),
- -- Step 3: Summarize by gene
- gene_summary AS (
- SELECT
- gene_name,
- biotype,
- COUNT(*) AS variant_count
- FROM annotated
- WHERE gene_name IS NOT NULL
- GROUP BY gene_name, biotype
- )
- SELECT * FROM gene_summary
- ORDER BY variant_count DESC
- LIMIT 20
- """)
+ WITH
+ -- Step 1: Find high-quality variants
+ hq_variants AS (
+ SELECT * FROM variants
+ WHERE quality >= 30 AND filter = 'PASS'
+ ),
+ -- Step 2: Annotate with genes
+ annotated AS (
+ SELECT v.*, g.name AS gene_name, g.biotype
+ FROM hq_variants v
+ LEFT JOIN genes g ON v.interval INTERSECTS g.interval
+ ),
+ -- Step 3: Summarize by gene
+ gene_summary AS (
+ SELECT
+ gene_name,
+ biotype,
+ COUNT(*) AS variant_count
+ FROM annotated
+ WHERE gene_name IS NOT NULL
+ GROUP BY gene_name, biotype
+ )
+ SELECT * FROM gene_summary
+ ORDER BY variant_count DESC
+ LIMIT 20
**Use case:** Build multi-step analysis pipelines in a single query.
@@ -350,22 +320,20 @@ Rank Overlaps
Rank features by their overlap characteristics:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- a.name,
- a.chromosome,
- a.start_pos,
- overlap_count,
- RANK() OVER (ORDER BY overlap_count DESC) AS rank
- FROM (
- SELECT a.*, COUNT(b.name) AS overlap_count
- FROM features_a a
- LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
- GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand
- ) a
- """)
+ SELECT
+ a.name,
+ a.chrom,
+ a.start,
+ overlap_count,
+ RANK() OVER (ORDER BY overlap_count DESC) AS rank
+ FROM (
+ SELECT a.*, COUNT(b.name) AS overlap_count
+ FROM features_a a
+ LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
+ GROUP BY a.chrom, a.start, a.end, a.name, a.score, a.strand
+ ) a
**Use case:** Identify features with the most overlaps.
@@ -374,21 +342,19 @@ Running Totals
Calculate cumulative coverage:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- chromosome,
- start_pos,
- end_pos,
- end_pos - start_pos AS length,
- SUM(end_pos - start_pos) OVER (
- PARTITION BY chromosome
- ORDER BY start_pos
- ) AS cumulative_bp
- FROM features
- ORDER BY chromosome, start_pos
- """)
+ SELECT
+ chrom,
+ start,
+ end,
+ end - start AS length,
+ SUM(end - start) OVER (
+ PARTITION BY chrom
+ ORDER BY start
+ ) AS cumulative_bp
+ FROM features
+ ORDER BY chrom, start
**Use case:** Track cumulative coverage along each chromosome.
@@ -398,35 +364,20 @@ Debugging and Optimization
View Generated SQL
~~~~~~~~~~~~~~~~~~
-Use transpile() to see the SQL GIQL generates:
+Use ``transpile()`` to see the SQL GIQL generates:
.. code-block:: python
- sql = engine.transpile("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
+ from giql import transpile
+
+ sql = transpile(
+ "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'",
+ tables=["variants"],
+ )
print(sql)
- # See the actual SQL that will be executed
**Use case:** Debug queries or understand GIQL's translation.
-Verbose Mode
-~~~~~~~~~~~~
-
-Enable detailed logging:
-
-.. code-block:: python
-
- with GIQLEngine(target_dialect="duckdb", verbose=True) as engine:
- # All queries will print transpilation details
- cursor = engine.execute("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000-2000'
- """)
-
-**Use case:** Diagnose query translation issues.
-
Explain Query Plan
~~~~~~~~~~~~~~~~~~
@@ -434,16 +385,18 @@ Analyze query execution:
.. code-block:: python
- # First transpile to get the SQL
- sql = engine.transpile("""
+ from giql import transpile
+
+ sql = transpile(
+ """
SELECT v.*, g.name
FROM variants v
JOIN genes g ON v.interval INTERSECTS g.interval
- """)
+ """,
+ tables=["variants", "genes"],
+ )
# Then use database-native EXPLAIN
- cursor = engine.execute(f"EXPLAIN {sql}")
- for row in cursor:
- print(row)
+ # e.g., conn.execute(f"EXPLAIN {sql}")
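+
+ # For example, with DuckDB (assumes variants and genes tables are loaded):
+ # import duckdb
+ # conn = duckdb.connect("my_database.duckdb")
+ # for row in conn.execute(f"EXPLAIN {sql}").fetchall():
+ #     print(row)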
**Use case:** Optimize slow queries by examining execution plans.
diff --git a/docs/recipes/bedtools-migration.rst b/docs/recipes/bedtools-migration.rst
index 74c27bd..4a00011 100644
--- a/docs/recipes/bedtools-migration.rst
+++ b/docs/recipes/bedtools-migration.rst
@@ -20,19 +20,19 @@ Quick Reference Table
- GIQL Equivalent
- Recipe
* - ``intersect -a A -b B``
- - ``SELECT DISTINCT a.* FROM a, b WHERE a.pos INTERSECTS b.pos``
+ - ``SELECT DISTINCT a.* FROM a, b WHERE a.interval INTERSECTS b.interval``
- :ref:`intersect-basic`
* - ``intersect -a A -b B -wa``
- - ``SELECT a.* FROM a, b WHERE a.pos INTERSECTS b.pos``
+ - ``SELECT a.* FROM a, b WHERE a.interval INTERSECTS b.interval``
- :ref:`intersect-wa`
* - ``intersect -a A -b B -wb``
- - ``SELECT b.* FROM a, b WHERE a.pos INTERSECTS b.pos``
+ - ``SELECT b.* FROM a, b WHERE a.interval INTERSECTS b.interval``
- :ref:`intersect-wb`
* - ``intersect -a A -b B -wa -wb``
- - ``SELECT a.*, b.* FROM a, b WHERE a.pos INTERSECTS b.pos``
+ - ``SELECT a.*, b.* FROM a, b WHERE a.interval INTERSECTS b.interval``
- :ref:`intersect-wawb`
* - ``intersect -a A -b B -v``
- - ``SELECT a.* FROM a LEFT JOIN b ... WHERE b.chr IS NULL``
+ - ``SELECT a.* FROM a LEFT JOIN b ... WHERE b.chrom IS NULL``
- :ref:`intersect-v`
* - ``intersect -a A -b B -u``
- ``SELECT DISTINCT a.* FROM a JOIN b ...``
@@ -47,10 +47,10 @@ Quick Reference Table
- ``SELECT a.*, b.* FROM a LEFT JOIN b ...``
- :ref:`intersect-loj`
* - ``closest -a A -b B -k N``
- - ``CROSS JOIN LATERAL NEAREST(b, reference=a.pos, k=N)``
+ - ``CROSS JOIN LATERAL NEAREST(b, reference=a.interval, k=N)``
- :ref:`closest-k`
* - ``closest -a A -b B -d``
- - ``SELECT ..., DISTANCE(a.pos, b.pos) ...``
+ - ``SELECT ..., DISTANCE(a.interval, b.interval) ...``
- :ref:`closest-d`
* - ``cluster -i A``
- ``SELECT *, CLUSTER(interval) AS cluster_id FROM a``
@@ -84,13 +84,11 @@ Default: Report overlaps between A and B
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT DISTINCT a.*
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- """)
+ SELECT DISTINCT a.*
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
.. _intersect-wa:
@@ -105,13 +103,11 @@ Default: Report overlaps between A and B
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- """)
+ SELECT a.*
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
.. _intersect-wb:
@@ -126,13 +122,11 @@ Default: Report overlaps between A and B
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT b.*
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- """)
+ SELECT b.*
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
.. _intersect-wawb:
@@ -147,13 +141,11 @@ Default: Report overlaps between A and B
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*, b.*
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- """)
+ SELECT a.*, b.*
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
.. _intersect-v:
@@ -168,14 +160,12 @@ Default: Report overlaps between A and B
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*
- FROM features_a a
- LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
- WHERE b.chromosome IS NULL
- """)
+ SELECT a.*
+ FROM features_a a
+ LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
+ WHERE b.chrom IS NULL
.. _intersect-u:
@@ -190,13 +180,11 @@ Default: Report overlaps between A and B
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT DISTINCT a.*
- FROM features_a a
- INNER JOIN features_b b ON a.interval INTERSECTS b.interval
- """)
+ SELECT DISTINCT a.*
+ FROM features_a a
+ INNER JOIN features_b b ON a.interval INTERSECTS b.interval
.. _intersect-c:
@@ -211,14 +199,12 @@ Default: Report overlaps between A and B
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*, COUNT(b.name) AS overlap_count
- FROM features_a a
- LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
- GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand
- """)
+ SELECT a.*, COUNT(b.name) AS overlap_count
+ FROM features_a a
+ LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
+ GROUP BY a.chrom, a.start, a.end, a.name, a.score, a.strand
.. _intersect-wo:
@@ -233,16 +219,14 @@ Default: Report overlaps between A and B
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- a.*,
- b.*,
- (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- """)
+ SELECT
+ a.*,
+ b.*,
+ (LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS overlap_bp
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
.. _intersect-wao:
@@ -257,19 +241,17 @@ Default: Report overlaps between A and B
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- a.*,
- b.*,
- CASE
- WHEN b.chromosome IS NULL THEN 0
- ELSE LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)
- END AS overlap_bp
- FROM features_a a
- LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
- """)
+ SELECT
+ a.*,
+ b.*,
+ CASE
+ WHEN b.chrom IS NULL THEN 0
+ ELSE LEAST(a.end, b.end) - GREATEST(a.start, b.start)
+ END AS overlap_bp
+ FROM features_a a
+ LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
.. _intersect-loj:
@@ -284,13 +266,11 @@ Default: Report overlaps between A and B
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*, b.*
- FROM features_a a
- LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
- """)
+ SELECT a.*, b.*
+ FROM features_a a
+ LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
``-s``: Same strand overlaps only
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -303,14 +283,12 @@ Default: Report overlaps between A and B
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- AND a.strand = b.strand
- """)
+ SELECT a.*
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
+ AND a.strand = b.strand
``-S``: Opposite strand overlaps only
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -323,16 +301,14 @@ Default: Report overlaps between A and B
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- AND a.strand != b.strand
- AND a.strand IN ('+', '-')
- AND b.strand IN ('+', '-')
- """)
+ SELECT a.*
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
+ AND a.strand != b.strand
+ AND a.strand IN ('+', '-')
+ AND b.strand IN ('+', '-')
``-f``: Minimum overlap fraction of A
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -345,16 +321,14 @@ Default: Report overlaps between A and B
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- AND (
- LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)
- ) >= 0.5 * (a.end_pos - a.start_pos)
- """)
+ SELECT a.*
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
+ AND (
+ LEAST(a.end, b.end) - GREATEST(a.start, b.start)
+ ) >= 0.5 * (a.end - a.start)
``-r``: Reciprocal overlap
~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -367,23 +341,21 @@ Default: Report overlaps between A and B
**GIQL:**
-.. code-block:: python
-
- cursor = engine.execute("""
- WITH overlap_calcs AS (
- SELECT
- a.*,
- (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp,
- (a.end_pos - a.start_pos) AS a_length,
- (b.end_pos - b.start_pos) AS b_length
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- )
- SELECT chromosome, start_pos, end_pos, name, score, strand
- FROM overlap_calcs
- WHERE overlap_bp >= 0.5 * a_length
- AND overlap_bp >= 0.5 * b_length
- """)
+.. code-block:: sql
+
+ WITH overlap_calcs AS (
+ SELECT
+ a.*,
+ (LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS overlap_bp,
+ (a.end - a.start) AS a_length,
+ (b.end - b.start) AS b_length
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
+ )
+ SELECT chrom, start, end, name, score, strand
+ FROM overlap_calcs
+ WHERE overlap_bp >= 0.5 * a_length
+ AND overlap_bp >= 0.5 * b_length
bedtools closest
----------------
@@ -401,17 +373,15 @@ bedtools closest
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- peaks.name AS peak,
- nearest.name AS gene,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest
- ORDER BY peaks.name, nearest.distance
- """)
+ SELECT
+ peaks.name AS peak,
+ nearest.name AS gene,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest
+ ORDER BY peaks.name, nearest.distance
.. _closest-d:
@@ -426,31 +396,27 @@ bedtools closest
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- a.name AS peak,
- b.name AS gene,
- DISTANCE(a.interval, b.interval) AS distance
- FROM peaks a
- CROSS JOIN genes b
- WHERE a.chromosome = b.chromosome
- ORDER BY a.name, distance
- """)
+ SELECT
+ a.name AS peak,
+ b.name AS gene,
+ DISTANCE(a.interval, b.interval) AS distance
+ FROM peaks a
+ CROSS JOIN genes b
+ WHERE a.chrom = b.chrom
+ ORDER BY a.name, distance
Or using NEAREST for just the closest:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- peaks.name AS peak,
- nearest.name AS gene,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest
- """)
+ SELECT
+ peaks.name AS peak,
+ nearest.name AS gene,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest
``-s``: Same strand only
~~~~~~~~~~~~~~~~~~~~~~~~
@@ -463,22 +429,20 @@ Or using NEAREST for just the closest:
**GIQL:**
-.. code-block:: python
-
- cursor = engine.execute("""
- SELECT
- peaks.name,
- nearest.name AS gene,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(
- genes,
- reference=peaks.interval,
- k=3,
- stranded=true
- ) AS nearest
- ORDER BY peaks.name, nearest.distance
- """)
+.. code-block:: sql
+
+ SELECT
+ peaks.name,
+ nearest.name AS gene,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(
+ genes,
+ reference=peaks.interval,
+ k=3,
+ stranded=true
+ ) AS nearest
+ ORDER BY peaks.name, nearest.distance
bedtools cluster
----------------
@@ -496,15 +460,13 @@ Basic clustering
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- *,
- CLUSTER(interval) AS cluster_id
- FROM features
- ORDER BY chromosome, start_pos
- """)
+ SELECT
+ *,
+ CLUSTER(interval) AS cluster_id
+ FROM features
+ ORDER BY chrom, start
.. _cluster-d:
@@ -519,15 +481,13 @@ Basic clustering
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- *,
- CLUSTER(interval, 1000) AS cluster_id
- FROM features
- ORDER BY chromosome, start_pos
- """)
+ SELECT
+ *,
+ CLUSTER(interval, 1000) AS cluster_id
+ FROM features
+ ORDER BY chrom, start
``-s``: Strand-specific clustering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -540,15 +500,13 @@ Basic clustering
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- *,
- CLUSTER(interval, stranded=true) AS cluster_id
- FROM features
- ORDER BY chromosome, strand, start_pos
- """)
+ SELECT
+ *,
+ CLUSTER(interval, stranded=true) AS cluster_id
+ FROM features
+ ORDER BY chrom, strand, start
bedtools merge
--------------
@@ -566,12 +524,10 @@ Basic merge
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT MERGE(interval)
- FROM features
- """)
+ SELECT MERGE(interval)
+ FROM features
.. _merge-d:
@@ -586,12 +542,10 @@ Basic merge
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT MERGE(interval, 1000)
- FROM features
- """)
+ SELECT MERGE(interval, 1000)
+ FROM features
``-s``: Strand-specific merge
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -604,12 +558,10 @@ Basic merge
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT MERGE(interval, stranded=true)
- FROM features
- """)
+ SELECT MERGE(interval, stranded=true)
+ FROM features
.. _merge-count:
@@ -624,14 +576,12 @@ Basic merge
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- MERGE(interval),
- COUNT(*) AS feature_count
- FROM features
- """)
+ SELECT
+ MERGE(interval),
+ COUNT(*) AS feature_count
+ FROM features
``-c -o mean``: Average score
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -644,14 +594,12 @@ Basic merge
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- MERGE(interval),
- AVG(score) AS avg_score
- FROM features
- """)
+ SELECT
+ MERGE(interval),
+ AVG(score) AS avg_score
+ FROM features
``-c -o collapse``: Collect names
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -664,14 +612,12 @@ Basic merge
**GIQL:**
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- MERGE(interval),
- STRING_AGG(name, ',') AS feature_names
- FROM features
- """)
+ SELECT
+ MERGE(interval),
+ STRING_AGG(name, ',') AS feature_names
+ FROM features
Key Differences from Bedtools
-----------------------------
diff --git a/docs/recipes/clustering-queries.rst b/docs/recipes/clustering-queries.rst
index 6ff1487..3dbd682 100644
--- a/docs/recipes/clustering-queries.rst
+++ b/docs/recipes/clustering-queries.rst
@@ -16,15 +16,13 @@ Assign Cluster IDs
Assign unique cluster IDs to groups of overlapping intervals:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- *,
- CLUSTER(interval) AS cluster_id
- FROM features
- ORDER BY chromosome, start_pos
- """)
+ SELECT
+ *,
+ CLUSTER(interval) AS cluster_id
+ FROM features
+ ORDER BY chrom, start
**Use case:** Group overlapping peaks or annotations for downstream analysis.
@@ -33,21 +31,19 @@ View Cluster Assignments
See which features belong to which cluster:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- cluster_id,
- chromosome,
- name,
- start_pos,
- end_pos
- FROM (
- SELECT *, CLUSTER(interval) AS cluster_id
- FROM features
- )
- ORDER BY cluster_id, start_pos
- """)
+ SELECT
+ cluster_id,
+ chrom,
+ name,
+ start,
+ end
+ FROM (
+ SELECT *, CLUSTER(interval) AS cluster_id
+ FROM features
+ )
+ ORDER BY cluster_id, start
**Use case:** Inspect clustering results to understand feature groupings.
@@ -59,15 +55,13 @@ Cluster with Gap Tolerance
Cluster intervals that are within a specified distance of each other:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- *,
- CLUSTER(interval, 1000) AS cluster_id
- FROM features
- ORDER BY chromosome, start_pos
- """)
+ SELECT
+ *,
+ CLUSTER(interval, 1000) AS cluster_id
+ FROM features
+ ORDER BY chrom, start
**Use case:** Group nearby features even if they don't directly overlap
(e.g., cluster peaks within 1kb of each other).
@@ -77,22 +71,16 @@ Variable Distance Thresholds
Experiment with different clustering distances:
-.. code-block:: python
+.. code-block:: sql
- # Tight clustering (overlapping only)
- cursor = engine.execute("""
- SELECT *, CLUSTER(interval, 0) AS tight_cluster FROM features
- """)
+ -- Tight clustering (overlapping only)
+ SELECT *, CLUSTER(interval, 0) AS tight_cluster FROM features
- # Medium clustering (within 500bp)
- cursor = engine.execute("""
- SELECT *, CLUSTER(interval, 500) AS medium_cluster FROM features
- """)
+ -- Medium clustering (within 500bp)
+ SELECT *, CLUSTER(interval, 500) AS medium_cluster FROM features
- # Loose clustering (within 5kb)
- cursor = engine.execute("""
- SELECT *, CLUSTER(interval, 5000) AS loose_cluster FROM features
- """)
+ -- Loose clustering (within 5kb)
+ SELECT *, CLUSTER(interval, 5000) AS loose_cluster FROM features
**Use case:** Compare clustering at different resolutions for sensitivity analysis.
@@ -104,15 +92,13 @@ Cluster by Strand
Cluster intervals separately for each strand:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- *,
- CLUSTER(interval, stranded=true) AS cluster_id
- FROM features
- ORDER BY chromosome, strand, start_pos
- """)
+ SELECT
+ *,
+ CLUSTER(interval, stranded=true) AS cluster_id
+ FROM features
+ ORDER BY chrom, strand, start
**Use case:** Maintain strand separation when clustering transcripts or
strand-specific regulatory elements.
@@ -122,15 +108,13 @@ Strand-Specific with Distance
Combine strand awareness with distance tolerance:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- *,
- CLUSTER(interval, 1000, stranded=true) AS cluster_id
- FROM features
- ORDER BY chromosome, strand, start_pos
- """)
+ SELECT
+ *,
+ CLUSTER(interval, 1000, stranded=true) AS cluster_id
+ FROM features
+ ORDER BY chrom, strand, start
**Use case:** Cluster nearby same-strand features while keeping opposite
strands separate.
@@ -143,23 +127,21 @@ Count Features per Cluster
Calculate how many features are in each cluster:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- WITH clustered AS (
- SELECT *, CLUSTER(interval) AS cluster_id
- FROM features
- )
- SELECT
- cluster_id,
- chromosome,
- COUNT(*) AS feature_count,
- MIN(start_pos) AS cluster_start,
- MAX(end_pos) AS cluster_end
- FROM clustered
- GROUP BY cluster_id, chromosome
- ORDER BY chromosome, cluster_start
- """)
+ WITH clustered AS (
+ SELECT *, CLUSTER(interval) AS cluster_id
+ FROM features
+ )
+ SELECT
+ cluster_id,
+ chrom,
+ COUNT(*) AS feature_count,
+ MIN(start) AS cluster_start,
+ MAX(end) AS cluster_end
+ FROM clustered
+ GROUP BY cluster_id, chrom
+ ORDER BY chrom, cluster_start
**Use case:** Identify cluster sizes and boundaries.
@@ -168,24 +150,22 @@ Filter by Cluster Size
Find clusters with a minimum number of features:
-.. code-block:: python
-
- cursor = engine.execute("""
- WITH clustered AS (
- SELECT *, CLUSTER(interval) AS cluster_id
- FROM features
- ),
- cluster_sizes AS (
- SELECT cluster_id, COUNT(*) AS size
- FROM clustered
- GROUP BY cluster_id
- )
- SELECT c.*
- FROM clustered c
- JOIN cluster_sizes s ON c.cluster_id = s.cluster_id
- WHERE s.size >= 3
- ORDER BY c.cluster_id, c.start_pos
- """)
+.. code-block:: sql
+
+ WITH clustered AS (
+ SELECT *, CLUSTER(interval) AS cluster_id
+ FROM features
+ ),
+ cluster_sizes AS (
+ SELECT cluster_id, COUNT(*) AS size
+ FROM clustered
+ GROUP BY cluster_id
+ )
+ SELECT c.*
+ FROM clustered c
+ JOIN cluster_sizes s ON c.cluster_id = s.cluster_id
+ WHERE s.size >= 3
+ ORDER BY c.cluster_id, c.start
**Use case:** Focus on regions with multiple overlapping features (hotspots).
@@ -194,26 +174,24 @@ Cluster Summary Statistics
Calculate statistics for each cluster:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- WITH clustered AS (
- SELECT *, CLUSTER(interval) AS cluster_id
- FROM features
- )
- SELECT
- cluster_id,
- chromosome,
- COUNT(*) AS feature_count,
- MIN(start_pos) AS cluster_start,
- MAX(end_pos) AS cluster_end,
- MAX(end_pos) - MIN(start_pos) AS cluster_span,
- AVG(score) AS avg_score,
- MAX(score) AS max_score
- FROM clustered
- GROUP BY cluster_id, chromosome
- ORDER BY feature_count DESC
- """)
+ WITH clustered AS (
+ SELECT *, CLUSTER(interval) AS cluster_id
+ FROM features
+ )
+ SELECT
+ cluster_id,
+ chrom,
+ COUNT(*) AS feature_count,
+ MIN(start) AS cluster_start,
+ MAX(end) AS cluster_end,
+ MAX(end) - MIN(start) AS cluster_span,
+ AVG(score) AS avg_score,
+ MAX(score) AS max_score
+ FROM clustered
+ GROUP BY cluster_id, chrom
+ ORDER BY feature_count DESC
**Use case:** Rank clusters by size, span, or aggregate scores.
@@ -225,12 +203,10 @@ Merge Overlapping Intervals
Combine overlapping intervals into unified regions:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT MERGE(interval)
- FROM features
- """)
+ SELECT MERGE(interval)
+ FROM features
**Use case:** Create non-overlapping consensus regions from redundant annotations.
@@ -239,12 +215,10 @@ Merge with Distance
Merge intervals within a specified distance:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT MERGE(interval, 1000)
- FROM features
- """)
+ SELECT MERGE(interval, 1000)
+ FROM features
**Use case:** Create broader regions by joining nearby features.
@@ -253,12 +227,10 @@ Strand-Specific Merge
Merge intervals separately by strand:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT MERGE(interval, stranded=true)
- FROM features
- """)
+ SELECT MERGE(interval, stranded=true)
+ FROM features
**Use case:** Create strand-aware consensus regions.
@@ -270,14 +242,12 @@ Count Merged Features
Track how many features were merged into each region:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- MERGE(interval),
- COUNT(*) AS feature_count
- FROM features
- """)
+ SELECT
+ MERGE(interval),
+ COUNT(*) AS feature_count
+ FROM features
**Use case:** Understand the complexity of each merged region.
@@ -286,17 +256,15 @@ Aggregate Scores
Calculate statistics for merged regions:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- MERGE(interval),
- COUNT(*) AS feature_count,
- AVG(score) AS avg_score,
- MAX(score) AS max_score,
- SUM(score) AS total_score
- FROM features
- """)
+ SELECT
+ MERGE(interval),
+ COUNT(*) AS feature_count,
+ AVG(score) AS avg_score,
+ MAX(score) AS max_score,
+ SUM(score) AS total_score
+ FROM features
**Use case:** Summarize signal intensity across merged regions.
@@ -305,14 +273,12 @@ Collect Feature Names
List the names of features that were merged:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- MERGE(interval),
- STRING_AGG(name, ',') AS merged_features
- FROM features
- """)
+ SELECT
+ MERGE(interval),
+ STRING_AGG(name, ',') AS merged_features
+ FROM features
**Use case:** Track provenance of merged regions.
@@ -324,16 +290,14 @@ Total Base Pair Coverage
Calculate total genomic coverage after merging:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- WITH merged AS (
- SELECT MERGE(interval)
- FROM features
- )
- SELECT SUM(end_pos - start_pos) AS total_coverage_bp
- FROM merged
- """)
+ WITH merged AS (
+ SELECT MERGE(interval)
+ FROM features
+ )
+ SELECT SUM(end - start) AS total_coverage_bp
+ FROM merged
**Use case:** Calculate the total genome fraction covered by features.
@@ -342,21 +306,19 @@ Coverage per Chromosome
Calculate coverage for each chromosome:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- WITH merged AS (
- SELECT MERGE(interval)
- FROM features
- )
- SELECT
- chromosome,
- COUNT(*) AS region_count,
- SUM(end_pos - start_pos) AS coverage_bp
- FROM merged
- GROUP BY chromosome
- ORDER BY chromosome
- """)
+ WITH merged AS (
+ SELECT MERGE(interval)
+ FROM features
+ )
+ SELECT
+ chrom,
+ COUNT(*) AS region_count,
+ SUM(end - start) AS coverage_bp
+ FROM merged
+ GROUP BY chrom
+ ORDER BY chrom
**Use case:** Compare feature density across chromosomes.
@@ -365,29 +327,27 @@ Coverage Reduction
Compare raw vs merged coverage:
-.. code-block:: python
-
- cursor = engine.execute("""
- WITH raw_stats AS (
- SELECT
- COUNT(*) AS raw_count,
- SUM(end_pos - start_pos) AS raw_bp
- FROM features
- ),
- merged_stats AS (
- SELECT
- COUNT(*) AS merged_count,
- SUM(end_pos - start_pos) AS merged_bp
- FROM (SELECT MERGE(interval) FROM features)
- )
+.. code-block:: sql
+
+ WITH raw_stats AS (
+ SELECT
+ COUNT(*) AS raw_count,
+ SUM(end - start) AS raw_bp
+ FROM features
+ ),
+ merged_stats AS (
SELECT
- raw_count,
- merged_count,
- raw_bp,
- merged_bp,
- ROUND(100.0 * merged_bp / raw_bp, 2) AS coverage_retained_pct
- FROM raw_stats, merged_stats
- """)
+ COUNT(*) AS merged_count,
+ SUM(end - start) AS merged_bp
+ FROM (SELECT MERGE(interval) FROM features)
+ )
+ SELECT
+ raw_count,
+ merged_count,
+ raw_bp,
+ merged_bp,
+ ROUND(100.0 * merged_bp / raw_bp, 2) AS coverage_retained_pct
+ FROM raw_stats, merged_stats
**Use case:** Quantify the redundancy in your feature set.
@@ -399,24 +359,22 @@ Cluster Then Merge
First cluster features, then analyze each cluster:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- WITH clustered AS (
- SELECT *, CLUSTER(interval) AS cluster_id
- FROM features
- )
- SELECT
- cluster_id,
- MIN(chromosome) AS chromosome,
- MIN(start_pos) AS start_pos,
- MAX(end_pos) AS end_pos,
- COUNT(*) AS feature_count,
- STRING_AGG(name, ',') AS features
- FROM clustered
- GROUP BY cluster_id
- ORDER BY chromosome, start_pos
- """)
+ WITH clustered AS (
+ SELECT *, CLUSTER(interval) AS cluster_id
+ FROM features
+ )
+ SELECT
+ cluster_id,
+ MIN(chrom) AS chrom,
+ MIN(start) AS start,
+ MAX(end) AS end,
+ COUNT(*) AS feature_count,
+ STRING_AGG(name, ',') AS features
+ FROM clustered
+ GROUP BY cluster_id
+ ORDER BY chrom, start
**Use case:** Alternative to MERGE that preserves cluster identifiers.
@@ -425,26 +383,24 @@ Hierarchical Clustering
Apply multiple clustering levels:
-.. code-block:: python
-
- cursor = engine.execute("""
- WITH level1 AS (
- SELECT *, CLUSTER(interval, 0) AS cluster_l1
- FROM features
- ),
- level2 AS (
- SELECT *, CLUSTER(interval, 1000) AS cluster_l2
- FROM level1
- )
- SELECT
- cluster_l1,
- cluster_l2,
- chromosome,
- name,
- start_pos,
- end_pos
- FROM level2
- ORDER BY cluster_l2, cluster_l1, start_pos
- """)
+.. code-block:: sql
+
+ WITH level1 AS (
+ SELECT *, CLUSTER(interval, 0) AS cluster_l1
+ FROM features
+ ),
+ level2 AS (
+ SELECT *, CLUSTER(interval, 1000) AS cluster_l2
+ FROM level1
+ )
+ SELECT
+ cluster_l1,
+ cluster_l2,
+ chrom,
+ name,
+ start,
+ end
+ FROM level2
+ ORDER BY cluster_l2, cluster_l1, start
**Use case:** Analyze feature relationships at multiple scales.
diff --git a/docs/recipes/distance-queries.rst b/docs/recipes/distance-queries.rst
index 41f9ede..c71a4ee 100644
--- a/docs/recipes/distance-queries.rst
+++ b/docs/recipes/distance-queries.rst
@@ -16,24 +16,22 @@ Distance Between Feature Pairs
Calculate the distance between features in two tables:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- a.name AS feature_a,
- b.name AS feature_b,
- DISTANCE(a.interval, b.interval) AS distance
- FROM features_a a
- CROSS JOIN features_b b
- WHERE a.chromosome = b.chromosome
- ORDER BY a.name, distance
- """)
+ SELECT
+ a.name AS feature_a,
+ b.name AS feature_b,
+ DISTANCE(a.interval, b.interval) AS distance
+ FROM features_a a
+ CROSS JOIN features_b b
+ WHERE a.chrom = b.chrom
+ ORDER BY a.name, distance
**Use case:** Generate a distance matrix between regulatory elements and genes.
.. note::
- Always include ``WHERE a.chromosome = b.chromosome`` to avoid comparing
+ Always include ``WHERE a.chrom = b.chrom`` to avoid comparing
features on different chromosomes (which returns NULL for distance).
Identify Overlapping vs Proximal
@@ -41,23 +39,21 @@ Identify Overlapping vs Proximal
Classify relationships based on distance:
-.. code-block:: python
-
- cursor = engine.execute("""
- SELECT
- p.name AS peak,
- g.name AS gene,
- DISTANCE(p.interval, g.interval) AS dist,
- CASE
- WHEN DISTANCE(p.interval, g.interval) = 0 THEN 'overlapping'
- WHEN DISTANCE(p.interval, g.interval) <= 1000 THEN 'proximal (<1kb)'
- WHEN DISTANCE(p.interval, g.interval) <= 10000 THEN 'nearby (<10kb)'
- ELSE 'distant'
- END AS relationship
- FROM peaks p
- CROSS JOIN genes g
- WHERE p.chromosome = g.chromosome
- """)
+.. code-block:: sql
+
+ SELECT
+ p.name AS peak,
+ g.name AS gene,
+ DISTANCE(p.interval, g.interval) AS dist,
+ CASE
+ WHEN DISTANCE(p.interval, g.interval) = 0 THEN 'overlapping'
+ WHEN DISTANCE(p.interval, g.interval) <= 1000 THEN 'proximal (<1kb)'
+ WHEN DISTANCE(p.interval, g.interval) <= 10000 THEN 'nearby (<10kb)'
+ ELSE 'distant'
+ END AS relationship
+ FROM peaks p
+ CROSS JOIN genes g
+ WHERE p.chrom = g.chrom
**Use case:** Categorize peak-gene relationships for enhancer analysis.
@@ -66,19 +62,17 @@ Filter by Maximum Distance
Find feature pairs within a distance threshold:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- a.name,
- b.name,
- DISTANCE(a.interval, b.interval) AS dist
- FROM features_a a
- CROSS JOIN features_b b
- WHERE a.chromosome = b.chromosome
- AND DISTANCE(a.interval, b.interval) <= 50000
- ORDER BY dist
- """)
+ SELECT
+ a.name,
+ b.name,
+ DISTANCE(a.interval, b.interval) AS dist
+ FROM features_a a
+ CROSS JOIN features_b b
+ WHERE a.chrom = b.chrom
+ AND DISTANCE(a.interval, b.interval) <= 50000
+ ORDER BY dist
**Use case:** Find regulatory elements within 50kb of genes.
@@ -90,17 +84,15 @@ Find K Nearest Features
For each peak, find the 3 nearest genes:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- peaks.name AS peak,
- nearest.name AS gene,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest
- ORDER BY peaks.name, nearest.distance
- """)
+ SELECT
+ peaks.name AS peak,
+ nearest.name AS gene,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest
+ ORDER BY peaks.name, nearest.distance
**Use case:** Annotate ChIP-seq peaks with nearby genes.
@@ -109,13 +101,11 @@ Nearest Feature to a Specific Location
Find the 5 nearest genes to a specific genomic coordinate:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT name, distance
- FROM NEAREST(genes, reference='chr1:1000000-1001000', k=5)
- ORDER BY distance
- """)
+ SELECT name, distance
+ FROM NEAREST(genes, reference='chr1:1000000-1001000', k=5)
+ ORDER BY distance
**Use case:** Explore the genomic neighborhood of a position of interest.
@@ -124,22 +114,20 @@ Nearest with Distance Constraint
Find nearest features within a maximum distance:
-.. code-block:: python
-
- cursor = engine.execute("""
- SELECT
- peaks.name AS peak,
- nearest.name AS gene,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(
- genes,
- reference=peaks.interval,
- k=5,
- max_distance=100000
- ) AS nearest
- ORDER BY peaks.name, nearest.distance
- """)
+.. code-block:: sql
+
+ SELECT
+ peaks.name AS peak,
+ nearest.name AS gene,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(
+ genes,
+ reference=peaks.interval,
+ k=5,
+ max_distance=100000
+ ) AS nearest
+ ORDER BY peaks.name, nearest.distance
**Use case:** Find regulatory targets within 100kb, ignoring distant genes.
@@ -151,23 +139,21 @@ Same-Strand Nearest Neighbors
Find nearest features on the same strand only:
-.. code-block:: python
-
- cursor = engine.execute("""
- SELECT
- peaks.name AS peak,
- nearest.name AS gene,
- nearest.strand,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(
- genes,
- reference=peaks.interval,
- k=3,
- stranded=true
- ) AS nearest
- ORDER BY peaks.name, nearest.distance
- """)
+.. code-block:: sql
+
+ SELECT
+ peaks.name AS peak,
+ nearest.name AS gene,
+ nearest.strand,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(
+ genes,
+ reference=peaks.interval,
+ k=3,
+ stranded=true
+ ) AS nearest
+ ORDER BY peaks.name, nearest.distance
**Use case:** Find same-strand genes for strand-specific regulatory analysis.
@@ -179,23 +165,21 @@ Upstream Features
Find features upstream (5') of reference positions using signed distances:
-.. code-block:: python
-
- cursor = engine.execute("""
- SELECT
- peaks.name AS peak,
- nearest.name AS gene,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(
- genes,
- reference=peaks.interval,
- k=10,
- signed=true
- ) AS nearest
- WHERE nearest.distance < 0
- ORDER BY peaks.name, nearest.distance DESC
- """)
+.. code-block:: sql
+
+ SELECT
+ peaks.name AS peak,
+ nearest.name AS gene,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(
+ genes,
+ reference=peaks.interval,
+ k=10,
+ signed=true
+ ) AS nearest
+ WHERE nearest.distance < 0
+ ORDER BY peaks.name, nearest.distance DESC
**Use case:** Find genes upstream of regulatory elements.
@@ -209,23 +193,21 @@ Downstream Features
Find features downstream (3') of reference positions:
-.. code-block:: python
-
- cursor = engine.execute("""
- SELECT
- peaks.name AS peak,
- nearest.name AS gene,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(
- genes,
- reference=peaks.interval,
- k=10,
- signed=true
- ) AS nearest
- WHERE nearest.distance > 0
- ORDER BY peaks.name, nearest.distance
- """)
+.. code-block:: sql
+
+ SELECT
+ peaks.name AS peak,
+ nearest.name AS gene,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(
+ genes,
+ reference=peaks.interval,
+ k=10,
+ signed=true
+ ) AS nearest
+ WHERE nearest.distance > 0
+ ORDER BY peaks.name, nearest.distance
**Use case:** Identify downstream targets of promoter elements.
@@ -234,23 +216,21 @@ Promoter-Proximal Analysis
Find features within a specific distance window around the reference:
-.. code-block:: python
-
- cursor = engine.execute("""
- SELECT
- peaks.name AS peak,
- nearest.name AS gene,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(
- genes,
- reference=peaks.interval,
- k=10,
- signed=true
- ) AS nearest
- WHERE nearest.distance BETWEEN -2000 AND 500
- ORDER BY peaks.name, ABS(nearest.distance)
- """)
+.. code-block:: sql
+
+ SELECT
+ peaks.name AS peak,
+ nearest.name AS gene,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(
+ genes,
+ reference=peaks.interval,
+ k=10,
+ signed=true
+ ) AS nearest
+ WHERE nearest.distance BETWEEN -2000 AND 500
+ ORDER BY peaks.name, ABS(nearest.distance)
**Use case:** Find genes with peaks in their promoter regions (-2kb to +500bp from TSS).
@@ -262,25 +242,23 @@ Strand-Specific with Distance Constraint
Find nearby same-strand features:
-.. code-block:: python
-
- cursor = engine.execute("""
- SELECT
- peaks.name AS peak,
- nearest.name AS gene,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(
- genes,
- reference=peaks.interval,
- k=5,
- max_distance=50000,
- stranded=true,
- signed=true
- ) AS nearest
- WHERE nearest.distance BETWEEN -10000 AND 10000
- ORDER BY peaks.name, ABS(nearest.distance)
- """)
+.. code-block:: sql
+
+ SELECT
+ peaks.name AS peak,
+ nearest.name AS gene,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(
+ genes,
+ reference=peaks.interval,
+ k=5,
+ max_distance=50000,
+ stranded=true,
+ signed=true
+ ) AS nearest
+ WHERE nearest.distance BETWEEN -10000 AND 10000
+ ORDER BY peaks.name, ABS(nearest.distance)
**Use case:** Find same-strand genes within ±10kb for promoter-enhancer analysis.
@@ -292,23 +270,21 @@ Average Distance to Nearest Gene
Calculate the average distance from peaks to their nearest gene:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- WITH nearest_genes AS (
- SELECT
- peaks.name AS peak,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest
- )
+ WITH nearest_genes AS (
SELECT
- COUNT(*) AS peak_count,
- AVG(distance) AS avg_distance,
- MIN(distance) AS min_distance,
- MAX(distance) AS max_distance
- FROM nearest_genes
- """)
+ peaks.name AS peak,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest
+ )
+ SELECT
+ COUNT(*) AS peak_count,
+ AVG(distance) AS avg_distance,
+ MIN(distance) AS min_distance,
+ MAX(distance) AS max_distance
+ FROM nearest_genes
**Use case:** Characterize the genomic distribution of peaks relative to genes.
@@ -317,25 +293,23 @@ Distance Distribution by Chromosome
Analyze distance patterns per chromosome:
-.. code-block:: python
-
- cursor = engine.execute("""
- WITH nearest_genes AS (
- SELECT
- peaks.chromosome,
- peaks.name AS peak,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest
- )
+.. code-block:: sql
+
+ WITH nearest_genes AS (
SELECT
- chromosome,
- COUNT(*) AS peak_count,
- AVG(distance) AS avg_distance
- FROM nearest_genes
- GROUP BY chromosome
- ORDER BY chromosome
- """)
+ peaks.chrom,
+ peaks.name AS peak,
+ nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest
+ )
+ SELECT
+ chrom,
+ COUNT(*) AS peak_count,
+ AVG(distance) AS avg_distance
+ FROM nearest_genes
+ GROUP BY chrom
+ ORDER BY chrom
**Use case:** Compare regulatory element distribution across chromosomes.
@@ -347,26 +321,24 @@ Expand Search Window
Find features within an expanded window around each feature:
-.. code-block:: python
-
- cursor = engine.execute("""
- WITH expanded AS (
- SELECT
- name,
- chromosome,
- start_pos - 5000 AS search_start,
- end_pos + 5000 AS search_end
- FROM peaks
- )
+.. code-block:: sql
+
+ WITH expanded AS (
SELECT
- e.name AS peak,
- b.*
- FROM expanded e
- JOIN features_b b
- ON b.chromosome = e.chromosome
- AND b.start_pos < e.search_end
- AND b.end_pos > e.search_start
- """)
+ name,
+ chrom,
+ start - 5000 AS search_start,
+ end + 5000 AS search_end
+ FROM peaks
+ )
+ SELECT
+ e.name AS peak,
+ b.*
+ FROM expanded e
+ JOIN features_b b
+ ON b.chrom = e.chrom
+ AND b.start < e.search_end
+ AND b.end > e.search_start
**Use case:** Find all features within 5kb flanking regions.
diff --git a/docs/recipes/index.rst b/docs/recipes/index.rst
index f5d7a2c..5597846 100644
--- a/docs/recipes/index.rst
+++ b/docs/recipes/index.rst
@@ -11,34 +11,21 @@ using GIQL. Each recipe focuses on a specific use case with ready-to-use query p
Getting Started with Recipes
----------------------------
-All recipes assume you have set up a GIQL engine and registered your table schemas:
+All recipes show GIQL queries that you can transpile and execute on your database.
+Setup:
.. code-block:: python
- from giql import GIQLEngine
-
- with GIQLEngine(target_dialect="duckdb") as engine:
- # Load your data
- engine.load_csv("features_a", "file_a.bed")
- engine.load_csv("features_b", "file_b.bed")
-
- # Register schemas with genomic column mapping
- for table in ["features_a", "features_b"]:
- engine.register_table_schema(
- table,
- {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "name": "VARCHAR",
- "score": "FLOAT",
- "strand": "VARCHAR",
- },
- genomic_column="interval",
- )
-
- # Now run queries from the recipes below
- cursor = engine.execute("...")
+ from giql import transpile
+
+ # Transpile any GIQL query to SQL
+ sql = transpile(
+ "... GIQL query from the recipes below ...",
+ tables=["features_a", "features_b"],
+ )
+
+ # Then execute the SQL on your database connection
+ # e.g., conn.execute(sql)
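+
+ # For example, with DuckDB (assumes the tables are already loaded):
+ # import duckdb
+ # conn = duckdb.connect("my_database.duckdb")
+ # rows = conn.execute(sql).fetchall()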
Recipe Categories
-----------------
diff --git a/docs/recipes/intersect-queries.rst b/docs/recipes/intersect-queries.rst
index fee0324..ef7c022 100644
--- a/docs/recipes/intersect-queries.rst
+++ b/docs/recipes/intersect-queries.rst
@@ -16,13 +16,11 @@ Basic Overlap Query
Find all features in table A that overlap with any feature in table B:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT DISTINCT a.*
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- """)
+ SELECT DISTINCT a.*
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
**Use case:** Identify variants that fall within gene regions.
@@ -32,13 +30,11 @@ Get All Overlap Pairs
Return every pair of overlapping features (may produce duplicates if one
feature overlaps multiple others):
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*, b.*
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- """)
+ SELECT a.*, b.*
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
**Use case:** Generate a full overlap matrix for downstream analysis.
@@ -47,12 +43,10 @@ Query Against a Specific Region
Find features overlapping a literal genomic range:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT * FROM variants
- WHERE interval INTERSECTS 'chr1:1000000-2000000'
- """)
+ SELECT * FROM variants
+ WHERE interval INTERSECTS 'chr1:1000000-2000000'
**Use case:** Extract all data for a specific chromosomal region.
@@ -64,14 +58,12 @@ Excluding Overlaps
Find features in A that do NOT overlap with any feature in B:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*
- FROM features_a a
- LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
- WHERE b.chromosome IS NULL
- """)
+ SELECT a.*
+ FROM features_a a
+ LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
+ WHERE b.chrom IS NULL
**Use case:** Find regulatory regions that don't overlap with known genes,
or identify variants outside of exonic regions.
@@ -81,13 +73,11 @@ Features with Any Overlap (Unique)
Return each feature from A only once, regardless of how many B features it overlaps:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT DISTINCT a.*
- FROM features_a a
- INNER JOIN features_b b ON a.interval INTERSECTS b.interval
- """)
+ SELECT DISTINCT a.*
+ FROM features_a a
+ INNER JOIN features_b b ON a.interval INTERSECTS b.interval
**Use case:** Get a deduplicated list of features that have at least one overlap.
@@ -99,14 +89,12 @@ Count Overlapping Features
Count how many B features each A feature overlaps:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*, COUNT(b.name) AS overlap_count
- FROM features_a a
- LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
- GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand
- """)
+ SELECT a.*, COUNT(b.name) AS overlap_count
+ FROM features_a a
+ LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
+ GROUP BY a.chrom, a.start, a.end, a.name, a.score, a.strand
**Use case:** Calculate how many enhancers each gene overlaps with,
or count variants per feature.
@@ -116,15 +104,13 @@ Filter by Overlap Count
Find features that overlap at least N other features:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*
- FROM features_a a
- INNER JOIN features_b b ON a.interval INTERSECTS b.interval
- GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand
- HAVING COUNT(*) >= 3
- """)
+ SELECT a.*
+ FROM features_a a
+ INNER JOIN features_b b ON a.interval INTERSECTS b.interval
+ GROUP BY a.chrom, a.start, a.end, a.name, a.score, a.strand
+ HAVING COUNT(*) >= 3
**Use case:** Identify hotspot regions with high feature density.
@@ -136,14 +122,12 @@ Same-Strand Overlaps
Find overlapping features on the same strand:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*, b.name AS b_name
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- AND a.strand = b.strand
- """)
+ SELECT a.*, b.name AS b_name
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
+ AND a.strand = b.strand
**Use case:** Find sense-strand overlaps for transcript analysis.
@@ -152,16 +136,14 @@ Opposite-Strand Overlaps
Find overlapping features on opposite strands:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*, b.name AS b_name
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- AND a.strand != b.strand
- AND a.strand IN ('+', '-')
- AND b.strand IN ('+', '-')
- """)
+ SELECT a.*, b.name AS b_name
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
+ AND a.strand != b.strand
+ AND a.strand IN ('+', '-')
+ AND b.strand IN ('+', '-')
**Use case:** Identify antisense overlaps or convergent transcription.
@@ -173,16 +155,14 @@ Minimum Overlap Fraction of A
Find overlaps where at least 50% of feature A is covered:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- AND (
- LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)
- ) >= 0.5 * (a.end_pos - a.start_pos)
- """)
+ SELECT a.*
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
+ AND (
+ LEAST(a.end, b.end) - GREATEST(a.start, b.start)
+ ) >= 0.5 * (a.end - a.start)
**Use case:** Ensure substantial overlap rather than just touching edges.
@@ -191,16 +171,14 @@ Minimum Overlap Fraction of B
Find overlaps where at least 50% of feature B is covered:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- AND (
- LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)
- ) >= 0.5 * (b.end_pos - b.start_pos)
- """)
+ SELECT a.*
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
+ AND (
+ LEAST(a.end, b.end) - GREATEST(a.start, b.start)
+ ) >= 0.5 * (b.end - b.start)
**Use case:** Find features that substantially cover smaller annotations.
@@ -209,24 +187,22 @@ Reciprocal Overlap
Require both features to have at least 50% mutual overlap:
-.. code-block:: python
-
- cursor = engine.execute("""
- WITH overlap_calcs AS (
- SELECT
- a.*,
- b.name AS b_name,
- (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp,
- (a.end_pos - a.start_pos) AS a_length,
- (b.end_pos - b.start_pos) AS b_length
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- )
- SELECT *
- FROM overlap_calcs
- WHERE overlap_bp >= 0.5 * a_length
- AND overlap_bp >= 0.5 * b_length
- """)
+.. code-block:: sql
+
+ WITH overlap_calcs AS (
+ SELECT
+ a.*,
+ b.name AS b_name,
+ (LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS overlap_bp,
+ (a.end - a.start) AS a_length,
+ (b.end - b.start) AS b_length
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
+ )
+ SELECT *
+ FROM overlap_calcs
+ WHERE overlap_bp >= 0.5 * a_length
+ AND overlap_bp >= 0.5 * b_length
**Use case:** Find high-confidence overlaps where features mutually cover each other.
@@ -238,13 +214,11 @@ Left Outer Join
Report all features from A, with B information where available:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT a.*, b.name AS overlapping_feature
- FROM features_a a
- LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
- """)
+ SELECT a.*, b.name AS overlapping_feature
+ FROM features_a a
+ LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
**Use case:** Annotate features with overlap information while keeping all records.
@@ -253,16 +227,14 @@ Calculate Overlap Amount
Return the overlap size in base pairs:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- a.*,
- b.name AS b_name,
- (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp
- FROM features_a a, features_b b
- WHERE a.interval INTERSECTS b.interval
- """)
+ SELECT
+ a.*,
+ b.name AS b_name,
+ (LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS overlap_bp
+ FROM features_a a, features_b b
+ WHERE a.interval INTERSECTS b.interval
**Use case:** Quantify the extent of each overlap.
@@ -271,19 +243,17 @@ Overlap with NULL Handling
Report overlap amount for all A features, with 0 for non-overlapping:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT
- a.*,
- b.name AS b_name,
- CASE
- WHEN b.chromosome IS NULL THEN 0
- ELSE LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)
- END AS overlap_bp
- FROM features_a a
- LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
- """)
+ SELECT
+ a.*,
+ b.name AS b_name,
+ CASE
+ WHEN b.chrom IS NULL THEN 0
+ ELSE LEAST(a.end, b.end) - GREATEST(a.start, b.start)
+ END AS overlap_bp
+ FROM features_a a
+ LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
**Use case:** Create a complete overlap report including non-overlapping features.
@@ -295,26 +265,18 @@ Union Multiple Sources
Intersect A with features from multiple B tables:
-.. code-block:: python
-
- # Load and register multiple tables first
- engine.load_csv("features_b1", "file1.bed")
- engine.load_csv("features_b2", "file2.bed")
- engine.load_csv("features_b3", "file3.bed")
- # Register schemas for each...
-
- cursor = engine.execute("""
- WITH all_b_features AS (
- SELECT * FROM features_b1
- UNION ALL
- SELECT * FROM features_b2
- UNION ALL
- SELECT * FROM features_b3
- )
- SELECT DISTINCT a.*
- FROM features_a a
- INNER JOIN all_b_features b ON a.interval INTERSECTS b.interval
- """)
+.. code-block:: sql
+
+ WITH all_b_features AS (
+ SELECT * FROM features_b1
+ UNION ALL
+ SELECT * FROM features_b2
+ UNION ALL
+ SELECT * FROM features_b3
+ )
+ SELECT DISTINCT a.*
+ FROM features_a a
+ INNER JOIN all_b_features b ON a.interval INTERSECTS b.interval
**Use case:** Find features overlapping any region from multiple annotation sources.
@@ -323,20 +285,18 @@ Track Overlap Source
Know which source table each overlap came from:
-.. code-block:: python
-
- cursor = engine.execute("""
- WITH all_b_features AS (
- SELECT *, 'source1' AS source FROM features_b1
- UNION ALL
- SELECT *, 'source2' AS source FROM features_b2
- UNION ALL
- SELECT *, 'source3' AS source FROM features_b3
- )
- SELECT a.*, b.name AS overlap_name, b.source
- FROM features_a a
- INNER JOIN all_b_features b ON a.interval INTERSECTS b.interval
- """)
+.. code-block:: sql
+
+ WITH all_b_features AS (
+ SELECT *, 'source1' AS source FROM features_b1
+ UNION ALL
+ SELECT *, 'source2' AS source FROM features_b2
+ UNION ALL
+ SELECT *, 'source3' AS source FROM features_b3
+ )
+ SELECT a.*, b.name AS overlap_name, b.source
+ FROM features_a a
+ INNER JOIN all_b_features b ON a.interval INTERSECTS b.interval
**Use case:** Track which annotation database each overlap originated from.
@@ -348,16 +308,14 @@ Overlap with Quality Filters
Combine spatial and attribute filters:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT v.*, g.name AS gene_name
- FROM variants v
- INNER JOIN genes g ON v.interval INTERSECTS g.interval
- WHERE v.quality >= 30
- AND g.biotype = 'protein_coding'
- ORDER BY v.chromosome, v.start_pos
- """)
+ SELECT v.*, g.name AS gene_name
+ FROM variants v
+ INNER JOIN genes g ON v.interval INTERSECTS g.interval
+ WHERE v.quality >= 30
+ AND g.biotype = 'protein_coding'
+ ORDER BY v.chrom, v.start
**Use case:** Find high-quality variants in protein-coding genes.
@@ -366,14 +324,12 @@ Specific Target Genes
Find overlaps with a specific set of genes:
-.. code-block:: python
+.. code-block:: sql
- cursor = engine.execute("""
- SELECT v.*, g.name AS gene_name
- FROM variants v
- INNER JOIN genes g ON v.interval INTERSECTS g.interval
- WHERE g.name IN ('BRCA1', 'BRCA2', 'TP53', 'EGFR')
- ORDER BY g.name, v.start_pos
- """)
+ SELECT v.*, g.name AS gene_name
+ FROM variants v
+ INNER JOIN genes g ON v.interval INTERSECTS g.interval
+ WHERE g.name IN ('BRCA1', 'BRCA2', 'TP53', 'EGFR')
+ ORDER BY g.name, v.start
**Use case:** Extract variants in clinically relevant genes.
diff --git a/docs/transpilation/api-reference.rst b/docs/transpilation/api-reference.rst
new file mode 100644
index 0000000..fcba984
--- /dev/null
+++ b/docs/transpilation/api-reference.rst
@@ -0,0 +1,13 @@
+API Reference
+=============
+
+.. currentmodule:: giql
+
+.. autosummary::
+
+ transpile
+ Table
+
+.. autofunction:: transpile
+
+.. autoclass:: Table
diff --git a/docs/transpilation/execution.rst b/docs/transpilation/execution.rst
new file mode 100644
index 0000000..72ea9de
--- /dev/null
+++ b/docs/transpilation/execution.rst
@@ -0,0 +1,152 @@
+Execution
+=========
+
+How to use transpiled SQL
+-------------------------
+
+You can write queries in the GIQL dialect and execute them on any SQL-92
+compliant database or analytics engine, without needing native GIQL support.
+
+With external database connections
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Use transpiled SQL with your own database connections:
+
+.. code-block:: python
+
+ import duckdb
+ from giql import transpile
+
+ sql = transpile(
+ """
+ SELECT * FROM variants
+ WHERE interval INTERSECTS 'chr1:1000-2000'
+ """,
+ tables=["variants"],
+ )
+
+ conn = duckdb.connect("my_database.duckdb")
+ result = conn.execute(sql).fetchall()
+ conn.close()
+
+With ORMs and query builders
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Integrate transpiled SQL with SQLAlchemy or other ORMs:
+
+.. code-block:: python
+
+ from sqlalchemy import create_engine, text
+ from giql import transpile
+
+ sql = transpile(
+ """
+ SELECT * FROM variants
+ WHERE interval INTERSECTS 'chr1:1000-2000'
+ """,
+ tables=["variants"],
+ )
+
+ engine = create_engine("duckdb:///my_database.duckdb")
+ with engine.connect() as conn:
+ result = conn.execute(text(sql))
+ for row in result:
+ print(row)
+
+Building SQL pipelines
+~~~~~~~~~~~~~~~~~~~~~~
+
+Use transpilation in data pipelines:
+
+.. code-block:: python
+
+ from giql import transpile
+
+ def build_intersection_query(table_a, table_b, region):
+ """Generate SQL for intersection query."""
+ return transpile(
+ f"""
+ SELECT a.*, b.name
+ FROM {table_a} a
+ JOIN {table_b} b ON a.interval INTERSECTS b.interval
+ WHERE a.interval INTERSECTS '{region}'
+ """,
+ tables=[table_a, table_b],
+ )
+
+ # Use in pipeline
+ sql = build_intersection_query("variants", "genes", "chr1:1000000-2000000")
+ # Execute sql with your preferred method
+
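+For instance, the generated SQL can be handed to DuckDB (or any other
+DB-API connection), as in the earlier examples:
+
+.. code-block:: python
+
+ import duckdb
+
+ # Execute the SQL produced by the pipeline step above
+ conn = duckdb.connect("my_database.duckdb")
+ rows = conn.execute(sql).fetchall()
+ conn.close()
+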
+Saving queries
+~~~~~~~~~~~~~~
+
+Save transpiled SQL for documentation or reuse:
+
+.. code-block:: python
+
+ import duckdb
+ from giql import transpile
+
+ sql = transpile(
+ """
+ SELECT * FROM variants
+ WHERE interval INTERSECTS 'chr1:1000-2000'
+ """,
+ tables=["variants"],
+ )
+
+ with open("query.sql", "w") as f:
+ f.write(sql)
+
+ # Later, execute saved SQL
+ with open("query.sql") as f:
+ sql = f.read()
+
+ conn = duckdb.connect("database.duckdb")
+ result = conn.execute(sql).fetchall()
+
+Parameterized queries
+~~~~~~~~~~~~~~~~~~~~~
+
+Build queries from runtime values. Values are interpolated into the GIQL
+string before transpilation; they are not database-bound parameters:
+
+.. code-block:: python
+
+ from giql import transpile
+
+ def query_region(chrom, start, end):
+ """Transpile a parameterized region query."""
+ region = f"{chrom}:{start}-{end}"
+ return transpile(
+ f"""
+ SELECT * FROM variants
+ WHERE interval INTERSECTS '{region}'
+ """,
+ tables=["variants"],
+ )
+
+ # Use with different regions
+ sql = query_region("chr1", 1000000, 2000000)
+ sql = query_region("chr2", 5000000, 6000000)
+
+Dynamic query building
+~~~~~~~~~~~~~~~~~~~~~~
+
+Build queries programmatically:
+
+.. code-block:: python
+
+ from giql import transpile
+
+ def build_multi_table_query(tables, target_region):
+ """Build a query that unions results from multiple tables."""
+ union_parts = []
+ for table in tables:
+ union_parts.append(f"""
+ SELECT *, '{table}' AS source FROM {table}
+ WHERE interval INTERSECTS '{target_region}'
+ """)
+
+ query = " UNION ALL ".join(union_parts)
+ return transpile(query, tables=list(tables))
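+
+A quick usage sketch (the table names here are hypothetical):
+
+.. code-block:: python
+
+ sql = build_multi_table_query(
+ ["peaks_rep1", "peaks_rep2"], "chr1:1000000-2000000"
+ )
+ # Execute sql with your preferred method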
diff --git a/docs/transpilation/index.rst b/docs/transpilation/index.rst
new file mode 100644
index 0000000..e5e743b
--- /dev/null
+++ b/docs/transpilation/index.rst
@@ -0,0 +1,210 @@
+Transpilation
+=============
+
+The ``giql`` Python package transpiles GIQL into SQL.
+
+How it works
+------------
+
+When you do this:
+
+.. code-block:: python
+
+ from giql import transpile
+
+ sql = transpile(
+ "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'",
+ tables=["variants"],
+ )
+
+ print(sql)
+
+The transpiler performs three main steps:
+
+1. **Parses** the GIQL query into an abstract syntax tree (AST) to identify GIQL-specific operators
+2. **Transforms** genomic operators into SQL predicates and Common Table Expressions (CTEs), and replaces genomic pseudo-columns with actual column references
+3. **Generates** SQL output from the modified AST
+
+The result is a standard SQL query that can be consumed by an execution engine that is not genome-aware.
+
+.. code-block:: sql
+
+ SELECT * FROM variants
+ WHERE "chrom" = 'chr1' AND "start" < 2000 AND "end" > 1000
+
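+The emitted column names come from the table configuration; passing a
+``Table`` with custom column mappings (hypothetical names shown) changes
+the generated predicates accordingly:
+
+.. code-block:: python
+
+ from giql import Table, transpile
+
+ sql = transpile(
+ "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'",
+ tables=[
+ Table("variants", chrom_col="chr", start_col="pos_start", end_col="pos_end"),
+ ],
+ )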
+
+Examples
+--------
+
+Each GIQL operator expands to specific SQL patterns.
+
+**INTERSECTS** expands to range overlap checks (for half-open intervals, two ranges overlap exactly when each starts before the other ends):
+
+.. tab-set::
+
+ .. tab-item:: GIQL
+
+ .. code-block:: sql
+
+ a.interval INTERSECTS b.interval
+
+ .. tab-item:: SQL
+
+ .. code-block:: sql
+
+ a."chrom" = b."chrom"
+ AND a."start" < b."end"
+ AND a."end" > b."start"
+
+**CONTAINS** expands to containment checks:
+
+.. tab-set::
+
+ .. tab-item:: GIQL
+
+ .. code-block:: sql
+
+ a.interval CONTAINS b.interval
+
+ .. tab-item:: SQL
+
+ .. code-block:: sql
+
+ a."chrom" = b."chrom"
+ AND a."start" <= b."start"
+ AND a."end" >= b."end"
+
+**DISTANCE** expands to gap calculations:
+
+.. tab-set::
+
+ .. tab-item:: GIQL
+
+ .. code-block:: sql
+
+ DISTANCE(a.interval, b.interval)
+
+ .. tab-item:: SQL
+
+ .. code-block:: sql
+
+ CASE
+ WHEN a."chrom" != b."chrom" THEN NULL
+ WHEN a."end" <= b."start" THEN b."start" - a."end"
+ WHEN b."end" <= a."start" THEN a."start" - b."end"
+ ELSE 0
+ END
+
+**Intersection joins** expand to inequality joins:
+
+.. tab-set::
+
+ .. tab-item:: GIQL
+
+ .. code-block:: sql
+
+ SELECT v.*, g.name AS gene_name
+ FROM variants v
+ JOIN genes g ON v.interval INTERSECTS g.interval
+ WHERE v.quality >= 30
+
+ .. tab-item:: SQL
+
+ .. code-block:: sql
+
+ SELECT v.*, g.name AS gene_name
+ FROM variants AS v
+ JOIN genes AS g
+ ON v."chrom" = g."chrom"
+ AND v."start" < g."end"
+ AND v."end" > g."start"
+ WHERE v.quality >= 30
+
+**NEAREST** expands to lateral subqueries:
+
+.. tab-set::
+
+ .. tab-item:: GIQL
+
+ .. code-block:: sql
+
+ SELECT peaks.name, nearest.name, nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(
+ genes, reference=peaks.interval, k=5
+ ) AS nearest
+
+ .. tab-item:: SQL
+
+ .. code-block:: sql
+
+ SELECT peaks.name, nearest.name, nearest.distance
+ FROM peaks
+ CROSS JOIN LATERAL (
+ SELECT
+ genes.*,
+ CASE
+ WHEN peaks."chrom" != genes."chrom" THEN NULL
+ WHEN peaks."start" < genes."end"
+ AND peaks."end" > genes."start" THEN 0
+ WHEN peaks."end" <= genes."start"
+ THEN genes."start" - peaks."end"
+ ELSE peaks."start" - genes."end"
+ END AS distance
+ FROM genes
+ WHERE peaks."chrom" = genes."chrom"
+ ORDER BY ABS(
+ CASE
+ WHEN peaks."chrom" != genes."chrom" THEN NULL
+ WHEN peaks."start" < genes."end"
+ AND peaks."end" > genes."start" THEN 0
+ WHEN peaks."end" <= genes."start"
+ THEN genes."start" - peaks."end"
+ ELSE peaks."start" - genes."end"
+ END
+ )
+ LIMIT 5
+ ) AS nearest
+
+**MERGE** expands to window-function-based clustering:
+
+.. tab-set::
+
+ .. tab-item:: GIQL
+
+ .. code-block:: sql
+
+ SELECT MERGE(interval), COUNT(*) AS count
+ FROM features
+
+ .. tab-item:: SQL
+
+ .. code-block:: sql
+
+ SELECT
+ "chrom",
+ MIN("start") AS start,
+ MAX("end") AS end,
+ COUNT(*) AS count
+ FROM (
+ SELECT
+ *,
+ SUM(is_new_cluster) OVER (
+ PARTITION BY "chrom"
+ ORDER BY "start" NULLS LAST
+ ) AS __giql_cluster_id
+ FROM (
+ SELECT
+ *,
+ CASE
+ WHEN LAG("end") OVER (
+ PARTITION BY "chrom"
+ ORDER BY "start" NULLS LAST
+ ) >= "start" THEN 0
+ ELSE 1
+ END AS is_new_cluster
+ FROM features
+ ) AS lag_calc
+ ) AS clustered
+ GROUP BY "chrom", __giql_cluster_id
+ ORDER BY "chrom" NULLS LAST, "start" NULLS LAST
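+
+Any of these expansions can be reproduced by passing the GIQL snippet to
+``transpile`` with the referenced tables registered, e.g. for MERGE:
+
+.. code-block:: python
+
+ from giql import transpile
+
+ sql = transpile(
+ "SELECT MERGE(interval), COUNT(*) AS count FROM features",
+ tables=["features"],
+ )
+ print(sql)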
diff --git a/pyproject.toml b/pyproject.toml
index 25874b1..59d41a5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,13 +15,7 @@ authors = [
{ name = "Conrad Bzura", email = "conradbzura@gmail.com" },
]
dependencies = [
- "click>=8.3.0",
- "duckdb>=1.4.0",
- "oxbow>=0.4.0",
- "pandas>=2.0.0",
- "psycopg2-binary>=2.9.10",
"sqlglot>=20.0.0",
- "sqlparse>=0.4.0",
]
description = "Genomic Interval Query Language - SQL dialect for genomic range queries"
dynamic = ["version"]
@@ -33,20 +27,22 @@ name = "giql"
readme = "README.md"
requires-python = ">=3.11"
-[project.scripts]
-giql = "giql.cli:cli"
-
-[project.optional-dependencies]
-all = [
- "duckdb>=0.9.0",
- "mysql-connector-python>=8.0.0",
- "psycopg2-binary>=2.9.0",
+[dependency-groups]
+dev = [
+ "duckdb>=1.4.0",
+ "hypothesis>=6.0.0",
+ "pandas>=2.0.0",
+ "pybedtools>=0.9.0",
+ "pytest-cov>=4.0.0",
+ "pytest>=7.0.0",
+ "ruff>=0.1.0",
+]
+docs = [
+ "sphinx>=7.0",
+ "sphinx-autobuild>=2024.0",
+ "sphinx-book-theme>=1.1",
+ "sphinx-design>=0.6",
]
-dev = ["pytest-cov>=4.0.0", "pytest>=7.0.0", "ruff>=0.1.0", "hypothesis", "pybedtools"]
-duckdb = ["duckdb>=0.9.0"]
-mysql = ["mysql-connector-python>=8.0.0"]
-postgres = ["psycopg2-binary>=2.9.0"]
-sqlite = []
[tool.hatch.metadata.hooks.custom]
path = "build-hooks/metadata.py"
@@ -79,13 +75,8 @@ bedtools = ">=2.31.0"
pybedtools = ">=0.9.0"
pytest = ">=7.0.0"
pytest-cov = ">=4.0.0"
-click = ">=8.3.0"
duckdb = ">=1.4.0"
pandas = ">=2.0.0"
-pyarrow = ">=19.0.0"
-psycopg2-binary = ">=2.9.10"
sqlglot = ">=20.0.0"
pip = "*"
-oxbow = ">=0.4.0"
-sqlparse = ">=0.4.0"
hypothesis = ">=6.148.2,<7"
diff --git a/src/giql/__init__.py b/src/giql/__init__.py
index e840f17..71e895d 100644
--- a/src/giql/__init__.py
+++ b/src/giql/__init__.py
@@ -1,19 +1,15 @@
"""GIQL - Genomic Interval Query Language.
-A SQL dialect for genomic range queries with multi-database support.
-
-This package provides:
- - GIQL dialect extending SQL with spatial operators
- - Query engine supporting multiple backends (DuckDB, SQLite)
- - Range parser for genomic coordinate strings
- - Schema management for genomic data
+A SQL dialect for genomic range queries.
"""
-from giql.engine import GIQLEngine as GIQLEngine
+from giql.table import Table
+from giql.transpile import transpile
__version__ = "0.1.0"
__all__ = [
- "GIQLEngine",
+ "Table",
+ "transpile",
]
diff --git a/src/giql/cli.py b/src/giql/cli.py
deleted file mode 100644
index d714075..0000000
--- a/src/giql/cli.py
+++ /dev/null
@@ -1,683 +0,0 @@
-"""Command-line interface for GIQL.
-
-This module provides a CLI that mirrors bedtools intersect functionality
-using GIQL's genomic query capabilities.
-"""
-
-import sys
-from pathlib import Path
-
-import click
-import duckdb
-from oxbow import from_bam
-from oxbow import from_bed
-from oxbow import from_gff
-from oxbow import from_gtf
-from oxbow import from_vcf
-
-from giql import GIQLEngine
-
-
-@click.group()
-@click.version_option()
-def cli():
- """GIQL - Genomic Interval Query Language.
-
- SQL-based toolkit for genomic range queries.
- """
- pass
-
-
-def _detect_file_format(file_path: Path) -> str:
- """Detect genomic file format from file extension.
-
- :param file_path:
- Path to the file
- :return:
- Format identifier: 'bed', 'bam', 'vcf', 'gff', 'gtf'
- :raises click.ClickException:
- If format cannot be determined
- """
- # Handle compressed files
- suffixes = file_path.suffixes
- if suffixes[-1] == ".gz":
- # Remove .gz and check the actual format
- ext = suffixes[-2] if len(suffixes) >= 2 else ""
- else:
- ext = file_path.suffix
-
- ext = ext.lower()
-
- format_map = {
- ".bed": "bed",
- ".bam": "bam",
- ".vcf": "vcf",
- ".gff": "gff",
- ".gff3": "gff",
- ".gtf": "gtf",
- }
-
- if ext in format_map:
- return format_map[ext]
-
- raise click.ClickException(
- f"Unsupported file format: {ext}. Supported formats: BED, BAM, VCF, GFF, GTF"
- )
-
-
-def _load_genomic_file(
- conn: duckdb.DuckDBPyConnection, file_path: Path, table_name: str
-) -> dict[str, str]:
- """Load genomic file using appropriate oxbow function.
-
- :param conn:
- DuckDB connection
- :param file_path:
- Path to genomic file
- :param table_name:
- Name for the table to create
- :return:
- Dictionary mapping column names to types
- :raises click.ClickException:
- If file cannot be loaded
- """
- fmt = _detect_file_format(file_path)
- compression = "gzip" if file_path.suffix == ".gz" else None
-
- try:
- match fmt:
- case "bed":
- df = from_bed(str(file_path), compression=compression).to_duckdb(conn)
- case "bam":
- df = from_bam(str(file_path)).to_duckdb(conn)
- case "vcf":
- df = from_vcf(str(file_path), compression=compression).to_duckdb(conn)
- case "gff":
- df = from_gff(str(file_path), compression=compression).to_duckdb(conn)
- case "gtf":
- df = from_gtf(str(file_path), compression=compression).to_duckdb(conn)
- case _:
- raise click.ClickException(f"Unsupported format: {fmt}")
-
- conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df")
-
- # Get column information
- col_info = conn.execute(f"DESCRIBE {table_name}").fetchall()
- return {col[0]: col[1] for col in col_info}
-
- except Exception as e:
- raise click.ClickException(f"Failed to load {file_path}: {e}")
-
-
-def _expand_rest_columns(df):
- """Expand 'rest' columns from BED files into separate columns.
-
- BED files store extra fields beyond chrom/start/end in a 'rest' column
- as a tab-delimited string. This function expands those into separate columns
- to match bedtools output format.
-
- :param df:
- DataFrame with potential 'rest' columns
- :return:
- DataFrame with rest columns expanded
- """
- import pandas as pd
-
- # pandas.read_sql can return duplicate column names when joining
- # Find all 'rest' column positions
- rest_indices = [i for i, col in enumerate(df.columns) if col == "rest"]
-
- if not rest_indices:
- return df
-
- # Build new dataframe with expanded columns
- # We need to handle duplicate column names, so we can't use a dict
- new_data = {}
- new_col_names = []
-
- for i, col in enumerate(df.columns):
- if col == "rest" and i in rest_indices:
- # Expand this rest column
- col_data = df.iloc[:, i]
- expanded = col_data.fillna("").astype(str).str.split("\t", expand=True)
-
- # Add all expanded columns with unique names
- for j in range(expanded.shape[1]):
- col_name = f"field_{j + 4}"
- # Make unique if duplicate
- base_name = col_name
- counter = 0
- while col_name in new_col_names:
- counter += 1
- col_name = f"{base_name}_{counter}"
- new_col_names.append(col_name)
- new_data[col_name] = expanded[j]
- else:
- # Keep non-rest columns as-is
- # Make unique names for duplicates
- col_name = col
- base_name = col_name
- counter = 0
- while col_name in new_col_names:
- counter += 1
- col_name = f"{base_name}_{counter}"
- new_col_names.append(col_name)
- new_data[col_name] = df.iloc[:, i]
-
- # Rebuild dataframe with explicit column order
- result = pd.DataFrame(new_data, columns=new_col_names)
- return result
-
-
-def _detect_genomic_columns(columns: dict[str, str]) -> dict[str, str | None]:
- """Detect genomic coordinate columns from available columns.
-
- :param columns:
- Dictionary of column name -> type
- :return:
- Dictionary with keys: chrom_col, start_col, end_col, strand_col
- """
- col_names = {c.lower(): c for c in columns.keys()}
-
- # Chromosome column patterns (in priority order)
- chrom_col = None
- for pattern in ["chrom", "seqid", "chr", "chromosome", "contig", "seqname"]:
- if pattern in col_names:
- chrom_col = col_names[pattern]
- break
-
- # Start column patterns
- start_col = None
- for pattern in [
- "start",
- "chromstart",
- "pos",
- "begin",
- "txstart",
- "cdsstart",
- "thickstart",
- ]:
- if pattern in col_names:
- start_col = col_names[pattern]
- break
-
- # End column patterns
- end_col = None
- for pattern in [
- "end",
- "chromend",
- "stop",
- "txend",
- "cdsend",
- "thickend",
- ]:
- if pattern in col_names:
- end_col = col_names[pattern]
- break
-
- # Strand column patterns
- strand_col = None
- for pattern in ["strand", "str", "orientation"]:
- if pattern in col_names:
- strand_col = col_names[pattern]
- break
-
- return {
- "chrom_col": chrom_col,
- "start_col": start_col,
- "end_col": end_col,
- "strand_col": strand_col,
- }
-
-
-@cli.command()
-@click.option(
- "-a",
- "--file-a",
- required=True,
- type=click.Path(exists=True),
- help="BAM/BED/GFF/VCF file 'A'. Each feature in A is compared to B.",
-)
-@click.option(
- "-b",
- "--file-b",
- required=True,
- multiple=True,
- type=click.Path(exists=True),
- help="One or more BAM/BED/GFF/VCF files for comparison.",
-)
-@click.option(
- "-wa",
- "--write-a",
- is_flag=True,
- help="Write the original entry in A for each overlap.",
-)
-@click.option(
- "-wb",
- "--write-b",
- is_flag=True,
- help="Write the original entry in B for each overlap.",
-)
-@click.option(
- "-loj",
- "--left-outer-join",
- is_flag=True,
- help="Perform left outer join. Report all A features with NULL B when no overlap.",
-)
-@click.option(
- "-wo",
- "--write-overlap",
- is_flag=True,
- help="Write the number of overlapping base pairs between features.",
-)
-@click.option(
- "-wao",
- "--write-all-overlap",
- is_flag=True,
- help="Like -wo but includes A features with zero overlap.",
-)
-@click.option(
- "-u",
- "--unique",
- is_flag=True,
- help="Report each A feature only once if any overlap exists in B.",
-)
-@click.option(
- "-c",
- "--count",
- is_flag=True,
- help="For each entry in A, report the number of overlaps in B.",
-)
-@click.option(
- "-v",
- "--invert",
- is_flag=True,
- help="Only report entries in A that have no overlap in B.",
-)
-@click.option(
- "-f",
- "--fraction-a",
- type=float,
- help="Minimum overlap as fraction of A.",
-)
-@click.option(
- "-F",
- "--fraction-b",
- type=float,
- help="Minimum overlap as fraction of B.",
-)
-@click.option(
- "-r",
- "--reciprocal",
- is_flag=True,
- help="Require reciprocal overlap fraction for both A and B.",
-)
-@click.option(
- "-e",
- "--either",
- is_flag=True,
- help="Require that -f OR -F be satisfied (not both).",
-)
-@click.option(
- "-s",
- "--same-strand",
- is_flag=True,
- help="Require same strand for overlaps.",
-)
-@click.option(
- "-S",
- "--opposite-strand",
- is_flag=True,
- help="Require opposite strand for overlaps.",
-)
-@click.option(
- "--header",
- is_flag=True,
- help="Print the header from A before results.",
-)
-@click.option(
- "--names",
- multiple=True,
- help="Aliases for B files (instead of file numbers).",
-)
-@click.option(
- "-sorted",
- "--sorted-input",
- is_flag=True,
- help="For compatibility with bedtools (currently ignored).",
-)
-@click.option(
- "--chunksize",
- type=int,
- help="Process results in chunks of N rows (streaming mode for large datasets).",
-)
-def intersect(
- file_a,
- file_b,
- write_a,
- write_b,
- left_outer_join,
- write_overlap,
- write_all_overlap,
- unique,
- count,
- invert,
- fraction_a,
- fraction_b,
- reciprocal,
- either,
- same_strand,
- opposite_strand,
- header,
- names,
- sorted_input,
- chunksize,
-):
- """Find overlaps between genomic features.
-
- Similar to bedtools intersect, this command finds overlapping intervals
- between files A and B using GIQL's spatial operators.
-
- Supports BED, BAM, VCF, GFF, and GTF formats (gzip compressed or uncompressed).
- """
- # Validate conflicting options
- if same_strand and opposite_strand:
- raise click.UsageError("Cannot use -s and -S together")
-
- output_modes = [
- write_a,
- write_b,
- left_outer_join,
- write_overlap,
- write_all_overlap,
- unique,
- count,
- invert,
- ]
- if sum(output_modes) > 1:
- raise click.UsageError("Can only specify one output mode")
-
- # Create DuckDB connection
- conn = duckdb.connect()
-
- # Initialize engine with existing connection
- engine = GIQLEngine(target_dialect="duckdb", connection=conn)
-
- try:
- # Load file A
- file_a_path = Path(file_a)
- table_a = "file_a"
- columns_a = _load_genomic_file(conn, file_a_path, table_a)
-
- # Detect genomic columns
- genomic_cols_a = _detect_genomic_columns(columns_a)
-
- if not all(
- [
- genomic_cols_a["chrom_col"],
- genomic_cols_a["start_col"],
- genomic_cols_a["end_col"],
- ]
- ):
- raise click.ClickException(
- f"Could not detect genomic columns in {file_a}. "
- f"Found columns: {list(columns_a.keys())}"
- )
-
- # Register schema for file A
- engine.register_table_schema(
- table_a,
- columns_a,
- genomic_column="interval",
- chrom_col=genomic_cols_a["chrom_col"],
- start_col=genomic_cols_a["start_col"],
- end_col=genomic_cols_a["end_col"],
- strand_col=genomic_cols_a["strand_col"],
- )
-
- # Process file(s) B
- results = []
- for idx, b_file in enumerate(file_b):
- b_path = Path(b_file)
- table_b = f"file_b_{idx}"
-
- # Load file B
- columns_b = _load_genomic_file(conn, b_path, table_b)
-
- # Detect genomic columns in B
- genomic_cols_b = _detect_genomic_columns(columns_b)
-
- if not all(
- [
- genomic_cols_b["chrom_col"],
- genomic_cols_b["start_col"],
- genomic_cols_b["end_col"],
- ]
- ):
- raise click.ClickException(
- f"Could not detect genomic columns in {b_file}"
- )
-
- # Register schema for file B
- engine.register_table_schema(
- table_b,
- columns_b,
- genomic_column="region",
- chrom_col=genomic_cols_b["chrom_col"],
- start_col=genomic_cols_b["start_col"],
- end_col=genomic_cols_b["end_col"],
- strand_col=genomic_cols_b["strand_col"],
- )
-
- # Build query based on options
- query = _build_intersect_query(
- table_a=table_a,
- table_b=table_b,
- chrom_a=genomic_cols_a["chrom_col"],
- start_a=genomic_cols_a["start_col"],
- end_a=genomic_cols_a["end_col"],
- strand_a=genomic_cols_a["strand_col"],
- chrom_b=genomic_cols_b["chrom_col"],
- start_b=genomic_cols_b["start_col"],
- end_b=genomic_cols_b["end_col"],
- strand_b=genomic_cols_b["strand_col"],
- write_a=write_a,
- write_b=write_b,
- left_outer_join=left_outer_join,
- write_overlap=write_overlap,
- write_all_overlap=write_all_overlap,
- unique=unique,
- count=count,
- invert=invert,
- same_strand=same_strand,
- opposite_strand=opposite_strand,
- fraction_a=fraction_a,
- fraction_b=fraction_b,
- reciprocal=reciprocal,
- either=either,
- )
-
- # Execute query and get cursor
- cursor = engine.execute(query)
-
- # Get column names
- col_names = [desc[0] for desc in cursor.description]
-
- # Output header if requested (only once, before first row)
- if header and idx == 0:
- print("\t".join(col_names))
-
- # Stream results row by row
- while True:
- row = cursor.fetchone()
- if row is None:
- break
- # Expand rest columns inline
- output_fields = []
- for i, value in enumerate(row):
- col_name = col_names[i]
- if col_name == "rest" and value:
- # Expand rest column - split on tabs
- rest_fields = str(value).split("\t")
- output_fields.extend(rest_fields)
- else:
- output_fields.append(str(value) if value is not None else "")
-
- # Add file identifier if needed
- if names and idx < len(names):
- output_fields.append(names[idx])
- elif len(file_b) > 1:
- output_fields.append(b_path.name)
-
- # Output row as TSV
- print("\t".join(output_fields))
-
- finally:
- engine.close()
-
-
-def _build_intersect_query(
- table_a: str,
- table_b: str,
- chrom_a: str,
- start_a: str,
- end_a: str,
- strand_a: str | None,
- chrom_b: str,
- start_b: str,
- end_b: str,
- strand_b: str | None,
- write_a: bool = False,
- write_b: bool = False,
- left_outer_join: bool = False,
- write_overlap: bool = False,
- write_all_overlap: bool = False,
- unique: bool = False,
- count: bool = False,
- invert: bool = False,
- same_strand: bool = False,
- opposite_strand: bool = False,
- fraction_a: float | None = None,
- fraction_b: float | None = None,
- reciprocal: bool = False,
- either: bool = False,
-) -> str:
- """Build GIQL query based on intersect options."""
-
- # Build strand filter if needed
- strand_filter = ""
- if same_strand and strand_a and strand_b:
- strand_filter = f' AND a."{strand_a}" = b."{strand_b}"'
- elif opposite_strand and strand_a and strand_b:
- strand_filter = f' AND a."{strand_a}" != b."{strand_b}"'
-
- # Build fraction filter if needed
- fraction_filter = ""
- if fraction_a or fraction_b:
- filters = []
-
- if fraction_a:
- # Overlap must be at least fraction_a of A's length
- overlap_expr = (
- f'LEAST(a."{end_a}", b."{end_b}") - '
- f'GREATEST(a."{start_a}", b."{start_b}")'
- )
- a_length = f'(a."{end_a}" - a."{start_a}")'
- filters.append(f"({overlap_expr}::FLOAT / {a_length} >= {fraction_a})")
-
- if fraction_b:
- # Overlap must be at least fraction_b of B's length
- overlap_expr = (
- f'LEAST(a."{end_a}", b."{end_b}") - '
- f'GREATEST(a."{start_a}", b."{start_b}")'
- )
- b_length = f'(b."{end_b}" - b."{start_b}")'
- filters.append(f"({overlap_expr}::FLOAT / {b_length} >= {fraction_b})")
-
- # Combine filters based on reciprocal/either flags
- if reciprocal and len(filters) == 2:
- # Both must be satisfied (AND)
- fraction_filter = f" AND ({filters[0]} AND {filters[1]})"
- elif either and len(filters) == 2:
- # Either must be satisfied (OR)
- fraction_filter = f" AND ({filters[0]} OR {filters[1]})"
- elif filters:
- # Just one filter or default behavior
- fraction_filter = f" AND {' AND '.join(filters)}"
-
- if invert:
- # Only features in A with no overlap in B
- where_clause = f"a.interval INTERSECTS b.region{strand_filter}{fraction_filter}"
- return f"""
- SELECT a.*
- FROM {table_a} a
- WHERE NOT EXISTS (
- SELECT 1 FROM {table_b} b
- WHERE {where_clause}
- )
- """
-
- if count:
- # Count overlaps
- # Get all columns from table A for GROUP BY
- on_clause = f"a.interval INTERSECTS b.region{strand_filter}{fraction_filter}"
- return f"""
- SELECT a.*, COUNT(b.\"{chrom_b}\") as overlap_count
- FROM {table_a} a
- LEFT JOIN {table_b} b ON {on_clause}
- GROUP BY ALL
- """
-
- if unique:
- # Report each A feature only once if overlaps exist
- on_clause = f"a.interval INTERSECTS b.region{strand_filter}{fraction_filter}"
- return f"""
- SELECT DISTINCT a.*
- FROM {table_a} a
- JOIN {table_b} b ON {on_clause}
- """
-
- if left_outer_join or write_all_overlap:
- # Left outer join
- join_type = "LEFT JOIN"
- else:
- join_type = "JOIN"
-
- # Build select clause
- if write_a and not write_b:
- select_clause = "a.*"
- elif write_b and not write_a:
- select_clause = "b.*"
- else:
- # Default: write both A and B
- select_clause = "a.*, b.*"
-
- # Add overlap calculation if requested
- if write_overlap or write_all_overlap:
- # Calculate overlap size: min(end_a, end_b) - max(start_a, start_b)
- overlap_expr = f"""
- CASE
- WHEN b.\"{chrom_b}\" IS NULL THEN 0
- ELSE GREATEST(0,
- LEAST(a.\"{end_a}\", b.\"{end_b}\") -
- GREATEST(a.\"{start_a}\", b.\"{start_b}\")
- )
- END as overlap_bp
- """
- select_clause = f"{select_clause}, {overlap_expr}"
-
- # Build ON clause
- on_clause = f"a.interval INTERSECTS b.region{strand_filter}{fraction_filter}"
-
- # Build base query
- query = f"""
- SELECT {select_clause}
- FROM {table_a} a
- {join_type} {table_b} b ON {on_clause}
- """
-
- return query
-
-
-if __name__ == "__main__":
- cli()
diff --git a/src/giql/constants.py b/src/giql/constants.py
index daa5896..87f8055 100644
--- a/src/giql/constants.py
+++ b/src/giql/constants.py
@@ -4,8 +4,8 @@
"""
# Default genomic column names
-DEFAULT_CHROM_COL = "chromosome"
-DEFAULT_START_COL = "start_pos"
-DEFAULT_END_COL = "end_pos"
+DEFAULT_CHROM_COL = "chrom"
+DEFAULT_START_COL = "start"
+DEFAULT_END_COL = "end"
DEFAULT_STRAND_COL = "strand"
DEFAULT_GENOMIC_COL = "interval"
diff --git a/src/giql/engine.py b/src/giql/engine.py
deleted file mode 100644
index b1c5087..0000000
--- a/src/giql/engine.py
+++ /dev/null
@@ -1,371 +0,0 @@
-"""Multi-backend query engine for GIQL.
-
-This module provides the main query engine that supports multiple SQL databases
-through transpilation of GIQL syntax to standard SQL.
-"""
-
-from typing import Literal
-
-import pandas as pd
-from sqlglot import parse_one
-
-from giql.constants import DEFAULT_CHROM_COL
-from giql.constants import DEFAULT_END_COL
-from giql.constants import DEFAULT_GENOMIC_COL
-from giql.constants import DEFAULT_START_COL
-from giql.constants import DEFAULT_STRAND_COL
-from giql.dialect import GIQLDialect
-from giql.generators import BaseGIQLGenerator
-from giql.generators import GIQLDuckDBGenerator
-from giql.protocols import CursorLike
-from giql.range_parser import CoordinateSystem
-from giql.range_parser import IntervalType
-from giql.schema import ColumnInfo
-from giql.schema import SchemaInfo
-from giql.schema import TableSchema
-from giql.transformer import ClusterTransformer
-from giql.transformer import MergeTransformer
-
-DialectType = Literal["duckdb", "sqlite"]
-
-
-class GIQLEngine:
- """Multi-backend GIQL query engine.
-
- Supports multiple SQL databases through transpilation of GIQL syntax
- to standard SQL. Can work with DuckDB, SQLite, and other backends.
-
- Examples
- --------
- Query a pandas DataFrame with DuckDB::
-
- import pandas as pd
- from giql import GIQLEngine
-
- df = pd.DataFrame(
- {
- "id": [1, 2, 3],
- "chromosome": ["chr1", "chr1", "chr2"],
- "start_pos": [1500, 10500, 500],
- "end_pos": [1600, 10600, 600],
- }
- )
- with GIQLEngine(target_dialect="duckdb") as engine:
- engine.conn.register("variants", df)
- cursor = engine.execute(
- "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'"
- )
- for row in cursor:
- print(row)
-
- Load from CSV::
-
- with GIQLEngine(target_dialect="duckdb") as engine:
- engine.load_csv("variants", "variants.csv")
- cursor = engine.execute(
- "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'"
- )
- # Process rows lazily
- while True:
- row = cursor.fetchone()
- if row is None:
- break
- print(row)
-
- Using SQLite backend::
-
- with GIQLEngine(target_dialect="sqlite", db_path="data.db") as engine:
- cursor = engine.execute(
- "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'"
- )
- # Materialize all results at once
- results = cursor.fetchall()
- """
-
- def __init__(
- self,
- target_dialect: DialectType | str = "duckdb",
- connection=None,
- db_path: str = ":memory:",
- verbose: bool = False,
- **dialect_options,
- ):
- """Initialize engine.
-
- :param target_dialect:
- Target SQL dialect ('duckdb', 'sqlite', 'standard')
- :param connection:
- Existing database connection (optional)
- :param db_path:
- Database path or connection string
- :param verbose:
- Print transpiled SQL
- :param dialect_options:
- Additional options for specific dialects
- """
- self.target_dialect = target_dialect
- self.verbose = verbose
- self.schema_info = SchemaInfo()
- self.dialect_options = dialect_options
-
- # Initialize connection
- if connection:
- self.conn = connection
- self.owns_connection = False
- else:
- self.conn = self._create_connection(db_path)
- self.owns_connection = True
-
- # Get appropriate generator
- self.generator = self._get_generator()
-
- # Initialize query transformers
- self.cluster_transformer = ClusterTransformer(self.schema_info)
- self.merge_transformer = MergeTransformer(self.schema_info)
-
- def _create_connection(self, db_path: str):
- """Create database connection based on target dialect.
-
- :param db_path:
- Path to database file or connection string
- :return:
- Connection object for the specified database backend
- :raises ImportError:
- If the required database driver is not installed
- :raises ValueError:
- If the dialect is not supported
- """
- if self.target_dialect == "duckdb":
- try:
- import duckdb
-
- return duckdb.connect(db_path)
- except ImportError:
- raise ImportError("DuckDB not installed.")
-
- elif self.target_dialect == "sqlite":
- import sqlite3
-
- return sqlite3.connect(db_path)
-
- else:
- raise ValueError(
- f"Unsupported dialect: {self.target_dialect}. Supported: duckdb, sqlite"
- )
-
- def _get_generator(self):
- """Get generator for target dialect.
-
- :return:
- SQL generator instance configured for the target dialect
- """
- generators = {
- "duckdb": GIQLDuckDBGenerator,
- "sqlite": BaseGIQLGenerator,
- "standard": BaseGIQLGenerator,
- }
-
- generator_class = generators.get(self.target_dialect, BaseGIQLGenerator)
- return generator_class(schema_info=self.schema_info, **self.dialect_options)
-
- def register_table_schema(
- self,
- table_name: str,
- columns: dict[str, str],
- genomic_column: str = DEFAULT_GENOMIC_COL,
- chrom_col: str = DEFAULT_CHROM_COL,
- start_col: str = DEFAULT_START_COL,
- end_col: str = DEFAULT_END_COL,
- strand_col: str | None = DEFAULT_STRAND_COL,
- coordinate_system: str = "0based",
- interval_type: str = "half_open",
- ):
- """Register schema for a table.
-
- This method tells the engine how genomic ranges are stored in the table,
- mapping logical genomic column names to physical column names.
-
- :param table_name:
- Table name
- :param columns:
- Dict of column_name -> type
- :param genomic_column:
- Logical name for genomic position
- :param chrom_col:
- Physical chromosome column
- :param start_col:
- Physical start position column
- :param end_col:
- Physical end position column
- :param strand_col:
- Physical strand column (optional)
- :param coordinate_system:
- Coordinate system: "0based" or "1based" (default: "0based")
- :param interval_type:
- Interval endpoint handling: "half_open" or "closed" (default: "half_open")
- """
- # Convert string parameters to enums
- coord_sys = (
- CoordinateSystem.ONE_BASED
- if coordinate_system == "1based"
- else CoordinateSystem.ZERO_BASED
- )
- int_type = (
- IntervalType.CLOSED if interval_type == "closed" else IntervalType.HALF_OPEN
- )
-
- column_infos = {}
-
- for col_name, col_type in columns.items():
- column_infos[col_name] = ColumnInfo(
- name=col_name, type=col_type, is_genomic=False
- )
-
- # Add virtual genomic column with mappings to physical columns
- column_infos[genomic_column] = ColumnInfo(
- name=genomic_column,
- type="GENOMIC_RANGE", # Virtual type
- is_genomic=True,
- chrom_col=chrom_col,
- start_col=start_col,
- end_col=end_col,
- strand_col=strand_col,
- coordinate_system=coord_sys,
- interval_type=int_type,
- )
-
- table_schema = TableSchema(table_name, column_infos)
- self.schema_info.register_table(table_name, table_schema)
-
- def load_csv(self, table_name: str, file_path: str):
- """Load CSV file into database.
-
- :param table_name:
- Name to assign to the table
- :param file_path:
- Path to the CSV file
- """
- if self.target_dialect == "duckdb":
- self.conn.execute(
- f"CREATE TABLE {table_name} "
- f"AS SELECT * FROM read_csv_auto('{file_path}')"
- )
- elif self.target_dialect == "sqlite":
- # Use pandas for SQLite
- df = pd.read_csv(file_path)
- df.to_sql(table_name, self.conn, if_exists="replace", index=False)
-
- if self.verbose:
- print(f"Loaded {table_name} from {file_path}")
-
- def load_parquet(self, table_name: str, file_path: str):
- """Load Parquet file into database.
-
- :param table_name:
- Name to assign to the table
- :param file_path:
- Path to the Parquet file
- """
- if self.target_dialect == "duckdb":
- self.conn.execute(
- f"CREATE TABLE {table_name} AS SELECT * FROM read_parquet('{file_path}')"
- )
- else:
- df = pd.read_parquet(file_path)
- df.to_sql(table_name, self.conn, if_exists="replace", index=False)
-
- if self.verbose:
- print(f"Loaded {table_name} from {file_path}")
-
- def transpile(self, giql: str) -> str:
- """Transpile a GIQL query to the engine's target SQL dialect.
-
- Parses the GIQL syntax and transpiles it to the target SQL dialect
- without executing it. Useful for debugging or generating SQL for
- external use.
-
- :param giql:
- Query string with GIQL genomic extensions
- :return:
- Transpiled SQL query string in the target dialect
- :raises ValueError:
- If the query cannot be parsed or transpiled
- """
- # Parse with GIQL dialect
- try:
- ast = parse_one(giql, dialect=GIQLDialect)
- except Exception as e:
- raise ValueError(f"Parse error: {e}\nQuery: {giql}")
-
- # Transform query (MERGE first, then CLUSTER)
- try:
- # Apply MERGE transformation (which may internally use CLUSTER)
- ast = self.merge_transformer.transform(ast)
- # Apply CLUSTER transformation for any standalone CLUSTER expressions
- ast = self.cluster_transformer.transform(ast)
- except Exception as e:
- raise ValueError(f"Transformation error: {e}")
-
- # Transpile to target dialect
- try:
- target_sql = self.generator.generate(ast)
- except Exception as e:
- raise ValueError(f"Transpilation error: {e}")
-
- if self.verbose:
- print(f"\n{'=' * 60}")
- print(f"Target Dialect: {self.target_dialect}")
- print("\nOriginal GIQL:")
- print(giql)
- print("\nTranspiled SQL:")
- print(target_sql)
- print(f"{'=' * 60}\n")
-
- return target_sql
-
- def execute(self, giql: str) -> CursorLike:
- """Execute a GIQL query and return a database cursor.
-
- Parses the GIQL syntax, transpiles to target SQL dialect,
- and executes the query returning a cursor for lazy iteration.
-
- :param giql:
- Query string with GIQL genomic extensions
- :return:
- Database cursor (DB-API 2.0 compatible) that can be iterated
- :raises ValueError:
- If the query cannot be parsed, transpiled, or executed
- """
- # Transpile GIQL to target SQL
- target_sql = self.transpile(giql)
-
- # Execute and return cursor
- try:
- return self.conn.execute(target_sql)
- except Exception as e:
- raise ValueError(f"Execution error: {e}\nSQL: {target_sql}")
-
- def execute_raw(self, sql: str) -> pd.DataFrame:
- """Execute raw SQL directly, bypassing GIQL parsing.
-
- :param sql:
- Raw SQL query string
- :return:
- Query results as a pandas DataFrame
- """
- return pd.read_sql(sql, self.conn)
-
- def close(self):
- """Close database connection.
-
- Only closes connections created by the engine. If an external
- connection was provided during initialization, it is not closed.
- """
- if self.owns_connection and self.conn:
- self.conn.close()
-
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_val, exc_tb): # noqa: ANN001
- self.close()
diff --git a/src/giql/generators/__init__.py b/src/giql/generators/__init__.py
index b04bd93..ca8cb16 100644
--- a/src/giql/generators/__init__.py
+++ b/src/giql/generators/__init__.py
@@ -1,9 +1,5 @@
-"""
-SQL generators for different database dialects.
-"""
+"""SQL generators for GIQL transpilation."""
from giql.generators.base import BaseGIQLGenerator
-from giql.generators.duckdb import GIQLDuckDBGenerator
-from giql.generators.sqlite import GIQLSQLiteGenerator
-__all__ = ["BaseGIQLGenerator", "GIQLDuckDBGenerator", "GIQLSQLiteGenerator"]
+__all__ = ["BaseGIQLGenerator"]
diff --git a/src/giql/generators/base.py b/src/giql/generators/base.py
index 2313821..316cd00 100644
--- a/src/giql/generators/base.py
+++ b/src/giql/generators/base.py
@@ -15,7 +15,7 @@
from giql.expressions import Within
from giql.range_parser import ParsedRange
from giql.range_parser import RangeParser
-from giql.schema import SchemaInfo
+from giql.table import Tables
class BaseGIQLGenerator(Generator):
@@ -48,9 +48,9 @@ def _extract_bool_param(param_expr: Optional[exp.Expression]) -> bool:
else:
return str(param_expr).upper() in ("TRUE", "1", "YES")
- def __init__(self, schema_info: Optional[SchemaInfo] = None, **kwargs):
+ def __init__(self, tables: Optional[Tables] = None, **kwargs):
super().__init__(**kwargs)
- self.schema_info = schema_info or SchemaInfo()
+ self.tables = tables or Tables()
self._current_table = None # Track current table for column resolution
self._alias_to_table = {} # Map aliases to table names
@@ -187,40 +187,26 @@ def giqlnearest_sql(self, expression: GIQLNearest) -> str:
else:
# Implicit reference in correlated mode - get strand from outer table
outer_table = self._find_outer_table_in_lateral_join(expression)
- if outer_table and self.schema_info:
+ if outer_table and self.tables:
actual_table = self._alias_to_table.get(outer_table, outer_table)
- table_schema = self.schema_info.get_table(actual_table)
- if table_schema:
- for col_info in table_schema.columns.values():
- if col_info.is_genomic and col_info.strand_col:
- ref_strand = f'{outer_table}."{col_info.strand_col}"'
- break
+ table = self.tables.get(actual_table)
+ if table and table.strand_col:
+ ref_strand = f'{outer_table}."{table.strand_col}"'
# Get strand column for target table
- target_table_info = (
- self.schema_info.get_table(table_name) if self.schema_info else None
- )
- if target_table_info:
- for col_info in target_table_info.columns.values():
- if col_info.is_genomic and col_info.strand_col:
- target_strand = f'{table_name}."{col_info.strand_col}"'
- break
+ target_table = self.tables.get(table_name) if self.tables else None
+ if target_table and target_table.strand_col:
+ target_strand = f'{table_name}."{target_table.strand_col}"'
# Determine if we should add 1 for gap distances (bedtools compatibility)
# This depends on the interval types of the tables involved
add_one = False
- if self.schema_info:
- target_table_info = self.schema_info.get_table(table_name)
- if target_table_info:
- for col_info in target_table_info.columns.values():
- if col_info.is_genomic:
- # Import IntervalType to check
- from giql.range_parser import IntervalType
-
- # Add 1 for closed intervals (bedtools behavior)
- if col_info.interval_type == IntervalType.CLOSED:
- add_one = True
- break
+ if self.tables:
+ target_table = self.tables.get(table_name)
+ if target_table:
+ # Add 1 for closed intervals (bedtools behavior)
+ if target_table.interval_type == "closed":
+ add_one = True
# Build distance calculation using CASE expression
# For NEAREST: ORDER BY absolute distance, but RETURN signed distance
@@ -338,30 +324,25 @@ def giqldistance_sql(self, expression: GIQLDistance) -> str:
raise ValueError("Literal range as second argument not yet supported")
# Determine if we should add 1 for gap distances (bedtools compatibility)
- # Check interval types from schema
+ # Check interval types from table config
add_one = False
- if self.schema_info:
+ if self.tables:
# Extract table names from column references
# Column refs look like "table.column" or "alias.column"
table_a = interval_a_sql.split(".")[0] if "." in interval_a_sql else None
table_b = interval_b_sql.split(".")[0] if "." in interval_b_sql else None
# Check if either table uses closed intervals
- from giql.range_parser import IntervalType
-
- for table_name in [table_a, table_b]:
- if table_name:
+ for tbl_name in [table_a, table_b]:
+ if tbl_name:
# Remove quotes if present
- table_name = table_name.strip('"')
+ tbl_name = tbl_name.strip('"')
# Check if it's an alias first
- actual_table = self._alias_to_table.get(table_name, table_name)
- table_info = self.schema_info.get_table(actual_table)
- if table_info:
- for col_info in table_info.columns.values():
- if col_info.is_genomic:
- if col_info.interval_type == IntervalType.CLOSED:
- add_one = True
- break
+ actual_table = self._alias_to_table.get(tbl_name, tbl_name)
+ table = self.tables.get(actual_table)
+ if table and table.interval_type == "closed":
+ add_one = True
+ break
# Generate CASE expression
return self._generate_distance_case(
@@ -776,32 +757,19 @@ def _resolve_nearest_reference(
"Please specify reference parameter explicitly."
)
- # Look up the table's schema to find the genomic column
+ # Look up the table to find the genomic column
# Check if outer_table is an alias
actual_table = self._alias_to_table.get(outer_table, outer_table)
- table_schema = self.schema_info.get_table(actual_table)
-
- if not table_schema:
- raise ValueError(
- f"Outer table '{outer_table}' not found in schema. "
- "Please specify reference parameter explicitly."
- )
+ table = self.tables.get(actual_table)
- # Find the genomic column in the table schema
- genomic_col_name = None
- for col_info in table_schema.columns.values():
- if col_info.is_genomic:
- genomic_col_name = col_info.name
- break
-
- if not genomic_col_name:
+ if not table:
raise ValueError(
- f"No genomic column found in table '{outer_table}'. "
+ f"Outer table '{outer_table}' not found in tables. "
"Please specify reference parameter explicitly."
)
# Build column references using the outer table and genomic column
- reference_sql = f"{outer_table}.{genomic_col_name}"
+ reference_sql = f"{outer_table}.{table.genomic_col}"
return self._get_column_refs(reference_sql, None)
def _resolve_target_table(
@@ -828,31 +796,15 @@ def _resolve_target_table(
# Try to extract as string
table_name = str(target)
- table_schema = self.schema_info.get_table(table_name)
- if not table_schema:
+ table = self.tables.get(table_name)
+ if not table:
raise ValueError(
- f"Target table '{table_name}' not found in schema. "
- f"Available tables: {list(self.schema_info.tables.keys())}"
+ f"Target table '{table_name}' not found in tables. "
+ "Register the table before transpiling."
)
- # Find genomic column in target table
- genomic_col = None
- for col_info in table_schema.columns.values():
- if col_info.is_genomic:
- genomic_col = col_info
- break
-
- if not genomic_col:
- raise ValueError(
- f"Target table '{table_name}' does not have a genomic column"
- )
-
- # Get physical column names
- chrom_col = genomic_col.chrom_col or DEFAULT_CHROM_COL
- start_col = genomic_col.start_col or DEFAULT_START_COL
- end_col = genomic_col.end_col or DEFAULT_END_COL
-
- return table_name, (chrom_col, start_col, end_col)
+ # Get physical column names from table config
+ return table_name, (table.chrom_col, table.start_col, table.end_col)
def _get_column_refs(
self,
@@ -887,22 +839,15 @@ def _get_column_refs(
# Look up actual table name from alias
table_name = self._alias_to_table.get(table_alias, self._current_table)
- # Try to get custom column names from schema
- if table_name and self.schema_info:
- table_schema = self.schema_info.get_table(table_name)
- if table_schema:
- # Find the genomic column
- for col_info in table_schema.columns.values():
- if col_info.is_genomic:
- if col_info.chrom_col:
- chrom_col = col_info.chrom_col
- if col_info.start_col:
- start_col = col_info.start_col
- if col_info.end_col:
- end_col = col_info.end_col
- if col_info.strand_col:
- strand_col = col_info.strand_col
- break
+ # Try to get custom column names from table config
+ if table_name and self.tables:
+ table = self.tables.get(table_name)
+ if table:
+ chrom_col = table.chrom_col
+ start_col = table.start_col
+ end_col = table.end_col
+ if table.strand_col:
+ strand_col = table.strand_col
# Format with table alias if present
if table_alias:
diff --git a/src/giql/generators/duckdb.py b/src/giql/generators/duckdb.py
deleted file mode 100644
index bbe5a64..0000000
--- a/src/giql/generators/duckdb.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from sqlglot.dialects.duckdb import DuckDB
-
-from giql.generators.base import BaseGIQLGenerator
-
-
-class GIQLDuckDBGenerator(BaseGIQLGenerator, DuckDB.Generator):
- """DuckDB-specific generator with optimizations."""
diff --git a/src/giql/generators/sqlite.py b/src/giql/generators/sqlite.py
deleted file mode 100644
index 922e459..0000000
--- a/src/giql/generators/sqlite.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from typing import Final
-
-from sqlglot.dialects.sqlite import SQLite
-
-from giql.generators.base import BaseGIQLGenerator
-
-
-class GIQLSQLiteGenerator(BaseGIQLGenerator, SQLite.Generator):
- """SQLite-specific SQL generator.
-
- SQLite does not support LATERAL joins, so correlated NEAREST queries
- (without explicit reference) will raise an error. Use standalone mode
- with an explicit reference parameter instead.
-
- Example::
-
- -- This works (standalone mode with explicit reference):
- SELECT * FROM NEAREST(genes, reference='chr1:1000-2000', k=3)
-
- -- This fails (correlated mode requires LATERAL):
- SELECT * FROM peaks CROSS JOIN LATERAL NEAREST(genes, k=3)
- """
-
- SUPPORTS_LATERAL: Final = False
diff --git a/src/giql/protocols.py b/src/giql/protocols.py
deleted file mode 100644
index 9002051..0000000
--- a/src/giql/protocols.py
+++ /dev/null
@@ -1,81 +0,0 @@
-"""Protocol definitions for GIQL.
-
-This module defines protocols for type checking and interface compatibility.
-"""
-
-from typing import Any
-from typing import Protocol
-from typing import Sequence
-
-
-class CursorLike(Protocol):
- """Protocol for DB-API 2.0 compatible cursors.
-
- Based on PEP 249: https://peps.python.org/pep-0249/
-
- This protocol defines the minimal interface required for database cursors
- that can be used with GIQL. All DB-API 2.0 compliant drivers (SQLite,
- PostgreSQL, MySQL, DuckDB) implement this interface.
- """
-
- @property
- def description(
- self,
- ) -> (
- Sequence[
- tuple[str, Any, Any | None, Any | None, Any | None, Any | None, Any | None]
- ]
- | None
- ):
- """Column descriptions.
-
- A sequence of 7-tuples describing each column:
- (name, type_code, display_size, internal_size, precision, scale, null_ok)
-
- Only 'name' is required; other values may be None.
- Returns None if no operation has been performed yet.
- """
- ...
-
- @property
- def rowcount(self) -> int:
- """Number of rows affected by last operation.
-
- Returns -1 if no operation has been performed or if the count
- cannot be determined.
- """
- ...
-
- def fetchone(self) -> tuple[Any, ...] | None:
- """Fetch the next row of a query result set.
-
- Returns a tuple representing the next row, or None when no more
- rows are available.
- """
- ...
-
- def fetchmany(self, size: int = 1) -> list[tuple[Any, ...]]:
- """Fetch the next set of rows of a query result set.
-
- Returns a list of tuples. An empty list is returned when no more
- rows are available.
-
- :param size:
- Number of rows to fetch (default: 1)
- """
- ...
-
- def fetchall(self) -> list[tuple[Any, ...]]:
- """Fetch all remaining rows of a query result set.
-
- Returns a list of tuples. An empty list is returned when no rows
- are available.
- """
- ...
-
- def close(self) -> None:
- """Close the cursor.
-
- Makes the cursor unusable for further operations.
- """
- ...
diff --git a/src/giql/schema.py b/src/giql/schema.py
deleted file mode 100644
index 1b6e0d5..0000000
--- a/src/giql/schema.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""Schema information for transpilation.
-
-This module manages schema metadata for tables, including how genomic
-ranges are physically stored in the database.
-"""
-
-from dataclasses import dataclass
-from typing import Dict
-from typing import Optional
-
-from giql.range_parser import CoordinateSystem
-from giql.range_parser import IntervalType
-
-
-@dataclass
-class ColumnInfo:
- """Information about a column."""
-
- name: str
- type: str
- is_genomic: bool = False
- # For genomic columns stored as separate fields
- chrom_col: Optional[str] = None
- start_col: Optional[str] = None
- end_col: Optional[str] = None
- strand_col: Optional[str] = None
- # Coordinate system configuration for genomic columns
- coordinate_system: CoordinateSystem = CoordinateSystem.ZERO_BASED
- interval_type: IntervalType = IntervalType.HALF_OPEN
-
-
-@dataclass
-class TableSchema:
- """Schema for a table."""
-
- name: str
- columns: Dict[str, ColumnInfo]
-
-
-class SchemaInfo:
- """Manages schema information for transpilation.
-
- Tracks how genomic ranges are stored:
- - Separate columns (chromosome, start_pos, end_pos)
- - STRUCT types
- - Custom types
- """
-
- def __init__(self):
- self.tables: Dict[str, TableSchema] = {}
-
- def register_table(self, name: str, schema: TableSchema):
- """Register a table schema.
-
- :param name: Table name
- :param schema: TableSchema object
- """
- self.tables[name] = schema
-
- def get_table(self, name: str) -> Optional[TableSchema]:
- """Get table schema by name.
-
- :param name:
- Table name
- :return:
- TableSchema object or None if not found
- """
- return self.tables.get(name)
-
- def get_column_info(self, table: str, column: str) -> Optional[ColumnInfo]:
- """Get column information.
-
- :param table:
- Table name
- :param column:
- Column name
- :return:
- ColumnInfo object or None if not found
- """
- table_schema = self.get_table(table)
- if table_schema:
- return table_schema.columns.get(column)
- return None
diff --git a/src/giql/table.py b/src/giql/table.py
new file mode 100644
index 0000000..f23adaf
--- /dev/null
+++ b/src/giql/table.py
@@ -0,0 +1,136 @@
+"""Table configuration for GIQL transpilation.
+
+This module defines the Table dataclass for configuring genomic table schemas.
+"""
+
+from dataclasses import dataclass
+from typing import Literal
+
+from giql.constants import DEFAULT_CHROM_COL
+from giql.constants import DEFAULT_END_COL
+from giql.constants import DEFAULT_GENOMIC_COL
+from giql.constants import DEFAULT_START_COL
+from giql.constants import DEFAULT_STRAND_COL
+
+
+@dataclass
+class Table:
+ """Genomic table configuration for transpilation.
+
+ This class defines how genomic intervals are stored in a database table,
+ mapping a pseudo-column name (genomic_col) to the physical columns that
+ store chromosome, start, end, and optionally strand information.
+
+ Parameters
+ ----------
+ name : str
+ The table name.
+ genomic_col : str
+ The pseudo-column name used in GIQL queries to reference the genomic
+ interval (default: "interval").
+ chrom_col : str
+ The physical column name storing chromosome/contig (default: "chrom").
+ start_col : str
+ The physical column name storing interval start position
+ (default: "start").
+ end_col : str
+ The physical column name storing interval end position
+ (default: "end").
+ strand_col : str | None
+ The physical column name storing strand information, or None if the
+ table has no strand column (default: "strand").
+ coordinate_system : Literal["0based", "1based"]
+ The coordinate system used for positions (default: "0based").
+ interval_type : Literal["half_open", "closed"]
+ The interval endpoint convention (default: "half_open").
+
+ Examples
+ --------
+ Using default column names (via transpile)::
+
+ sql = transpile(query, tables=["peaks"])
+
+ Mixing default and custom table configurations::
+
+ sql = transpile(
+ query,
+ tables=[
+ "peaks",
+ Table(
+ "variants",
+ genomic_col="position",
+ chrom_col="chr",
+ start_col="pos_start",
+ end_col="pos_end",
+ strand_col=None, # No strand column
+ coordinate_system="1based",
+ interval_type="closed",
+ ),
+ ]
+ )
+ """
+
+ name: str
+ genomic_col: str = DEFAULT_GENOMIC_COL
+ chrom_col: str = DEFAULT_CHROM_COL
+ start_col: str = DEFAULT_START_COL
+ end_col: str = DEFAULT_END_COL
+ strand_col: str | None = DEFAULT_STRAND_COL
+ coordinate_system: Literal["0based", "1based"] = "0based"
+ interval_type: Literal["half_open", "closed"] = "half_open"
+
+ def __post_init__(self) -> None:
+ """Validate field values after initialization."""
+ if self.coordinate_system not in ("0based", "1based"):
+ raise ValueError(
+ f"coordinate_system must be '0based' or '1based', "
+ f"got {self.coordinate_system!r}"
+ )
+ if self.interval_type not in ("half_open", "closed"):
+ raise ValueError(
+ f"interval_type must be 'half_open' or 'closed', "
+ f"got {self.interval_type!r}"
+ )
+
+
+class Tables:
+ """Container for Table configurations.
+
+ Provides lookup of Table objects by name for use during transpilation.
+ """
+
+ def __init__(self) -> None:
+ self._tables: dict[str, Table] = {}
+
+ def register(self, name: str, table: Table) -> None:
+ """Register a table configuration.
+
+ Parameters
+ ----------
+ name : str
+ The table name to register.
+ table : Table
+ Table configuration to register.
+ """
+ self._tables[name] = table
+
+ def get(self, name: str) -> Table | None:
+ """Get a table configuration by name.
+
+ Parameters
+ ----------
+ name : str
+ Table name to look up.
+
+ Returns
+ -------
+ Table | None
+ Table configuration if found, None otherwise.
+ """
+ return self._tables.get(name)
+
+ def __contains__(self, name: str) -> bool:
+ return name in self._tables
+
+ def __iter__(self):
+ return iter(self._tables.values())
diff --git a/src/giql/transformer.py b/src/giql/transformer.py
index 2d9705f..de1e70f 100644
--- a/src/giql/transformer.py
+++ b/src/giql/transformer.py
@@ -12,7 +12,7 @@
from giql.constants import DEFAULT_STRAND_COL
from giql.expressions import GIQLCluster
from giql.expressions import GIQLMerge
-from giql.schema import SchemaInfo
+from giql.table import Tables
class ClusterTransformer:
@@ -32,13 +32,13 @@ class ClusterTransformer:
FROM lag_calc
"""
- def __init__(self, schema_info: SchemaInfo):
+ def __init__(self, tables: Tables):
"""Initialize transformer.
- :param schema_info:
- Schema information for column mapping
+ :param tables:
+ Table configurations for column mapping
"""
- self.schema_info = schema_info
+ self.tables = tables
def _get_table_name(self, query: exp.Select) -> str | None:
"""Extract table name from query's FROM clause.
@@ -58,7 +58,7 @@ def _get_table_name(self, query: exp.Select) -> str | None:
return None
def _get_genomic_columns(self, query: exp.Select) -> tuple[str, str, str, str]:
- """Get genomic column names from schema info or defaults.
+ """Get genomic column names from table config or defaults.
:param query:
Query to extract table and column info from
@@ -74,20 +74,13 @@ def _get_genomic_columns(self, query: exp.Select) -> tuple[str, str, str, str]:
strand_col = DEFAULT_STRAND_COL
if table_name:
- table_schema = self.schema_info.get_table(table_name)
- if table_schema:
- # Find the genomic column
- for col_info in table_schema.columns.values():
- if col_info.is_genomic:
- if col_info.chrom_col:
- chrom_col = col_info.chrom_col
- if col_info.start_col:
- start_col = col_info.start_col
- if col_info.end_col:
- end_col = col_info.end_col
- if col_info.strand_col:
- strand_col = col_info.strand_col
- break
+ table = self.tables.get(table_name)
+ if table:
+ chrom_col = table.chrom_col
+ start_col = table.start_col
+ end_col = table.end_col
+ if table.strand_col:
+ strand_col = table.strand_col
return chrom_col, start_col, end_col, strand_col
@@ -209,7 +202,7 @@ def _transform_for_cluster(
else:
stranded = False
- # Get column names from schema_info or use defaults
+ # Get column names from table config or use defaults
chrom_col, start_col, end_col, strand_col = self._get_genomic_columns(query)
# Build partition clause
@@ -366,14 +359,14 @@ class MergeTransformer:
ORDER BY chromosome, start_pos
"""
- def __init__(self, schema_info: SchemaInfo):
+ def __init__(self, tables: Tables):
"""Initialize transformer.
- :param schema_info:
- Schema information for column mapping
+ :param tables:
+ Table configurations for column mapping
"""
- self.schema_info = schema_info
- self.cluster_transformer = ClusterTransformer(schema_info)
+ self.tables = tables
+ self.cluster_transformer = ClusterTransformer(tables)
def transform(self, query: exp.Expression) -> exp.Expression:
"""Transform query if it contains MERGE expressions.
@@ -468,7 +461,7 @@ def _transform_for_merge(
distance_expr = merge_expr.args.get("distance")
stranded_expr = merge_expr.args.get("stranded")
- # Get column names from schema_info or use defaults
+ # Get column names from table config or use defaults
(
chrom_col,
start_col,
diff --git a/src/giql/transpile.py b/src/giql/transpile.py
new file mode 100644
index 0000000..f846834
--- /dev/null
+++ b/src/giql/transpile.py
@@ -0,0 +1,129 @@
+"""Transpile GIQL queries to SQL.
+
+This module provides the main entry point for transpiling GIQL queries
+to standard SQL.
+"""
+
+from sqlglot import parse_one
+
+from giql.dialect import GIQLDialect
+from giql.generators import BaseGIQLGenerator
+from giql.table import Table
+from giql.table import Tables
+from giql.transformer import ClusterTransformer
+from giql.transformer import MergeTransformer
+
+
+def _build_tables(tables: list[str | Table] | None) -> Tables:
+ """Build a Tables container from table specifications.
+
+ Parameters
+ ----------
+ tables : list[str | Table] | None
+ Table specifications. Strings use default column mappings.
+ Table objects provide custom column mappings.
+
+ Returns
+ -------
+ Tables
+ Container with all tables registered.
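+
+ Example (illustrative)::
+
+ tables = _build_tables(["genes", Table("peaks", interval_type="closed")])
+ assert tables.get("peaks").interval_type == "closed"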
+ """
+ container = Tables()
+
+ if tables is None:
+ return container
+
+ for item in tables:
+ if isinstance(item, str):
+ container.register(item, Table(item))
+ else:
+ container.register(item.name, item)
+
+ return container
+
+
+def transpile(
+ giql: str,
+ tables: list[str | Table] | None = None,
+) -> str:
+ """Transpile a GIQL query to SQL.
+
+ Parses the GIQL syntax and converts it to standard SQL. The output is
+ largely SQL-92 compatible, although LATERAL joins are emitted where
+ needed for operations like NEAREST.
+
+ Parameters
+ ----------
+ giql : str
+ The GIQL query string containing genomic extensions like
+ INTERSECTS, CONTAINS, WITHIN, CLUSTER, MERGE, or NEAREST.
+ tables : list[str | Table] | None
+ Table configurations. Strings use default column mappings
+ (chrom, start, end, strand). Table objects provide custom
+ column name mappings.
+
+ Returns
+ -------
+ str
+ The transpiled SQL query.
+
+ Raises
+ ------
+ ValueError
+ If the query cannot be parsed or transpiled.
+
+ Examples
+ --------
+ Basic usage with default column mappings::
+
+ sql = transpile(
+ "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'",
+ tables=["peaks"]
+ )
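+
+ With default mappings, the generated SQL is roughly::
+
+ SELECT * FROM peaks WHERE
+ ("chrom" = 'chr1' AND "start" < 2000 AND "end" > 1000)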
+
+ Custom table configuration::
+
+ sql = transpile(
+ "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'",
+ tables=[
+ Table(
+ "peaks",
+ genomic_col="interval",
+ chrom_col="chromosome",
+ start_col="start_pos",
+ end_col="end_pos",
+ )
+ ]
+ )
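+
+ The generated SQL then references the custom names, e.g.
+ "chromosome", "start_pos", and "end_pos", in the rewritten predicate.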
+ """
+ # Build tables container
+ tables_container = _build_tables(tables)
+
+ # Initialize transformers with table configurations
+ merge_transformer = MergeTransformer(tables_container)
+ cluster_transformer = ClusterTransformer(tables_container)
+
+ # Initialize generator with table configurations
+ generator = BaseGIQLGenerator(tables=tables_container)
+
+ # Parse GIQL query
+ try:
+ ast = parse_one(giql, dialect=GIQLDialect)
+ except Exception as e:
+ raise ValueError(f"Parse error: {e}\nQuery: {giql}") from e
+
+ # Apply transformations (MERGE first, then CLUSTER)
+ try:
+ # MERGE transformation (which may internally use CLUSTER)
+ ast = merge_transformer.transform(ast)
+ # CLUSTER transformation for any standalone CLUSTER expressions
+ ast = cluster_transformer.transform(ast)
+ except Exception as e:
+ raise ValueError(f"Transformation error: {e}") from e
+
+ # Generate SQL
+ try:
+ sql = generator.generate(ast)
+ except Exception as e:
+ raise ValueError(f"Transpilation error: {e}") from e
+
+ return sql
diff --git a/tests/conftest.py b/tests/conftest.py
index 36b4f05..2ddb618 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,22 +1,18 @@
-"""
-Pytest fixtures for integration tests.
-"""
+"""Pytest fixtures for GIQL tests."""
import pandas as pd
import pytest
-from giql import GIQLEngine
-
@pytest.fixture(scope="session")
def to_df():
- """Fixture providing a helper to convert cursors to DataFrames.
+ """Fixture providing a helper to convert DuckDB results to DataFrames.
- Returns a function that materializes cursor results for testing.
+ Returns a function that materializes query results for testing.
Session-scoped since it's a pure function with no state.
Usage:
- result = to_df(engine.execute("SELECT ..."))
+ result = to_df(conn.execute("SELECT ..."))
"""
def _to_df(cursor):
@@ -26,155 +22,3 @@ def _to_df(cursor):
return pd.DataFrame()
return _to_df
-
-
-@pytest.fixture
-def sample_variants_csv(tmp_path):
- """Create sample variants CSV."""
- csv_content = """
- id,chromosome,start_pos,end_pos,ref,alt,quality
- 1,chr1,1500,1600,A,T,30.0
- 2,chr1,10500,10600,G,C,40.0
- 3,chr1,15000,15100,T,A,25.0
- 4,chr2,500,600,C,G,35.0
- 5,chr2,5500,5600,A,T,20.0
- 6,chr1,25000,25100,G,A,35.0
- 7,chr2,15000,15100,T,C,28.0
- 8,chr3,1000,1100,A,G,32.0
- """
- csv_path = tmp_path / "variants.csv"
- csv_path.write_text(csv_content.strip())
- return str(csv_path)
-
-
-@pytest.fixture
-def sample_genes_csv(tmp_path):
- """Create sample genes CSV."""
- csv_content = """
- gene_id,name,chromosome,start_pos,end_pos,strand
- 1,GENE1,chr1,1000,2000,+
- 2,GENE2,chr1,10000,11000,-
- 3,GENE3,chr1,14000,16000,+
- 4,GENE4,chr2,400,700,+
- 5,GENE5,chr2,5000,6000,-
- """
- csv_path = tmp_path / "genes.csv"
- csv_path.write_text(csv_content.strip())
- return str(csv_path)
-
-
-@pytest.fixture(params=["duckdb", "sqlite"])
-def engine_with_variants(request, sample_variants_csv):
- """Create engine with loaded variants data for different dialects."""
- dialect = request.param
-
- engine = GIQLEngine(target_dialect=dialect, verbose=False)
- engine.load_csv("variants", sample_variants_csv)
- engine.register_table_schema(
- "variants",
- {
- "id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "ref": "VARCHAR",
- "alt": "VARCHAR",
- "quality": "FLOAT",
- },
- genomic_column="interval",
- )
-
- yield engine
- engine.close()
-
-
-@pytest.fixture
-def duckdb_engine_with_data(sample_variants_csv, sample_genes_csv):
- """DuckDB engine with both variants and genes loaded."""
- engine = GIQLEngine(target_dialect="duckdb", verbose=False)
- engine.load_csv("variants", sample_variants_csv)
- engine.load_csv("genes", sample_genes_csv)
-
- engine.register_table_schema(
- "variants",
- {
- "id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "ref": "VARCHAR",
- "alt": "VARCHAR",
- "quality": "FLOAT",
- },
- genomic_column="interval",
- )
-
- engine.register_table_schema(
- "genes",
- {
- "gene_id": "INTEGER",
- "name": "VARCHAR",
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "strand": "VARCHAR",
- },
- genomic_column="interval",
- )
-
- yield engine
- engine.close()
-
-
-@pytest.fixture
-def sample_peaks_csv(tmp_path):
- """Create sample ChIP-seq peaks CSV for NEAREST testing."""
- csv_content = """
- peak_id,chromosome,start_pos,end_pos,signal
- 1,chr1,5000,5200,100.5
- 2,chr1,12000,12100,85.2
- 3,chr1,20000,20500,120.8
- 4,chr2,3000,3100,95.3
- 5,chr2,8000,8200,110.7
- """
- csv_path = tmp_path / "peaks.csv"
- csv_path.write_text(csv_content.strip())
- return str(csv_path)
-
-
-@pytest.fixture
-def engine_with_peaks_and_genes(request, sample_peaks_csv, sample_genes_csv):
- """Create engine with peaks and genes loaded for NEAREST testing."""
- dialect = request.param if hasattr(request, "param") else "duckdb"
-
- engine = GIQLEngine(target_dialect=dialect, verbose=False)
- engine.load_csv("peaks", sample_peaks_csv)
- engine.load_csv("genes", sample_genes_csv)
-
- engine.register_table_schema(
- "peaks",
- {
- "peak_id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "signal": "FLOAT",
- },
- genomic_column="interval",
- )
-
- engine.register_table_schema(
- "genes",
- {
- "gene_id": "INTEGER",
- "name": "VARCHAR",
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "strand": "VARCHAR",
- },
- genomic_column="interval",
- )
-
- yield engine
- engine.close()
diff --git a/tests/generators/test_base.py b/tests/generators/test_base.py
index b04b3bb..bc169b4 100644
--- a/tests/generators/test_base.py
+++ b/tests/generators/test_base.py
@@ -11,125 +11,45 @@
from sqlglot import exp
from sqlglot import parse_one
+from giql import Table
from giql.dialect import GIQLDialect
from giql.expressions import GIQLNearest
from giql.generators import BaseGIQLGenerator
-from giql.range_parser import IntervalType
-from giql.schema import ColumnInfo
-from giql.schema import SchemaInfo
-from giql.schema import TableSchema
+from giql.table import Tables
@pytest.fixture
-def schema_info():
- """Basic SchemaInfo with a single table containing genomic columns."""
- schema = SchemaInfo()
- table = TableSchema(name="variants", columns={})
- table.columns["id"] = ColumnInfo(name="id", type="INTEGER")
- table.columns["interval"] = ColumnInfo(
- name="interval",
- type="VARCHAR",
- is_genomic=True,
- chrom_col="chromosome",
- start_col="start_pos",
- end_col="end_pos",
- strand_col="strand",
- )
- schema.tables["variants"] = table
- return schema
+def tables_info():
+ """Basic Tables with a single table containing genomic columns."""
+ tables = Tables()
+ tables.register("variants", Table("variants"))
+ return tables
@pytest.fixture
-def schema_with_two_tables():
- """SchemaInfo with two tables for column-to-column tests."""
- schema = SchemaInfo()
-
- # Table A
- table_a = TableSchema(name="features_a", columns={})
- table_a.columns["id"] = ColumnInfo(name="id", type="INTEGER")
- table_a.columns["interval"] = ColumnInfo(
- name="interval",
- type="VARCHAR",
- is_genomic=True,
- chrom_col="chromosome",
- start_col="start_pos",
- end_col="end_pos",
- strand_col="strand",
- )
- schema.tables["features_a"] = table_a
-
- # Table B
- table_b = TableSchema(name="features_b", columns={})
- table_b.columns["id"] = ColumnInfo(name="id", type="INTEGER")
- table_b.columns["interval"] = ColumnInfo(
- name="interval",
- type="VARCHAR",
- is_genomic=True,
- chrom_col="chromosome",
- start_col="start_pos",
- end_col="end_pos",
- strand_col="strand",
- )
- schema.tables["features_b"] = table_b
-
- return schema
+def tables_with_two_tables():
+ """Tables with two tables for column-to-column tests."""
+ tables = Tables()
+ tables.register("features_a", Table("features_a"))
+ tables.register("features_b", Table("features_b"))
+ return tables
@pytest.fixture
-def schema_with_closed_intervals():
- """SchemaInfo with CLOSED interval type for bedtools compatibility tests."""
- schema = SchemaInfo()
- table = TableSchema(name="bed_features", columns={})
- table.columns["id"] = ColumnInfo(name="id", type="INTEGER")
- table.columns["interval"] = ColumnInfo(
- name="interval",
- type="VARCHAR",
- is_genomic=True,
- chrom_col="chromosome",
- start_col="start_pos",
- end_col="end_pos",
- strand_col="strand",
- interval_type=IntervalType.CLOSED,
- )
- schema.tables["bed_features"] = table
- return schema
+def tables_with_closed_intervals():
+ """Tables with CLOSED interval type for bedtools compatibility tests."""
+ tables = Tables()
+ tables.register("bed_features", Table("bed_features", interval_type="closed"))
+ return tables
@pytest.fixture
-def schema_with_peaks_and_genes():
- """Schema info with peaks and genes tables for NEAREST tests."""
- schema = SchemaInfo()
-
- # Register peaks table
- peaks_table = TableSchema(name="peaks", columns={})
- peaks_table.columns["peak_id"] = ColumnInfo(name="peak_id", type="INTEGER")
- peaks_table.columns["interval"] = ColumnInfo(
- name="interval",
- type="VARCHAR",
- is_genomic=True,
- chrom_col="chromosome",
- start_col="start_pos",
- end_col="end_pos",
- strand_col="strand",
- )
- schema.tables["peaks"] = peaks_table
-
- # Register genes table
- genes_table = TableSchema(name="genes", columns={})
- genes_table.columns["gene_id"] = ColumnInfo(name="gene_id", type="INTEGER")
- genes_table.columns["name"] = ColumnInfo(name="name", type="VARCHAR")
- genes_table.columns["interval"] = ColumnInfo(
- name="interval",
- type="VARCHAR",
- is_genomic=True,
- chrom_col="chromosome",
- start_col="start_pos",
- end_col="end_pos",
- strand_col="strand",
- )
- schema.tables["genes"] = genes_table
-
- return schema
+def tables_with_peaks_and_genes():
+ """Tables with peaks and genes tables for NEAREST tests."""
+ tables = Tables()
+ tables.register("peaks", Table("peaks"))
+ tables.register("genes", Table("genes"))
+ return tables
class TestBaseGIQLGenerator:
@@ -137,26 +57,26 @@ class TestBaseGIQLGenerator:
def test_instantiation_defaults(self):
"""
- GIVEN no schema_info provided
+ GIVEN no tables provided
WHEN Generator is instantiated with defaults
- THEN Generator has empty SchemaInfo and SUPPORTS_LATERAL is True.
+ THEN Generator has empty Tables and SUPPORTS_LATERAL is True.
"""
generator = BaseGIQLGenerator()
- assert generator.schema_info is not None
- assert generator.schema_info.tables == {}
+ assert generator.tables is not None
+ assert "variants" not in generator.tables
assert generator.SUPPORTS_LATERAL is True
- def test_instantiation_with_schema(self, schema_info):
+ def test_instantiation_with_tables(self, tables_info):
"""
- GIVEN a valid SchemaInfo object with table definitions
- WHEN Generator is instantiated with schema_info
- THEN Generator stores schema_info and can resolve column references.
+ GIVEN a valid Tables object with table definitions
+ WHEN Generator is instantiated with tables
+ THEN Generator stores tables and can resolve column references.
"""
- generator = BaseGIQLGenerator(schema_info=schema_info)
+ generator = BaseGIQLGenerator(tables=tables_info)
- assert generator.schema_info is schema_info
- assert "variants" in generator.schema_info.tables
+ assert generator.tables is tables_info
+ assert "variants" in generator.tables
def test_instantiation_kwargs_forwarding(self):
"""
@@ -170,7 +90,7 @@ def test_instantiation_kwargs_forwarding(self):
# If kwargs forwarding works, generator should have pretty attribute
assert generator.pretty is True
- def test_select_sql_basic(self, schema_info):
+ def test_select_sql_basic(self, tables_info):
"""
GIVEN a SELECT expression with FROM clause containing a table
WHEN select_sql is called
@@ -179,13 +99,13 @@ def test_select_sql_basic(self, schema_info):
sql = "SELECT * FROM variants"
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_info)
+ generator = BaseGIQLGenerator(tables=tables_info)
output = generator.generate(ast)
expected = "SELECT * FROM variants"
assert output == expected
- def test_select_sql_with_alias(self, schema_info):
+ def test_select_sql_with_alias(self, tables_info):
"""
GIVEN a SELECT with aliased table (e.g., FROM table AS t)
WHEN select_sql is called
@@ -194,17 +114,17 @@ def test_select_sql_with_alias(self, schema_info):
sql = "SELECT * FROM variants AS v WHERE v.interval INTERSECTS 'chr1:1000-2000'"
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_info)
+ generator = BaseGIQLGenerator(tables=tables_info)
output = generator.generate(ast)
expected = (
"SELECT * FROM variants AS v WHERE "
- '(v."chromosome" = \'chr1\' AND v."start_pos" < 2000 '
- 'AND v."end_pos" > 1000)'
+ '(v."chrom" = \'chr1\' AND v."start" < 2000 '
+ 'AND v."end" > 1000)'
)
assert output == expected
- def test_select_sql_with_joins(self, schema_with_two_tables):
+ def test_select_sql_with_joins(self, tables_with_two_tables):
"""
GIVEN a SELECT with JOINs
WHEN select_sql is called
@@ -213,7 +133,7 @@ def test_select_sql_with_joins(self, schema_with_two_tables):
sql = "SELECT * FROM features_a AS a JOIN features_b AS b ON a.id = b.id"
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+ generator = BaseGIQLGenerator(tables=tables_with_two_tables)
output = generator.generate(ast)
expected = "SELECT * FROM features_a AS a JOIN features_b AS b ON a.id = b.id"
@@ -233,11 +153,11 @@ def test_intersects_sql_with_literal(self):
expected = (
"SELECT * FROM variants WHERE "
- '("chromosome" = \'chr1\' AND "start_pos" < 2000 AND "end_pos" > 1000)'
+ '("chrom" = \'chr1\' AND "start" < 2000 AND "end" > 1000)'
)
assert output == expected
- def test_intersects_sql_column_join(self, schema_with_two_tables):
+ def test_intersects_sql_column_join(self, tables_with_two_tables):
"""
GIVEN an Intersects expression with column-to-column
(a.interval INTERSECTS b.interval)
@@ -250,13 +170,13 @@ def test_intersects_sql_column_join(self, schema_with_two_tables):
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+ generator = BaseGIQLGenerator(tables=tables_with_two_tables)
output = generator.generate(ast)
expected = (
"SELECT * FROM features_a AS a CROSS JOIN features_b AS b WHERE "
- '(a."chromosome" = b."chromosome" AND a."start_pos" < b."end_pos" '
- 'AND a."end_pos" > b."start_pos")'
+ '(a."chrom" = b."chrom" AND a."start" < b."end" '
+ 'AND a."end" > b."start")'
)
assert output == expected
@@ -297,7 +217,7 @@ def test_contains_sql_point_query(self):
expected = (
"SELECT * FROM variants WHERE "
- '("chromosome" = \'chr1\' AND "start_pos" <= 1500 AND "end_pos" > 1500)'
+ '("chrom" = \'chr1\' AND "start" <= 1500 AND "end" > 1500)'
)
assert output == expected
@@ -315,12 +235,12 @@ def test_contains_sql_range_query(self):
expected = (
"SELECT * FROM variants WHERE "
- '("chromosome" = \'chr1\' AND "start_pos" <= 1500 '
- 'AND "end_pos" >= 2000)'
+ '("chrom" = \'chr1\' AND "start" <= 1500 '
+ 'AND "end" >= 2000)'
)
assert output == expected
- def test_contains_sql_column_join(self, schema_with_two_tables):
+ def test_contains_sql_column_join(self, tables_with_two_tables):
"""
GIVEN a Contains expression with column-to-column join
WHEN contains_sql is called
@@ -332,13 +252,13 @@ def test_contains_sql_column_join(self, schema_with_two_tables):
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+ generator = BaseGIQLGenerator(tables=tables_with_two_tables)
output = generator.generate(ast)
expected = (
"SELECT * FROM features_a AS a CROSS JOIN features_b AS b WHERE "
- '(a."chromosome" = b."chromosome" '
- 'AND a."start_pos" <= b."start_pos" AND a."end_pos" >= b."end_pos")'
+ '(a."chrom" = b."chrom" '
+ 'AND a."start" <= b."start" AND a."end" >= b."end")'
)
assert output == expected
@@ -382,11 +302,11 @@ def test_within_sql_with_literal(self):
expected = (
"SELECT * FROM variants WHERE "
- '("chromosome" = \'chr1\' AND "start_pos" >= 1000 AND "end_pos" <= 5000)'
+ '("chrom" = \'chr1\' AND "start" >= 1000 AND "end" <= 5000)'
)
assert output == expected
- def test_within_sql_column_join(self, schema_with_two_tables):
+ def test_within_sql_column_join(self, tables_with_two_tables):
"""
GIVEN a Within expression with column-to-column join
WHEN within_sql is called
@@ -398,13 +318,13 @@ def test_within_sql_column_join(self, schema_with_two_tables):
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+ generator = BaseGIQLGenerator(tables=tables_with_two_tables)
output = generator.generate(ast)
expected = (
"SELECT * FROM features_a AS a CROSS JOIN features_b AS b WHERE "
- '(a."chromosome" = b."chromosome" '
- 'AND a."start_pos" >= b."start_pos" AND a."end_pos" <= b."end_pos")'
+ '(a."chrom" = b."chrom" '
+ 'AND a."start" >= b."start" AND a."end" <= b."end")'
)
assert output == expected
@@ -425,8 +345,8 @@ def test_spatialsetpredicate_sql_any(self):
expected = (
"SELECT * FROM variants WHERE "
- '(("chromosome" = \'chr1\' AND "start_pos" < 2000 AND "end_pos" > 1000) '
- 'OR ("chromosome" = \'chr1\' AND "start_pos" < 6000 AND "end_pos" > 5000))'
+ '(("chrom" = \'chr1\' AND "start" < 2000 AND "end" > 1000) '
+ 'OR ("chrom" = \'chr1\' AND "start" < 6000 AND "end" > 5000))'
)
assert output == expected
@@ -447,12 +367,12 @@ def test_spatialsetpredicate_sql_all(self):
expected = (
"SELECT * FROM variants WHERE "
- '(("chromosome" = \'chr1\' AND "start_pos" < 2000 AND "end_pos" > 1000) '
- 'AND ("chromosome" = \'chr1\' AND "start_pos" < 1800 AND "end_pos" > 1500))'
+ '(("chrom" = \'chr1\' AND "start" < 2000 AND "end" > 1000) '
+ 'AND ("chrom" = \'chr1\' AND "start" < 1800 AND "end" > 1500))'
)
assert output == expected
- def test_giqlnearest_sql_standalone(self, schema_with_peaks_and_genes):
+ def test_giqlnearest_sql_standalone(self, tables_with_peaks_and_genes):
"""
GIVEN a GIQLNearest in standalone mode with literal reference
WHEN giqlnearest_sql is called
@@ -461,31 +381,31 @@ def test_giqlnearest_sql_standalone(self, schema_with_peaks_and_genes):
sql = "SELECT * FROM NEAREST(genes, reference='chr1:1000-2000', k=3)"
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
output = generator.generate(ast)
expected = (
"SELECT * FROM (\n"
" SELECT genes.*, "
- "CASE WHEN 'chr1' != genes.\"chromosome\" THEN NULL "
- 'WHEN 1000 < genes."end_pos" AND 2000 > genes."start_pos" THEN 0 '
- 'WHEN 2000 <= genes."start_pos" '
- 'THEN (genes."start_pos" - 2000) '
- 'ELSE (1000 - genes."end_pos") END AS distance\n'
+ "CASE WHEN 'chr1' != genes.\"chrom\" THEN NULL "
+ 'WHEN 1000 < genes."end" AND 2000 > genes."start" THEN 0 '
+ 'WHEN 2000 <= genes."start" '
+ 'THEN (genes."start" - 2000) '
+ 'ELSE (1000 - genes."end") END AS distance\n'
" FROM genes\n"
- " WHERE 'chr1' = genes.\"chromosome\"\n"
+ " WHERE 'chr1' = genes.\"chrom\"\n"
" ORDER BY ABS("
- "CASE WHEN 'chr1' != genes.\"chromosome\" THEN NULL "
- 'WHEN 1000 < genes."end_pos" AND 2000 > genes."start_pos" THEN 0 '
- 'WHEN 2000 <= genes."start_pos" '
- 'THEN (genes."start_pos" - 2000) '
- 'ELSE (1000 - genes."end_pos") END)\n'
+ "CASE WHEN 'chr1' != genes.\"chrom\" THEN NULL "
+ 'WHEN 1000 < genes."end" AND 2000 > genes."start" THEN 0 '
+ 'WHEN 2000 <= genes."start" '
+ 'THEN (genes."start" - 2000) '
+ 'ELSE (1000 - genes."end") END)\n'
" LIMIT 3\n"
" )"
)
assert output == expected
- def test_giqlnearest_sql_correlated(self, schema_with_peaks_and_genes):
+ def test_giqlnearest_sql_correlated(self, tables_with_peaks_and_genes):
"""
GIVEN a GIQLNearest in correlated mode (LATERAL join context)
WHEN giqlnearest_sql is called
@@ -497,33 +417,33 @@ def test_giqlnearest_sql_correlated(self, schema_with_peaks_and_genes):
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
output = generator.generate(ast)
expected = (
"SELECT * FROM peaks CROSS JOIN LATERAL (\n"
" SELECT genes.*, "
- 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL '
- 'WHEN peaks."start_pos" < genes."end_pos" '
- 'AND peaks."end_pos" > genes."start_pos" THEN 0 '
- 'WHEN peaks."end_pos" <= genes."start_pos" '
- 'THEN (genes."start_pos" - peaks."end_pos") '
- 'ELSE (peaks."start_pos" - genes."end_pos") END AS distance\n'
+ 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL '
+ 'WHEN peaks."start" < genes."end" '
+ 'AND peaks."end" > genes."start" THEN 0 '
+ 'WHEN peaks."end" <= genes."start" '
+ 'THEN (genes."start" - peaks."end") '
+ 'ELSE (peaks."start" - genes."end") END AS distance\n'
" FROM genes\n"
- ' WHERE peaks."chromosome" = genes."chromosome"\n'
+ ' WHERE peaks."chrom" = genes."chrom"\n'
" ORDER BY ABS("
- 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL '
- 'WHEN peaks."start_pos" < genes."end_pos" '
- 'AND peaks."end_pos" > genes."start_pos" THEN 0 '
- 'WHEN peaks."end_pos" <= genes."start_pos" '
- 'THEN (genes."start_pos" - peaks."end_pos") '
- 'ELSE (peaks."start_pos" - genes."end_pos") END)\n'
+ 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL '
+ 'WHEN peaks."start" < genes."end" '
+ 'AND peaks."end" > genes."start" THEN 0 '
+ 'WHEN peaks."end" <= genes."start" '
+ 'THEN (genes."start" - peaks."end") '
+ 'ELSE (peaks."start" - genes."end") END)\n'
" LIMIT 3\n"
" )"
)
assert output == expected
- def test_giqlnearest_sql_with_max_distance(self, schema_with_peaks_and_genes):
+ def test_giqlnearest_sql_with_max_distance(self, tables_with_peaks_and_genes):
"""
GIVEN a GIQLNearest with max_distance parameter
WHEN giqlnearest_sql is called
@@ -536,40 +456,40 @@ def test_giqlnearest_sql_with_max_distance(self, schema_with_peaks_and_genes):
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
output = generator.generate(ast)
expected = (
"SELECT * FROM peaks CROSS JOIN LATERAL (\n"
" SELECT genes.*, "
- 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL '
- 'WHEN peaks."start_pos" < genes."end_pos" '
- 'AND peaks."end_pos" > genes."start_pos" THEN 0 '
- 'WHEN peaks."end_pos" <= genes."start_pos" '
- 'THEN (genes."start_pos" - peaks."end_pos") '
- 'ELSE (peaks."start_pos" - genes."end_pos") END AS distance\n'
+ 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL '
+ 'WHEN peaks."start" < genes."end" '
+ 'AND peaks."end" > genes."start" THEN 0 '
+ 'WHEN peaks."end" <= genes."start" '
+ 'THEN (genes."start" - peaks."end") '
+ 'ELSE (peaks."start" - genes."end") END AS distance\n'
" FROM genes\n"
- ' WHERE peaks."chromosome" = genes."chromosome" '
+ ' WHERE peaks."chrom" = genes."chrom" '
"AND (ABS("
- 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL '
- 'WHEN peaks."start_pos" < genes."end_pos" '
- 'AND peaks."end_pos" > genes."start_pos" THEN 0 '
- 'WHEN peaks."end_pos" <= genes."start_pos" '
- 'THEN (genes."start_pos" - peaks."end_pos") '
- 'ELSE (peaks."start_pos" - genes."end_pos") END)) <= 100000\n'
+ 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL '
+ 'WHEN peaks."start" < genes."end" '
+ 'AND peaks."end" > genes."start" THEN 0 '
+ 'WHEN peaks."end" <= genes."start" '
+ 'THEN (genes."start" - peaks."end") '
+ 'ELSE (peaks."start" - genes."end") END)) <= 100000\n'
" ORDER BY ABS("
- 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL '
- 'WHEN peaks."start_pos" < genes."end_pos" '
- 'AND peaks."end_pos" > genes."start_pos" THEN 0 '
- 'WHEN peaks."end_pos" <= genes."start_pos" '
- 'THEN (genes."start_pos" - peaks."end_pos") '
- 'ELSE (peaks."start_pos" - genes."end_pos") END)\n'
+ 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL '
+ 'WHEN peaks."start" < genes."end" '
+ 'AND peaks."end" > genes."start" THEN 0 '
+ 'WHEN peaks."end" <= genes."start" '
+ 'THEN (genes."start" - peaks."end") '
+ 'ELSE (peaks."start" - genes."end") END)\n'
" LIMIT 5\n"
" )"
)
assert output == expected
- def test_giqlnearest_sql_stranded(self, schema_with_peaks_and_genes):
+ def test_giqlnearest_sql_stranded(self, tables_with_peaks_and_genes):
"""
GIVEN a GIQLNearest with stranded=True
WHEN giqlnearest_sql is called
@@ -582,48 +502,48 @@ def test_giqlnearest_sql_stranded(self, schema_with_peaks_and_genes):
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
output = generator.generate(ast)
expected = (
"SELECT * FROM peaks CROSS JOIN LATERAL (\n"
" SELECT genes.*, "
- 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL '
+ 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL '
'WHEN peaks."strand" IS NULL OR genes."strand" IS NULL THEN NULL '
"WHEN peaks.\"strand\" = '.' OR peaks.\"strand\" = '?' THEN NULL "
"WHEN genes.\"strand\" = '.' OR genes.\"strand\" = '?' THEN NULL "
- 'WHEN peaks."start_pos" < genes."end_pos" '
- 'AND peaks."end_pos" > genes."start_pos" THEN 0 '
- 'WHEN peaks."end_pos" <= genes."start_pos" '
+ 'WHEN peaks."start" < genes."end" '
+ 'AND peaks."end" > genes."start" THEN 0 '
+ 'WHEN peaks."end" <= genes."start" '
"THEN CASE WHEN peaks.\"strand\" = '-' "
- 'THEN -(genes."start_pos" - peaks."end_pos") '
- 'ELSE (genes."start_pos" - peaks."end_pos") END '
+ 'THEN -(genes."start" - peaks."end") '
+ 'ELSE (genes."start" - peaks."end") END '
"ELSE CASE WHEN peaks.\"strand\" = '-' "
- 'THEN -(peaks."start_pos" - genes."end_pos") '
- 'ELSE (peaks."start_pos" - genes."end_pos") END END AS distance\n'
+ 'THEN -(peaks."start" - genes."end") '
+ 'ELSE (peaks."start" - genes."end") END END AS distance\n'
" FROM genes\n"
- ' WHERE peaks."chromosome" = genes."chromosome" '
+ ' WHERE peaks."chrom" = genes."chrom" '
'AND peaks."strand" = genes."strand"\n'
" ORDER BY ABS("
- 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL '
+ 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL '
'WHEN peaks."strand" IS NULL OR genes."strand" IS NULL THEN NULL '
"WHEN peaks.\"strand\" = '.' OR peaks.\"strand\" = '?' THEN NULL "
"WHEN genes.\"strand\" = '.' OR genes.\"strand\" = '?' THEN NULL "
- 'WHEN peaks."start_pos" < genes."end_pos" '
- 'AND peaks."end_pos" > genes."start_pos" THEN 0 '
- 'WHEN peaks."end_pos" <= genes."start_pos" '
+ 'WHEN peaks."start" < genes."end" '
+ 'AND peaks."end" > genes."start" THEN 0 '
+ 'WHEN peaks."end" <= genes."start" '
"THEN CASE WHEN peaks.\"strand\" = '-' "
- 'THEN -(genes."start_pos" - peaks."end_pos") '
- 'ELSE (genes."start_pos" - peaks."end_pos") END '
+ 'THEN -(genes."start" - peaks."end") '
+ 'ELSE (genes."start" - peaks."end") END '
"ELSE CASE WHEN peaks.\"strand\" = '-' "
- 'THEN -(peaks."start_pos" - genes."end_pos") '
- 'ELSE (peaks."start_pos" - genes."end_pos") END END)\n'
+ 'THEN -(peaks."start" - genes."end") '
+ 'ELSE (peaks."start" - genes."end") END END)\n'
" LIMIT 3\n"
" )"
)
assert output == expected
- def test_giqlnearest_sql_signed(self, schema_with_peaks_and_genes):
+ def test_giqlnearest_sql_signed(self, tables_with_peaks_and_genes):
"""
GIVEN a GIQLNearest with signed=True
WHEN giqlnearest_sql is called
@@ -636,33 +556,33 @@ def test_giqlnearest_sql_signed(self, schema_with_peaks_and_genes):
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
output = generator.generate(ast)
expected = (
"SELECT * FROM peaks CROSS JOIN LATERAL (\n"
" SELECT genes.*, "
- 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL '
- 'WHEN peaks."start_pos" < genes."end_pos" '
- 'AND peaks."end_pos" > genes."start_pos" THEN 0 '
- 'WHEN peaks."end_pos" <= genes."start_pos" '
- 'THEN (genes."start_pos" - peaks."end_pos") '
- 'ELSE -(peaks."start_pos" - genes."end_pos") END AS distance\n'
+ 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL '
+ 'WHEN peaks."start" < genes."end" '
+ 'AND peaks."end" > genes."start" THEN 0 '
+ 'WHEN peaks."end" <= genes."start" '
+ 'THEN (genes."start" - peaks."end") '
+ 'ELSE -(peaks."start" - genes."end") END AS distance\n'
" FROM genes\n"
- ' WHERE peaks."chromosome" = genes."chromosome"\n'
+ ' WHERE peaks."chrom" = genes."chrom"\n'
" ORDER BY ABS("
- 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL '
- 'WHEN peaks."start_pos" < genes."end_pos" '
- 'AND peaks."end_pos" > genes."start_pos" THEN 0 '
- 'WHEN peaks."end_pos" <= genes."start_pos" '
- 'THEN (genes."start_pos" - peaks."end_pos") '
- 'ELSE -(peaks."start_pos" - genes."end_pos") END)\n'
+ 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL '
+ 'WHEN peaks."start" < genes."end" '
+ 'AND peaks."end" > genes."start" THEN 0 '
+ 'WHEN peaks."end" <= genes."start" '
+ 'THEN (genes."start" - peaks."end") '
+ 'ELSE -(peaks."start" - genes."end") END)\n'
" LIMIT 3\n"
" )"
)
assert output == expected
- def test_giqlnearest_sql_no_lateral_support(self, schema_with_peaks_and_genes):
+ def test_giqlnearest_sql_no_lateral_support(self, tables_with_peaks_and_genes):
"""
GIVEN a GIQLNearest on a generator with SUPPORTS_LATERAL=False
WHEN giqlnearest_sql is called in correlated mode
@@ -677,7 +597,7 @@ class NoLateralGenerator(BaseGIQLGenerator):
sql = "SELECT * FROM peaks CROSS JOIN LATERAL NEAREST(genes, k=3)"
ast = parse_one(sql, dialect=GIQLDialect)
- generator = NoLateralGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = NoLateralGenerator(tables=tables_with_peaks_and_genes)
with pytest.raises(ValueError, match="LATERAL"):
generator.generate(ast)
@@ -688,7 +608,7 @@ class NoLateralGenerator(BaseGIQLGenerator):
max_distance=st.integers(min_value=1, max_value=10_000_000),
)
def test_giqlnearest_sql_parameter_handling_property(
- self, schema_with_peaks_and_genes, k, max_distance
+ self, tables_with_peaks_and_genes, k, max_distance
):
"""
GIVEN any valid k value (positive integer) and max_distance
@@ -701,7 +621,7 @@ def test_giqlnearest_sql_parameter_handling_property(
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
output = generator.generate(ast)
# k should appear in LIMIT
@@ -709,7 +629,7 @@ def test_giqlnearest_sql_parameter_handling_property(
# max_distance should appear in WHERE
assert str(max_distance) in output
- def test_giqldistance_sql_basic(self, schema_with_two_tables):
+ def test_giqldistance_sql_basic(self, tables_with_two_tables):
"""
GIVEN a GIQLDistance with two column references
WHEN giqldistance_sql is called
@@ -721,20 +641,20 @@ def test_giqldistance_sql_basic(self, schema_with_two_tables):
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+ generator = BaseGIQLGenerator(tables=tables_with_two_tables)
output = generator.generate(ast)
expected = (
- 'SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL '
- 'WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" '
- 'THEN 0 WHEN a."end_pos" <= b."start_pos" '
- 'THEN (b."start_pos" - a."end_pos") '
- 'ELSE (a."start_pos" - b."end_pos") END AS dist '
+ 'SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL '
+ 'WHEN a."start" < b."end" AND a."end" > b."start" '
+ 'THEN 0 WHEN a."end" <= b."start" '
+ 'THEN (b."start" - a."end") '
+ 'ELSE (a."start" - b."end") END AS dist '
"FROM features_a AS a CROSS JOIN features_b AS b"
)
assert output == expected
- def test_giqldistance_sql_stranded(self, schema_with_two_tables):
+ def test_giqldistance_sql_stranded(self, tables_with_two_tables):
"""
GIVEN a GIQLDistance with stranded=True
WHEN giqldistance_sql is called
@@ -746,28 +666,28 @@ def test_giqldistance_sql_stranded(self, schema_with_two_tables):
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+ generator = BaseGIQLGenerator(tables=tables_with_two_tables)
output = generator.generate(ast)
expected = (
- 'SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL '
+ 'SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL '
'WHEN a."strand" IS NULL OR b."strand" IS NULL THEN NULL '
"WHEN a.\"strand\" = '.' OR a.\"strand\" = '?' THEN NULL "
"WHEN b.\"strand\" = '.' OR b.\"strand\" = '?' THEN NULL "
- 'WHEN a."start_pos" < b."end_pos" '
- 'AND a."end_pos" > b."start_pos" THEN 0 '
- 'WHEN a."end_pos" <= b."start_pos" '
+ 'WHEN a."start" < b."end" '
+ 'AND a."end" > b."start" THEN 0 '
+ 'WHEN a."end" <= b."start" '
"THEN CASE WHEN a.\"strand\" = '-' "
- 'THEN -(b."start_pos" - a."end_pos") '
- 'ELSE (b."start_pos" - a."end_pos") END '
+ 'THEN -(b."start" - a."end") '
+ 'ELSE (b."start" - a."end") END '
"ELSE CASE WHEN a.\"strand\" = '-' "
- 'THEN -(a."start_pos" - b."end_pos") '
- 'ELSE (a."start_pos" - b."end_pos") END END AS dist '
+ 'THEN -(a."start" - b."end") '
+ 'ELSE (a."start" - b."end") END END AS dist '
"FROM features_a AS a CROSS JOIN features_b AS b"
)
assert output == expected
- def test_giqldistance_sql_signed(self, schema_with_two_tables):
+ def test_giqldistance_sql_signed(self, tables_with_two_tables):
"""
GIVEN a GIQLDistance with signed=True
WHEN giqldistance_sql is called
@@ -779,20 +699,20 @@ def test_giqldistance_sql_signed(self, schema_with_two_tables):
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+ generator = BaseGIQLGenerator(tables=tables_with_two_tables)
output = generator.generate(ast)
expected = (
- 'SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL '
- 'WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" '
- 'THEN 0 WHEN a."end_pos" <= b."start_pos" '
- 'THEN (b."start_pos" - a."end_pos") '
- 'ELSE -(a."start_pos" - b."end_pos") END AS dist '
+ 'SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL '
+ 'WHEN a."start" < b."end" AND a."end" > b."start" '
+ 'THEN 0 WHEN a."end" <= b."start" '
+ 'THEN (b."start" - a."end") '
+ 'ELSE -(a."start" - b."end") END AS dist '
"FROM features_a AS a CROSS JOIN features_b AS b"
)
assert output == expected
- def test_giqldistance_sql_stranded_and_signed(self, schema_with_two_tables):
+ def test_giqldistance_sql_stranded_and_signed(self, tables_with_two_tables):
"""
GIVEN a GIQLDistance with both stranded and signed=True
WHEN giqldistance_sql is called
@@ -805,48 +725,37 @@ def test_giqldistance_sql_stranded_and_signed(self, schema_with_two_tables):
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+ generator = BaseGIQLGenerator(tables=tables_with_two_tables)
output = generator.generate(ast)
expected = (
- 'SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL '
+ 'SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL '
'WHEN a."strand" IS NULL OR b."strand" IS NULL THEN NULL '
"WHEN a.\"strand\" = '.' OR a.\"strand\" = '?' THEN NULL "
"WHEN b.\"strand\" = '.' OR b.\"strand\" = '?' THEN NULL "
- 'WHEN a."start_pos" < b."end_pos" '
- 'AND a."end_pos" > b."start_pos" THEN 0 '
- 'WHEN a."end_pos" <= b."start_pos" '
+ 'WHEN a."start" < b."end" '
+ 'AND a."end" > b."start" THEN 0 '
+ 'WHEN a."end" <= b."start" '
"THEN CASE WHEN a.\"strand\" = '-' "
- 'THEN -(b."start_pos" - a."end_pos") '
- 'ELSE (b."start_pos" - a."end_pos") END '
+ 'THEN -(b."start" - a."end") '
+ 'ELSE (b."start" - a."end") END '
"ELSE CASE WHEN a.\"strand\" = '-' "
- 'THEN (a."start_pos" - b."end_pos") '
- 'ELSE -(a."start_pos" - b."end_pos") END END AS dist '
+ 'THEN (a."start" - b."end") '
+ 'ELSE -(a."start" - b."end") END END AS dist '
"FROM features_a AS a CROSS JOIN features_b AS b"
)
assert output == expected
- def test_giqldistance_with_closed_intervals(self, schema_with_closed_intervals):
+ def test_giqldistance_with_closed_intervals(self, tables_with_closed_intervals):
"""
GIVEN intervals from table with CLOSED interval type
WHEN Distance calculation is performed
THEN Distance includes +1 adjustment (bedtools compatibility).
"""
- # Create a second table with closed intervals for distance calculation
- schema = schema_with_closed_intervals
- table_b = TableSchema(name="bed_features_b", columns={})
- table_b.columns["id"] = ColumnInfo(name="id", type="INTEGER")
- table_b.columns["interval"] = ColumnInfo(
- name="interval",
- type="VARCHAR",
- is_genomic=True,
- chrom_col="chromosome",
- start_col="start_pos",
- end_col="end_pos",
- strand_col="strand",
- interval_type=IntervalType.CLOSED,
+ # Add a second table with closed intervals for distance calculation
+ tables_with_closed_intervals.register(
+ "bed_features_b", Table("bed_features_b", interval_type="closed")
)
- schema.tables["bed_features_b"] = table_b
sql = (
"SELECT DISTANCE(a.interval, b.interval) as dist "
@@ -854,16 +763,16 @@ def test_giqldistance_with_closed_intervals(self, schema_with_closed_intervals):
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema)
+ generator = BaseGIQLGenerator(tables=tables_with_closed_intervals)
output = generator.generate(ast)
expected = (
- 'SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL '
- 'WHEN a."start_pos" < b."end_pos" '
- 'AND a."end_pos" > b."start_pos" THEN 0 '
- 'WHEN a."end_pos" <= b."start_pos" '
- 'THEN (b."start_pos" - a."end_pos" + 1) '
- 'ELSE (a."start_pos" - b."end_pos" + 1) END AS dist '
+ 'SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL '
+ 'WHEN a."start" < b."end" '
+ 'AND a."end" > b."start" THEN 0 '
+ 'WHEN a."end" <= b."start" '
+ 'THEN (b."start" - a."end" + 1) '
+ 'ELSE (a."start" - b."end" + 1) END AS dist '
"FROM bed_features AS a CROSS JOIN bed_features_b AS b"
)
assert output == expected
@@ -901,7 +810,7 @@ def test_error_handling_unknown_operation(self):
with pytest.raises(ValueError):
generator.generate(ast)
- def test_select_sql_join_without_alias(self, schema_with_two_tables):
+ def test_select_sql_join_without_alias(self, tables_with_two_tables):
"""
GIVEN a SELECT with JOIN where joined table has no alias
WHEN select_sql is called
@@ -910,7 +819,7 @@ def test_select_sql_join_without_alias(self, schema_with_two_tables):
sql = "SELECT * FROM features_a JOIN features_b ON features_a.id = features_b.id"
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+ generator = BaseGIQLGenerator(tables=tables_with_two_tables)
output = generator.generate(ast)
expected = (
@@ -919,7 +828,7 @@ def test_select_sql_join_without_alias(self, schema_with_two_tables):
assert output == expected
def test_giqlnearest_sql_stranded_literal_with_strand(
- self, schema_with_peaks_and_genes
+ self, tables_with_peaks_and_genes
):
"""
GIVEN a GIQLNearest with stranded=True and literal reference containing strand
@@ -932,7 +841,7 @@ def test_giqlnearest_sql_stranded_literal_with_strand(
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
output = generator.generate(ast)
# Should contain strand literal '+' and strand filtering
@@ -940,7 +849,7 @@ def test_giqlnearest_sql_stranded_literal_with_strand(
assert 'genes."strand"' in output
def test_giqlnearest_sql_stranded_implicit_reference(
- self, schema_with_peaks_and_genes
+ self, tables_with_peaks_and_genes
):
"""
GIVEN a GIQLNearest in correlated mode with implicit reference and stranded=True
@@ -950,7 +859,7 @@ def test_giqlnearest_sql_stranded_implicit_reference(
sql = "SELECT * FROM peaks CROSS JOIN LATERAL NEAREST(genes, k=3, stranded=true)"
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
output = generator.generate(ast)
# Should have strand columns from both tables
@@ -963,31 +872,19 @@ def test_giqlnearest_sql_closed_intervals(self):
WHEN giqlnearest_sql is called
THEN Distance calculation includes +1 adjustment for bedtools compatibility.
"""
- schema = SchemaInfo()
- genes_closed = TableSchema(name="genes_closed", columns={})
- genes_closed.columns["gene_id"] = ColumnInfo(name="gene_id", type="INTEGER")
- genes_closed.columns["interval"] = ColumnInfo(
- name="interval",
- type="VARCHAR",
- is_genomic=True,
- chrom_col="chromosome",
- start_col="start_pos",
- end_col="end_pos",
- strand_col="strand",
- interval_type=IntervalType.CLOSED,
- )
- schema.tables["genes_closed"] = genes_closed
+ tables = Tables()
+ tables.register("genes_closed", Table("genes_closed", interval_type="closed"))
sql = "SELECT * FROM NEAREST(genes_closed, reference='chr1:1000-2000', k=3)"
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema)
+ generator = BaseGIQLGenerator(tables=tables)
output = generator.generate(ast)
# Should have +1 adjustment for closed intervals
assert "+ 1)" in output
- def test_giqldistance_sql_literal_first_arg_error(self, schema_with_two_tables):
+ def test_giqldistance_sql_literal_first_arg_error(self, tables_with_two_tables):
"""
GIVEN a GIQLDistance with literal range as first argument
WHEN giqldistance_sql is called
@@ -996,12 +893,12 @@ def test_giqldistance_sql_literal_first_arg_error(self, schema_with_two_tables):
sql = "SELECT DISTANCE('chr1:1000-2000', b.interval) as dist FROM features_b b"
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+ generator = BaseGIQLGenerator(tables=tables_with_two_tables)
with pytest.raises(ValueError, match="Literal range as first argument"):
generator.generate(ast)
- def test_giqldistance_sql_literal_second_arg_error(self, schema_with_two_tables):
+ def test_giqldistance_sql_literal_second_arg_error(self, tables_with_two_tables):
"""
GIVEN a GIQLDistance with literal range as second argument
WHEN giqldistance_sql is called
@@ -1010,13 +907,13 @@ def test_giqldistance_sql_literal_second_arg_error(self, schema_with_two_tables)
sql = "SELECT DISTANCE(a.interval, 'chr1:1000-2000') as dist FROM features_a a"
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+ generator = BaseGIQLGenerator(tables=tables_with_two_tables)
with pytest.raises(ValueError, match="Literal range as second argument"):
generator.generate(ast)
def test_giqlnearest_sql_missing_outer_table_error(
- self, schema_with_peaks_and_genes
+ self, tables_with_peaks_and_genes
):
"""
GIVEN a GIQLNearest in correlated mode without reference where outer table
@@ -1030,82 +927,33 @@ def test_giqlnearest_sql_missing_outer_table_error(
k=exp.Literal.number(3),
)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
with pytest.raises(ValueError, match="Could not find outer table"):
generator.giqlnearest_sql(nearest)
- def test_giqlnearest_sql_outer_table_not_in_schema(self):
+ def test_giqlnearest_sql_outer_table_not_in_tables(self):
"""
- GIVEN a GIQLNearest in correlated mode where outer table is not in schema
+ GIVEN a GIQLNearest in correlated mode where outer table is not registered
WHEN giqlnearest_sql is called
THEN ValueError is raised describing the issue.
"""
- schema = SchemaInfo()
- genes_table = TableSchema(name="genes", columns={})
- genes_table.columns["gene_id"] = ColumnInfo(name="gene_id", type="INTEGER")
- genes_table.columns["interval"] = ColumnInfo(
- name="interval",
- type="VARCHAR",
- is_genomic=True,
- chrom_col="chromosome",
- start_col="start_pos",
- end_col="end_pos",
- strand_col="strand",
- )
- schema.tables["genes"] = genes_table
+ tables = Tables()
+ tables.register("genes", Table("genes"))
nearest = GIQLNearest(
this=exp.Table(this=exp.Identifier(this="genes")),
k=exp.Literal.number(3),
)
- generator = BaseGIQLGenerator(schema_info=schema)
+ generator = BaseGIQLGenerator(tables=tables)
generator._alias_to_table = {"unknown_table": "unknown_table"}
generator._find_outer_table_in_lateral_join = lambda x: "unknown_table"
- with pytest.raises(ValueError, match="not found in schema"):
+ with pytest.raises(ValueError, match="not found in tables"):
generator.giqlnearest_sql(nearest)
- def test_giqlnearest_sql_no_genomic_column_in_outer(self):
- """
- GIVEN a GIQLNearest in correlated mode where outer table has no genomic column
- WHEN giqlnearest_sql is called
- THEN ValueError is raised about missing genomic column.
- """
- schema = SchemaInfo()
-
- outer_table = TableSchema(name="outer_table", columns={})
- outer_table.columns["id"] = ColumnInfo(name="id", type="INTEGER")
- outer_table.columns["name"] = ColumnInfo(name="name", type="VARCHAR")
- schema.tables["outer_table"] = outer_table
-
- genes_table = TableSchema(name="genes", columns={})
- genes_table.columns["gene_id"] = ColumnInfo(name="gene_id", type="INTEGER")
- genes_table.columns["interval"] = ColumnInfo(
- name="interval",
- type="VARCHAR",
- is_genomic=True,
- chrom_col="chromosome",
- start_col="start_pos",
- end_col="end_pos",
- strand_col="strand",
- )
- schema.tables["genes"] = genes_table
-
- nearest = GIQLNearest(
- this=exp.Table(this=exp.Identifier(this="genes")),
- k=exp.Literal.number(3),
- )
-
- generator = BaseGIQLGenerator(schema_info=schema)
- generator._alias_to_table = {"outer_table": "outer_table"}
- generator._find_outer_table_in_lateral_join = lambda x: "outer_table"
-
- with pytest.raises(ValueError, match="No genomic column found"):
- generator.giqlnearest_sql(nearest)
-
- def test_giqlnearest_sql_invalid_reference_range(self, schema_with_peaks_and_genes):
+ def test_giqlnearest_sql_invalid_reference_range(self, tables_with_peaks_and_genes):
"""
GIVEN a GIQLNearest with invalid/unparseable reference range string
WHEN giqlnearest_sql is called
@@ -1114,58 +962,38 @@ def test_giqlnearest_sql_invalid_reference_range(self, schema_with_peaks_and_gen
sql = "SELECT * FROM NEAREST(genes, reference='invalid_range', k=3)"
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
with pytest.raises(ValueError, match="Could not parse reference genomic range"):
generator.generate(ast)
- def test_giqlnearest_sql_no_schema_error(self):
+ def test_giqlnearest_sql_no_tables_error(self):
"""
- GIVEN a GIQLNearest without schema_info provided (empty schema)
+ GIVEN a GIQLNearest without tables registered
WHEN giqlnearest_sql is called
THEN ValueError is raised because target table cannot be resolved.
"""
sql = "SELECT * FROM NEAREST(genes, reference='chr1:1000-2000', k=3)"
ast = parse_one(sql, dialect=GIQLDialect)
- # Generator with empty schema - table won't be found
+ # Generator with empty tables - table won't be found
generator = BaseGIQLGenerator()
- with pytest.raises(ValueError, match="not found in schema"):
+ with pytest.raises(ValueError, match="not found in tables"):
generator.generate(ast)
- def test_giqlnearest_sql_target_not_in_schema(self, schema_with_peaks_and_genes):
+ def test_giqlnearest_sql_target_not_in_tables(self, tables_with_peaks_and_genes):
"""
- GIVEN a GIQLNearest with target table not found in schema
+ GIVEN a GIQLNearest with target table not registered
WHEN giqlnearest_sql is called
THEN ValueError is raised listing available tables.
"""
sql = "SELECT * FROM NEAREST(unknown_table, reference='chr1:1000-2000', k=3)"
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
-
- with pytest.raises(ValueError, match="not found in schema"):
- generator.generate(ast)
-
- def test_giqlnearest_sql_target_no_genomic_column(self):
- """
- GIVEN a GIQLNearest with target table having no genomic column defined
- WHEN giqlnearest_sql is called
- THEN ValueError is raised about missing genomic column.
- """
- schema = SchemaInfo()
- no_genomic_table = TableSchema(name="no_genomic", columns={})
- no_genomic_table.columns["id"] = ColumnInfo(name="id", type="INTEGER")
- no_genomic_table.columns["name"] = ColumnInfo(name="name", type="VARCHAR")
- schema.tables["no_genomic"] = no_genomic_table
-
- sql = "SELECT * FROM NEAREST(no_genomic, reference='chr1:1000-2000', k=3)"
- ast = parse_one(sql, dialect=GIQLDialect)
-
- generator = BaseGIQLGenerator(schema_info=schema)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
- with pytest.raises(ValueError, match="does not have a genomic column"):
+ with pytest.raises(ValueError, match="not found in tables"):
generator.generate(ast)
def test_intersects_sql_unqualified_column(self):
@@ -1182,12 +1010,12 @@ def test_intersects_sql_unqualified_column(self):
expected = (
"SELECT * FROM variants WHERE "
- '("chromosome" = \'chr1\' AND "start_pos" < 2000 AND "end_pos" > 1000)'
+ '("chrom" = \'chr1\' AND "start" < 2000 AND "end" > 1000)'
)
assert output == expected
def test_giqlnearest_sql_stranded_unqualified_reference(
- self, schema_with_peaks_and_genes
+ self, tables_with_peaks_and_genes
):
"""
GIVEN a GIQLNearest with stranded=True and unqualified column reference
@@ -1204,7 +1032,7 @@ def test_giqlnearest_sql_stranded_unqualified_reference(
stranded=exp.Boolean(this=True),
)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
output = generator.giqlnearest_sql(nearest)
# Should produce valid output with unqualified strand column
@@ -1212,7 +1040,7 @@ def test_giqlnearest_sql_stranded_unqualified_reference(
# The strand column should be unqualified (no table prefix)
assert '"strand"' in output
- def test_giqlnearest_sql_identifier_target(self, schema_with_peaks_and_genes):
+ def test_giqlnearest_sql_identifier_target(self, tables_with_peaks_and_genes):
"""
GIVEN a GIQLNearest where target is an Identifier (not Table or Column)
WHEN giqlnearest_sql is called
@@ -1227,7 +1055,7 @@ def test_giqlnearest_sql_identifier_target(self, schema_with_peaks_and_genes):
k=exp.Literal.number(3),
)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
output = generator.giqlnearest_sql(nearest)
# Should succeed and produce valid SQL
@@ -1239,7 +1067,7 @@ def test_giqlnearest_sql_identifier_target(self, schema_with_peaks_and_genes):
)
@settings(suppress_health_check=[HealthCheck.function_scoped_fixture])
def test_giqldistance_stranded_param_truthy_values_property(
- self, schema_with_two_tables, bool_repr
+ self, tables_with_two_tables, bool_repr
):
"""
GIVEN a GIQLDistance with stranded parameter in various truthy representations
@@ -1252,7 +1080,7 @@ def test_giqldistance_stranded_param_truthy_values_property(
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+ generator = BaseGIQLGenerator(tables=tables_with_two_tables)
output = generator.generate(ast)
# Should include strand handling (NULL checks for strand columns)
@@ -1264,7 +1092,7 @@ def test_giqldistance_stranded_param_truthy_values_property(
)
@settings(suppress_health_check=[HealthCheck.function_scoped_fixture])
def test_giqldistance_stranded_param_falsy_values_property(
- self, schema_with_two_tables, bool_repr
+ self, tables_with_two_tables, bool_repr
):
"""
GIVEN a GIQLDistance with stranded parameter in various falsy representations
@@ -1277,7 +1105,7 @@ def test_giqldistance_stranded_param_falsy_values_property(
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+ generator = BaseGIQLGenerator(tables=tables_with_two_tables)
output = generator.generate(ast)
# Should NOT include strand NULL checks (basic distance)
@@ -1288,7 +1116,7 @@ def test_giqldistance_stranded_param_falsy_values_property(
)
@settings(suppress_health_check=[HealthCheck.function_scoped_fixture])
def test_giqldistance_signed_param_truthy_values_property(
- self, schema_with_two_tables, bool_repr
+ self, tables_with_two_tables, bool_repr
):
"""
GIVEN a GIQLDistance with signed parameter in various truthy representations
@@ -1301,7 +1129,7 @@ def test_giqldistance_signed_param_truthy_values_property(
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+ generator = BaseGIQLGenerator(tables=tables_with_two_tables)
output = generator.generate(ast)
# Signed distance has negative sign for upstream intervals
@@ -1312,7 +1140,7 @@ def test_giqldistance_signed_param_truthy_values_property(
)
@settings(suppress_health_check=[HealthCheck.function_scoped_fixture])
def test_giqldistance_signed_param_falsy_values_property(
- self, schema_with_two_tables, bool_repr
+ self, tables_with_two_tables, bool_repr
):
"""
GIVEN a GIQLDistance with signed parameter in various falsy representations
@@ -1325,7 +1153,7 @@ def test_giqldistance_signed_param_falsy_values_property(
)
ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+ generator = BaseGIQLGenerator(tables=tables_with_two_tables)
output = generator.generate(ast)
# Unsigned distance has no negative sign (both ELSE branches are positive)
diff --git a/tests/integration/bedtools/__init__.py b/tests/integration/bedtools/__init__.py
deleted file mode 100644
index 0a2c30b..0000000
--- a/tests/integration/bedtools/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-"""Bedtools integration tests for GIQL.
-
-This package contains integration tests that validate GIQL query results
-against bedtools command outputs using simulated genomic datasets.
-"""
diff --git a/tests/integration/bedtools/conftest.py b/tests/integration/bedtools/conftest.py
deleted file mode 100644
index af9387f..0000000
--- a/tests/integration/bedtools/conftest.py
+++ /dev/null
@@ -1,46 +0,0 @@
-"""Pytest fixtures for bedtools integration tests.
-
-This module provides shared fixtures for:
-- DuckDB connections
-- Interval generators
-"""
-
-import pytest
-
-from .utils.data_models import IntervalGeneratorConfig
-from .utils.interval_generator import IntervalGenerator
-
-
-@pytest.fixture(scope="function")
-def duckdb_connection():
- """Provide clean DuckDB connection for each test.
-
- Yields:
- DuckDB connection to in-memory database
-
- Note:
- Each test gets a fresh database with no shared state.
- Connection is automatically closed after test.
- """
- try:
- import duckdb
- except ImportError:
- pytest.skip("DuckDB not installed. Install with: pip install duckdb")
-
- conn = duckdb.connect(":memory:")
- yield conn
- conn.close()
-
-
-@pytest.fixture(scope="function")
-def interval_generator():
- """Provide configured interval generator.
-
- Returns:
- IntervalGenerator with deterministic seed
-
- Note:
- Uses seed=42 for reproducible test data.
- """
- config = IntervalGeneratorConfig(seed=42)
- return IntervalGenerator(config)
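A minimal consumer of the duckdb_connection fixture above, showing the
per-test isolation it promises (a hypothetical test, not one from the
deleted suite):

def test_connection_is_isolated(duckdb_connection):
    # Each test sees a fresh in-memory database, so this table cannot leak.
    duckdb_connection.execute("CREATE TABLE t (x INTEGER)")
    duckdb_connection.execute("INSERT INTO t VALUES (1)")
    assert duckdb_connection.execute("SELECT count(*) FROM t").fetchone()[0] == 1
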
diff --git a/tests/integration/bedtools/test_intersect.py b/tests/integration/bedtools/test_intersect.py
deleted file mode 100644
index cfc4394..0000000
--- a/tests/integration/bedtools/test_intersect.py
+++ /dev/null
@@ -1,313 +0,0 @@
-"""Integration tests for GIQL INTERSECTS operator.
-
-These tests validate that GIQL's INTERSECTS operator produces identical
-results to bedtools intersect command.
-"""
-
-from giql import GIQLEngine
-
-from .utils.bed_export import load_intervals
-from .utils.bedtools_wrapper import intersect
-from .utils.comparison import compare_results
-from .utils.data_models import GenomicInterval
-
-
-def _setup_giql_engine(duckdb_connection):
- """Helper to set up GIQL engine with table schemas."""
- engine = GIQLEngine(target_dialect="duckdb", verbose=False)
- engine.conn = duckdb_connection
-
- schema = {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "name": "VARCHAR",
- "score": "BIGINT",
- "strand": "VARCHAR",
- }
-
- engine.register_table_schema("intervals_a", schema, genomic_column="interval")
- engine.register_table_schema("intervals_b", schema, genomic_column="interval")
-
- return engine
-
-
-def test_intersect_basic_overlap(duckdb_connection, interval_generator):
- """Test INTERSECTS predicate finds overlapping intervals.
-
- Given:
- Two tables with genomic intervals where some intervals overlap
- When:
- A GIQL query uses INTERSECTS predicate in WHERE clause
- Then:
- Results match bedtools intersect output exactly
- """
- # Arrange: Create overlapping intervals
- intervals_a = [
- GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
- GenomicInterval("chr1", 150, 250, "a2", 200, "+"),
- GenomicInterval("chr1", 300, 400, "a3", 150, "-"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 180, 220, "b1", 100, "+"),
- GenomicInterval("chr1", 350, 450, "b2", 200, "-"),
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools
- bedtools_result = intersect(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute GIQL query
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT DISTINCT a.*
- FROM intervals_a a, intervals_b b
- WHERE a.interval INTERSECTS b.interval
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_intersect_partial_overlap(duckdb_connection, interval_generator):
- """Test INTERSECTS with partially overlapping intervals.
-
- Given:
- Intervals with partial overlaps
- When:
- INTERSECTS query is executed
- Then:
- Results match bedtools partial overlap behavior
- """
- # Arrange
- intervals_a = [
- GenomicInterval("chr1", 100, 250, "a1", 100, "+"),
- GenomicInterval("chr1", 300, 400, "a2", 200, "+"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 200, 350, "b1", 150, "+"),
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools
- bedtools_result = intersect(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute GIQL query
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT DISTINCT a.*
- FROM intervals_a a, intervals_b b
- WHERE a.interval INTERSECTS b.interval
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_intersect_no_overlap(duckdb_connection, interval_generator):
- """Test INTERSECTS with non-overlapping intervals.
-
- Given:
- Two sets of intervals with no overlaps
- When:
- INTERSECTS query is executed
- Then:
- No results returned (matches bedtools empty output)
- """
- # Arrange
- intervals_a = [
- GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 300, 400, "b1", 150, "+"),
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools
- bedtools_result = intersect(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute GIQL query
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT DISTINCT a.*
- FROM intervals_a a, intervals_b b
- WHERE a.interval INTERSECTS b.interval
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_intersect_adjacent_intervals(duckdb_connection, interval_generator):
- """Test INTERSECTS with adjacent (touching) intervals.
-
- Given:
- Intervals that touch but don't overlap
- When:
- INTERSECTS query is executed
- Then:
- No results returned (adjacent != overlapping)
- """
- # Arrange: Adjacent intervals (end of a1 == start of b1)
- intervals_a = [
- GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 200, 300, "b1", 150, "+"),
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools
- bedtools_result = intersect(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute GIQL query
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT DISTINCT a.*
- FROM intervals_a a, intervals_b b
- WHERE a.interval INTERSECTS b.interval
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_intersect_multiple_chromosomes(duckdb_connection, interval_generator):
- """Test INTERSECTS across multiple chromosomes.
-
- Given:
- Intervals on different chromosomes
- When:
- INTERSECTS query is executed
- Then:
- Only same-chromosome overlaps are returned
- """
- # Arrange
- intervals_a = [
- GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
- GenomicInterval("chr2", 150, 250, "a2", 200, "+"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 150, 250, "b1", 150, "+"),
- GenomicInterval("chr2", 200, 300, "b2", 100, "+"),
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools
- bedtools_result = intersect(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute GIQL query
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT DISTINCT a.*
- FROM intervals_a a, intervals_b b
- WHERE a.interval INTERSECTS b.interval
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
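All five tests above pivot on the same half-open overlap rule: adjacent
(book-ended) intervals must not match, and overlaps never cross chromosomes.
A reference predicate for that rule, as a sketch rather than GIQL's actual
generated SQL:

def intersects(a_chrom, a_start, a_end, b_chrom, b_start, b_end) -> bool:
    # Half-open [start, end): each interval must start before the other ends.
    return a_chrom == b_chrom and a_start < b_end and b_start < a_end

assert intersects("chr1", 100, 200, "chr1", 180, 220)      # partial overlap
assert not intersects("chr1", 100, 200, "chr1", 200, 300)  # book-ended
assert not intersects("chr1", 100, 200, "chr2", 150, 250)  # different chromosome
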
diff --git a/tests/integration/bedtools/test_merge.py b/tests/integration/bedtools/test_merge.py
deleted file mode 100644
index 51fea31..0000000
--- a/tests/integration/bedtools/test_merge.py
+++ /dev/null
@@ -1,224 +0,0 @@
-"""Integration tests for GIQL MERGE operator.
-
-These tests validate that GIQL's MERGE operator produces identical
-results to bedtools merge command.
-"""
-
-from giql import GIQLEngine
-
-from .utils.bed_export import load_intervals
-from .utils.bedtools_wrapper import merge
-from .utils.comparison import compare_results
-from .utils.data_models import GenomicInterval
-
-
-def _setup_giql_engine(duckdb_connection):
- """Helper to set up GIQL engine with table schema."""
- engine = GIQLEngine(target_dialect="duckdb", verbose=False)
- engine.conn = duckdb_connection
-
- schema = {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "name": "VARCHAR",
- "score": "BIGINT",
- "strand": "VARCHAR",
- }
-
- engine.register_table_schema(
- "intervals",
- schema,
- genomic_column="interval",
- )
-
- return engine
-
-
-def test_merge_adjacent_intervals(duckdb_connection):
- """Test MERGE with adjacent intervals.
-
- Given:
- A set of adjacent intervals
- When:
- MERGE operator is applied
- Then:
- Adjacent intervals are merged into single intervals
- """
- # Arrange
- intervals = [
- GenomicInterval("chr1", 100, 200, "i1", 100, "+"),
- GenomicInterval("chr1", 200, 300, "i2", 150, "+"),
- GenomicInterval("chr1", 300, 400, "i3", 200, "+"),
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals],
- )
-
- # Act: Execute bedtools operation using pybedtools
- bedtools_result = merge(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals]
- )
-
- # Act: Execute GIQL query
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT MERGE(interval)
- FROM intervals
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_merge_overlapping_intervals(duckdb_connection):
- """Test MERGE with overlapping intervals.
-
- Given:
- A set of overlapping intervals
- When:
- MERGE operator is applied
- Then:
- Overlapping intervals are merged
- """
- # Arrange
- intervals = [
- GenomicInterval("chr1", 100, 250, "i1", 100, "+"),
- GenomicInterval("chr1", 200, 350, "i2", 150, "+"),
- GenomicInterval("chr1", 300, 400, "i3", 200, "+"),
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals],
- )
-
- # Act: Execute bedtools operation using pybedtools
- bedtools_result = merge(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals]
- )
-
- # Act: Execute GIQL query
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT MERGE(interval)
- FROM intervals
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_merge_separated_intervals(duckdb_connection):
- """Test MERGE with separated intervals.
-
- Given:
- Intervals with gaps between them
- When:
- MERGE operator is applied
- Then:
- Separated intervals remain separate
- """
- # Arrange
- intervals = [
- GenomicInterval("chr1", 100, 200, "i1", 100, "+"),
- GenomicInterval("chr1", 300, 400, "i2", 150, "+"),
- GenomicInterval("chr1", 500, 600, "i3", 200, "+"),
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals],
- )
-
- # Act: Execute bedtools operation using pybedtools
- bedtools_result = merge(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals]
- )
-
- # Act: Execute GIQL query
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT MERGE(interval)
- FROM intervals
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_merge_multiple_chromosomes(duckdb_connection):
- """Test MERGE across multiple chromosomes.
-
- Given:
- Intervals on different chromosomes
- When:
- MERGE operator is applied
- Then:
- Merging occurs per chromosome
- """
- # Arrange
- intervals = [
- GenomicInterval("chr1", 100, 200, "i1", 100, "+"),
- GenomicInterval("chr1", 180, 300, "i2", 150, "+"),
- GenomicInterval("chr2", 100, 200, "i3", 100, "+"),
- GenomicInterval("chr2", 180, 300, "i4", 150, "+"),
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals],
- )
-
- # Act: Execute bedtools operation using pybedtools
- bedtools_result = merge(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals]
- )
-
- # Act: Execute GIQL query
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT MERGE(interval)
- FROM intervals
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
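The merge semantics these tests expect (sort by chromosome and start, then
fold an interval into the running one when it overlaps or is book-ended) fit
in a few lines. A sketch of bedtools merge's default behavior, not of GIQL's
MERGE implementation:

def merge_intervals(intervals):
    """intervals: iterable of (chrom, start, end); returns merged BED3 rows."""
    merged = []
    for chrom, start, end in sorted(intervals):
        if merged and merged[-1][0] == chrom and start <= merged[-1][2]:
            # Overlapping or book-ended: extend the current merged interval.
            merged[-1] = (chrom, merged[-1][1], max(merged[-1][2], end))
        else:
            merged.append((chrom, start, end))
    return merged

assert merge_intervals(
    [("chr1", 100, 200), ("chr1", 200, 300), ("chr1", 500, 600)]
) == [("chr1", 100, 300), ("chr1", 500, 600)]
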
diff --git a/tests/integration/bedtools/test_nearest.py b/tests/integration/bedtools/test_nearest.py
deleted file mode 100644
index 30505ac..0000000
--- a/tests/integration/bedtools/test_nearest.py
+++ /dev/null
@@ -1,468 +0,0 @@
-"""Integration tests for GIQL NEAREST operator.
-
-These tests validate that GIQL's NEAREST operator produces identical
-results to bedtools closest command.
-"""
-
-from giql import GIQLEngine
-
-from .utils.bed_export import load_intervals
-from .utils.bedtools_wrapper import closest
-from .utils.comparison import compare_results
-from .utils.data_models import GenomicInterval
-
-
-def _setup_giql_engine(duckdb_connection):
- """Helper to set up GIQL engine with table schemas."""
- engine = GIQLEngine(target_dialect="duckdb", verbose=False)
- engine.conn = duckdb_connection
-
- schema = {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "name": "VARCHAR",
- "score": "BIGINT",
- "strand": "VARCHAR",
- }
-
- engine.register_table_schema(
- "intervals_a",
- schema,
- genomic_column="interval",
- interval_type="closed", # Match bedtools distance calculation
- )
- engine.register_table_schema(
- "intervals_b",
- schema,
- genomic_column="interval",
- interval_type="closed", # Match bedtools distance calculation
- )
-
- return engine
-
-
-def test_nearest_non_overlapping(duckdb_connection):
- """Test NEAREST with non-overlapping intervals.
-
- Given:
- Two sets of non-overlapping intervals
- When:
- NEAREST operator is applied
- Then:
- Each interval in A finds its closest neighbor in B
- """
- # Arrange
- intervals_a = [
- GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
- GenomicInterval("chr1", 500, 600, "a2", 150, "+"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 250, 300, "b1", 100, "+"),
- GenomicInterval("chr1", 350, 400, "b2", 150, "+"),
- GenomicInterval("chr1", 700, 800, "b3", 200, "+"),
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools
- bedtools_result = closest(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute GIQL query
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT a.*, b.*
- FROM intervals_a a, NEAREST(intervals_b, k=1) b
- ORDER BY a.chromosome, a.start_pos
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_nearest_multiple_candidates(duckdb_connection):
- """Test NEAREST with equidistant intervals.
-
- Given:
- Interval in A with multiple equidistant intervals in B
- When:
- NEAREST operator is applied
- Then:
- Bedtools reports one of the equidistant intervals (tie-breaking behavior)
- """
- # Arrange: a1 is equidistant from b1 and b2
- intervals_a = [
- GenomicInterval("chr1", 300, 400, "a1", 100, "+"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 100, 200, "b1", 100, "+"), # Distance: 100 bp
- GenomicInterval("chr1", 500, 600, "b2", 150, "+"), # Distance: 100 bp
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools
- bedtools_result = closest(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute GIQL query
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT a.*, b.*
- FROM intervals_a a, NEAREST(intervals_b, k=1) b
- ORDER BY a.chromosome, a.start_pos
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results (allowing tie-breaking differences)
- assert len(giql_result) == len(bedtools_result)
- # The nearest interval is either b1 or b2 (both equidistant)
- assert giql_result[0][3] == "a1" # Interval A name
- assert giql_result[0][9] in ("b1", "b2") # Nearest could be either
-
-
-def test_nearest_cross_chromosome(duckdb_connection):
- """Test NEAREST across multiple chromosomes.
-
- Given:
- Intervals on different chromosomes
- When:
- NEAREST operator is applied
- Then:
- Each interval finds nearest only on same chromosome
- """
- # Arrange
- intervals_a = [
- GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
- GenomicInterval("chr2", 100, 200, "a2", 150, "+"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 300, 400, "b1", 100, "+"),
- GenomicInterval("chr2", 300, 400, "b2", 150, "+"),
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools
- bedtools_result = closest(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute GIQL query
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT a.*, b.*
- FROM intervals_a a, NEAREST(intervals_b, k=1) b
- ORDER BY a.chromosome, a.start_pos
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_nearest_boundary_cases(duckdb_connection):
- """Test NEAREST with boundary cases.
-
- Given:
- Adjacent intervals (touching but not overlapping)
- When:
- NEAREST operator is applied
- Then:
- Adjacent intervals are reported as nearest (distance = 1, per bedtools' book-ended convention)
- """
- # Arrange: a1 ends where b1 starts (book-ended, distance = 200 - 200 + 1 = 1)
- intervals_a = [
- GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 200, 300, "b1", 150, "+"), # Adjacent to a1
- GenomicInterval("chr1", 500, 600, "b2", 200, "+"), # Far away
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools
- bedtools_result = closest(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute GIQL query
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT a.*, b.*
- FROM intervals_a a, NEAREST(intervals_b, k=1) b
- ORDER BY a.chromosome, a.start_pos
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_nearest_signed_distance(duckdb_connection):
- """Test NEAREST with signed=true for directional distance.
-
- Given:
- Intervals in A with an upstream neighbor in B
- When:
- NEAREST operator is applied with signed=true
- Then:
- Distance is negative for upstream B intervals (B ends before A starts)
- This matches bedtools closest -D ref behavior
- """
- # Arrange: a1 has an upstream neighbor (b1)
- # a1 at [300-400], b1 at [100-200] (upstream, distance = -(300-200+1) = -101)
- intervals_a = [
- GenomicInterval("chr1", 300, 400, "a1", 100, "+"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 100, 200, "b1", 100, "+"), # Upstream of a1
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation with signed distance (-D ref)
- bedtools_result = closest(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- signed=True,
- )
-
- # Act: Execute GIQL query with signed=true
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT
- a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand,
- b.chromosome, b.start_pos, b.end_pos, b.name, b.score, b.strand,
- distance
- FROM intervals_a a, NEAREST(intervals_b, k=1, signed=true) b
- ORDER BY a.chromosome, a.start_pos
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Both should return 1 row
- assert len(giql_result) == len(bedtools_result) == 1
-
- giql_distance = giql_result[0][12]
- bedtools_distance = bedtools_result[0][12]
-
- # Verify the distance is negative (upstream)
- assert giql_distance < 0, f"Expected negative distance, got {giql_distance}"
- assert bedtools_distance < 0, (
- f"Expected negative bedtools distance, got {bedtools_distance}"
- )
-
- # Verify distances match
- assert giql_distance == bedtools_distance, (
- f"Distance mismatch: GIQL={giql_distance}, bedtools={bedtools_distance}"
- )
-
-
-def test_nearest_signed_distance_upstream_only(duckdb_connection):
- """Test NEAREST with signed=true filtering for upstream features only.
-
- Given:
- Intervals in A with neighbors in B, using signed=true
- When:
- Filtering for negative distance (upstream features)
- Then:
- Only upstream B intervals are returned (distance < 0)
- """
- # Arrange
- # a1 at [500-600]
- # b1 at [100-200]: upstream, distance = -(500 - 200 + 1) = -301 (closed interval +1)
- # b2 at [300-400]: upstream, distance = -(500 - 400 + 1) = -101 (closed interval +1)
- # b3 at [700-800]: downstream, distance = +(700 - 600 + 1) = +101 (closed interval +1)
- intervals_a = [
- GenomicInterval("chr1", 500, 600, "a1", 100, "+"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 100, 200, "b1", 100, "+"), # Upstream
- GenomicInterval("chr1", 300, 400, "b2", 150, "+"), # Upstream
- GenomicInterval("chr1", 700, 800, "b3", 200, "+"), # Downstream
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute GIQL query filtering for upstream only (negative distance)
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT
- a.name AS a_name,
- b.name AS b_name,
- distance
- FROM intervals_a a, NEAREST(intervals_b, k=3, signed=true) b
- WHERE distance < 0
- ORDER BY distance DESC
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Should only return upstream intervals (b1 and b2)
- assert len(giql_result) == 2
- # All distances should be negative
- for row in giql_result:
- assert row[2] < 0, f"Expected negative distance, got {row[2]}"
- # b2 should be first (closer upstream, distance -101 with closed interval +1)
- assert giql_result[0][1] == "b2"
- assert giql_result[0][2] == -101
- # b1 should be second (farther upstream, distance -301 with closed interval +1)
- assert giql_result[1][1] == "b1"
- assert giql_result[1][2] == -301
-
-
-def test_nearest_signed_distance_downstream(duckdb_connection):
- """Test NEAREST with signed=true for downstream features.
-
- Given:
- Intervals in A with a downstream neighbor in B
- When:
- NEAREST operator is applied with signed=true
- Then:
- Distance is positive for downstream B intervals (B starts after A ends)
- This matches bedtools closest -D ref behavior
- """
- # Arrange: a1 has a downstream neighbor (b1)
- # a1 at [100-200], b1 at [300-400] (downstream, distance = 300-200+1 = 101)
- intervals_a = [
- GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 300, 400, "b1", 100, "+"), # Downstream of a1
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation with signed distance (-D ref)
- bedtools_result = closest(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- signed=True,
- )
-
- # Act: Execute GIQL query with signed=true
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT
- a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand,
- b.chromosome, b.start_pos, b.end_pos, b.name, b.score, b.strand,
- distance
- FROM intervals_a a, NEAREST(intervals_b, k=1, signed=true) b
- ORDER BY a.chromosome, a.start_pos
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Both should return 1 row
- assert len(giql_result) == len(bedtools_result) == 1
-
- giql_distance = giql_result[0][12]
- bedtools_distance = bedtools_result[0][12]
-
- # Verify the distance is positive (downstream)
- assert giql_distance > 0, f"Expected positive distance, got {giql_distance}"
- assert bedtools_distance > 0, "Expected positive bedtools distance"
-
- # Verify distances match
- assert giql_distance == bedtools_distance, (
- f"Distance mismatch: GIQL={giql_distance}, bedtools={bedtools_distance}"
- )
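The signed-distance comments in the tests above all follow one formula:
0 when the intervals overlap, negative when B lies upstream of A, positive
when B lies downstream, with the closed-interval +1 offset of bedtools
closest -D ref. A reference implementation for checking the expected values,
not GIQL's generated SQL:

def signed_ref_distance(a_start, a_end, b_start, b_end) -> int:
    if a_start < b_end and b_start < a_end:  # overlapping (half-open coords)
        return 0
    if b_end <= a_start:                     # B upstream of A
        return -(a_start - b_end + 1)
    return b_start - a_end + 1               # B downstream of A

assert signed_ref_distance(300, 400, 100, 200) == -101  # upstream test case
assert signed_ref_distance(500, 600, 300, 400) == -101  # b2 in the k=3 test
assert signed_ref_distance(100, 200, 300, 400) == 101   # downstream test case
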
diff --git a/tests/integration/bedtools/test_strand_aware.py b/tests/integration/bedtools/test_strand_aware.py
deleted file mode 100644
index 11075c6..0000000
--- a/tests/integration/bedtools/test_strand_aware.py
+++ /dev/null
@@ -1,471 +0,0 @@
-"""Integration tests for GIQL strand-aware operations.
-
-These tests validate that GIQL correctly handles strand-specific interval
-operations, matching bedtools behavior with -s and -S flags.
-"""
-
-from giql import GIQLEngine
-
-from .utils.bed_export import load_intervals
-from .utils.bedtools_wrapper import closest
-from .utils.bedtools_wrapper import intersect
-from .utils.bedtools_wrapper import merge
-from .utils.comparison import compare_results
-from .utils.data_models import GenomicInterval
-
-
-def _setup_giql_engine(duckdb_connection):
- """Helper to set up GIQL engine with table schemas."""
- engine = GIQLEngine(target_dialect="duckdb", verbose=False)
- engine.conn = duckdb_connection
-
- schema = {
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "name": "VARCHAR",
- "score": "BIGINT",
- "strand": "VARCHAR",
- }
-
- for table_name in ["intervals_a", "intervals_b", "intervals"]:
- engine.register_table_schema(
- table_name,
- schema,
- genomic_column="interval",
- interval_type="closed", # Match bedtools distance calculation
- )
-
- return engine
-
-
-def test_intersect_same_strand(duckdb_connection):
- """Test INTERSECTS with same-strand requirement.
-
- Given:
- Intervals on both same and opposite strands
- When:
- INTERSECTS with same-strand requirement is applied
- Then:
- Only same-strand overlaps are reported
- """
- # Arrange
- intervals_a = [
- GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
- GenomicInterval("chr1", 300, 400, "a2", 150, "-"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 150, 250, "b1", 100, "+"), # Overlaps a1 (same +)
- GenomicInterval("chr1", 350, 450, "b2", 150, "-"), # Overlaps a2 (same -)
- GenomicInterval("chr1", 150, 250, "b3", 200, "-"), # Overlaps a1 (opposite)
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools with same-strand requirement
- bedtools_result = intersect(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- strand_mode="same",
- )
-
- # Act: Execute GIQL query with same-strand filter
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT DISTINCT a.*
- FROM intervals_a a, intervals_b b
- WHERE a.interval INTERSECTS b.interval
- AND a.strand = b.strand
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_intersect_opposite_strand(duckdb_connection):
- """Test INTERSECTS with opposite-strand requirement.
-
- Given:
- Intervals on both same and opposite strands
- When:
- INTERSECTS with opposite-strand requirement is applied
- Then:
- Only opposite-strand overlaps are reported
- """
- # Arrange
- intervals_a = [
- GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
- GenomicInterval("chr1", 300, 400, "a2", 150, "-"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 150, 250, "b1", 100, "-"), # Overlaps a1 (opposite)
- GenomicInterval("chr1", 350, 450, "b2", 150, "+"), # Overlaps a2 (opposite)
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools with opposite-strand requirement
- bedtools_result = intersect(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- strand_mode="opposite",
- )
-
- # Act: Execute GIQL query with opposite-strand filter
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT DISTINCT a.*
- FROM intervals_a a, intervals_b b
- WHERE a.interval INTERSECTS b.interval
- AND a.strand != b.strand
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_intersect_ignore_strand(duckdb_connection):
- """Test INTERSECTS ignoring strand information.
-
- Given:
- Intervals with various strand combinations
- When:
- INTERSECTS without strand requirements is applied
- Then:
- All overlaps are reported regardless of strand
- """
- # Arrange
- intervals_a = [
- GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 150, 250, "b1", 100, "+"), # Same strand
- GenomicInterval("chr1", 150, 250, "b2", 150, "-"), # Opposite strand
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools without strand requirements
- bedtools_result = intersect(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute GIQL query without strand filter
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT DISTINCT a.*
- FROM intervals_a a, intervals_b b
- WHERE a.interval INTERSECTS b.interval
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_intersect_mixed_strands(duckdb_connection):
- """Test INTERSECTS with mixed strand scenarios.
-
- Given:
- Complex scenario with +, -, and unstranded intervals
- When:
- INTERSECTS with same-strand requirement is applied
- Then:
- Results correctly handle strand matching logic
- """
- # Arrange
- intervals_a = [
- GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
- GenomicInterval("chr1", 300, 400, "a2", 150, "-"),
- GenomicInterval("chr1", 500, 600, "a3", 200, "."), # Unstranded
- ]
- intervals_b = [
- GenomicInterval("chr1", 150, 250, "b1", 100, "+"),
- GenomicInterval("chr1", 350, 450, "b2", 150, "-"),
- GenomicInterval("chr1", 550, 650, "b3", 200, "."),
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools with same-strand requirement
- bedtools_result = intersect(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- strand_mode="same",
- )
-
- # Act: Execute GIQL query with same-strand filter
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT DISTINCT a.*
- FROM intervals_a a, intervals_b b
- WHERE a.interval INTERSECTS b.interval
- AND a.strand = b.strand
- AND a.strand != '.'
- AND b.strand != '.'
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_nearest_same_strand(duckdb_connection):
- """Test NEAREST with same-strand requirement.
-
- Given:
- Intervals with candidates on same and opposite strands
- When:
- NEAREST with same-strand requirement is applied
- Then:
- Only same-strand nearest intervals are reported
- """
- # Arrange
- intervals_a = [
- GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 250, 300, "b1", 100, "+"), # Nearest on same strand
- GenomicInterval("chr1", 220, 240, "b2", 150, "-"), # Closer but opposite
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools with same-strand requirement
- bedtools_result = closest(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- strand_mode="same",
- )
-
- # Act: Execute GIQL query with same-strand NEAREST
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT a.*, b.*
- FROM intervals_a a, NEAREST(intervals_b, k=1, stranded=true) b
- ORDER BY a.chromosome, a.start_pos
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_nearest_opposite_strand(duckdb_connection):
- """Test NEAREST with opposite-strand requirement.
-
- Given:
- Intervals with candidates on same and opposite strands
- When:
- NEAREST with opposite-strand requirement is applied
- Then:
- Only opposite-strand nearest intervals are reported
- """
- # Arrange
- intervals_a = [
- GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 250, 300, "b1", 100, "-"), # Nearest opposite strand
- GenomicInterval("chr1", 220, 240, "b2", 150, "+"), # Closer but same strand
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools with opposite-strand requirement
- bedtools_result = closest(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- strand_mode="opposite",
- )
-
- # Note: GIQL may not have direct opposite-strand support for NEAREST, so
- # this test only asserts the expected bedtools behavior
- assert len(bedtools_result) == 1
- assert bedtools_result[0][3] == "a1"
- assert bedtools_result[0][9] == "b1"
-
-
-def test_nearest_ignore_strand(duckdb_connection):
- """Test NEAREST ignoring strand information.
-
- Given:
- Intervals on different strands
- When:
- NEAREST without strand requirements is applied
- Then:
- Closest interval is found regardless of strand
- """
- # Arrange
- intervals_a = [
- GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
- ]
- intervals_b = [
- GenomicInterval("chr1", 250, 300, "b1", 100, "+"),
- GenomicInterval("chr1", 220, 240, "b2", 150, "-"), # Closer
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals_a",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- )
- load_intervals(
- duckdb_connection,
- "intervals_b",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute bedtools operation using pybedtools without strand requirements
- bedtools_result = closest(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
- )
-
- # Act: Execute GIQL query without strand filter
- engine = _setup_giql_engine(duckdb_connection)
- giql_query = """
- SELECT a.*, b.*
- FROM intervals_a a, NEAREST(intervals_b, k=1) b
- ORDER BY a.chromosome, a.start_pos
- """
- sql = engine.transpile(giql_query)
- giql_result = duckdb_connection.execute(sql).fetchall()
-
- # Assert: Compare GIQL and bedtools results
- comparison = compare_results(giql_result, bedtools_result)
- assert comparison.match, (
- f"GIQL results don't match bedtools:\n"
- f"Differences: {comparison.differences}\n"
- f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
- )
-
-
-def test_merge_strand_specific(duckdb_connection):
- """Test MERGE with strand-specific behavior.
-
- Given:
- Overlapping intervals on different strands
- When:
- MERGE with strand-specific flag is applied
- Then:
- Intervals are merged per-strand (same-strand intervals merge together)
- """
- # Arrange - overlapping intervals on both strands
- intervals = [
- GenomicInterval("chr1", 100, 200, "i1", 100, "+"),
- GenomicInterval("chr1", 150, 250, "i2", 150, "+"), # Overlaps i1 (same +)
- GenomicInterval("chr1", 120, 180, "i3", 200, "-"), # Overlaps i1 (opposite)
- GenomicInterval("chr1", 160, 240, "i4", 100, "-"), # Overlaps i2 (opposite)
- ]
-
- # Load into DuckDB
- load_intervals(
- duckdb_connection,
- "intervals",
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals],
- )
-
- # Act: Execute bedtools operation using pybedtools with strand-specific merging
- bedtools_result = merge(
- [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals],
- strand_mode="same",
- )
-
- # Note: GIQL MERGE with strand grouping would require GROUP BY strand, so
- # this test only asserts the expected bedtools behavior
- assert len(bedtools_result) >= 2 # At least one per strand
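The strand filters used across these tests reduce to one small predicate:
"same" keeps pairs with equal, stranded values; "opposite" keeps +/- pairs;
None ignores strand. The exclusion of unstranded "." records under "same"
mirrors the explicit filters in test_intersect_mixed_strands; treat this as
a hedged sketch of the behavior being validated, not bedtools' exact -s/-S
implementation:

def strand_pair_ok(a_strand, b_strand, mode=None) -> bool:
    if mode is None:
        return True  # ignore strand entirely
    if mode == "same":
        return a_strand == b_strand and a_strand not in (None, ".")
    if mode == "opposite":
        return {a_strand, b_strand} == {"+", "-"}
    raise ValueError(f"unknown strand mode: {mode}")

assert strand_pair_ok("+", "+", "same")
assert not strand_pair_ok(".", ".", "same")
assert strand_pair_ok("+", "-", "opposite")
assert strand_pair_ok("+", "-")  # mode=None ignores strand
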
diff --git a/tests/integration/bedtools/utils/__init__.py b/tests/integration/bedtools/utils/__init__.py
deleted file mode 100644
index 99a414e..0000000
--- a/tests/integration/bedtools/utils/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Utilities for bedtools integration testing."""
diff --git a/tests/integration/bedtools/utils/bed_export.py b/tests/integration/bedtools/utils/bed_export.py
deleted file mode 100644
index cd5a5c8..0000000
--- a/tests/integration/bedtools/utils/bed_export.py
+++ /dev/null
@@ -1,40 +0,0 @@
-"""DuckDB loading utilities for genomic intervals.
-
-This module provides functions for loading genomic intervals into DuckDB tables.
-"""
-
-from typing import List
-from typing import Tuple
-
-
-def load_intervals(
- conn,
- table_name: str,
- intervals: List[Tuple[str, int, int, str | None, int | None, str | None]],
-):
- """Load intervals into DuckDB table.
-
- Args:
- conn: DuckDB connection
- table_name: Name of table to create
- intervals: List of (chrom, start, end, name, score, strand) tuples
- where name, score, and strand can be None
-
- Note:
- Creates a new table with GIQL's default column names for genomic data:
- chromosome, start_pos, end_pos, name, score, strand
- """
- # Create table with GIQL's default column names
- conn.execute(f"""
- CREATE TABLE {table_name} (
- chromosome VARCHAR,
- start_pos INTEGER,
- end_pos INTEGER,
- name VARCHAR,
- score INTEGER,
- strand VARCHAR
- )
- """)
-
- # Insert intervals
- conn.executemany(f"INSERT INTO {table_name} VALUES (?,?,?,?,?,?)", intervals)
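A short usage sketch for the helper above, against a throwaway in-memory
connection (the same shape the duckdb_connection fixture yields):

import duckdb

conn = duckdb.connect(":memory:")
load_intervals(conn, "peaks", [("chr1", 100, 200, "p1", 10, "+")])
assert conn.execute("SELECT count(*) FROM peaks").fetchone()[0] == 1
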
diff --git a/tests/integration/bedtools/utils/bedtools_wrapper.py b/tests/integration/bedtools/utils/bedtools_wrapper.py
deleted file mode 100644
index 21d201f..0000000
--- a/tests/integration/bedtools/utils/bedtools_wrapper.py
+++ /dev/null
@@ -1,303 +0,0 @@
-"""Pybedtools wrapper for genomic interval operations.
-
-This module provides functions for:
-- Creating BedTool objects from interval data
-- Executing bedtools operations via pybedtools
-- Converting results to comparable formats
-"""
-
-from typing import List
-from typing import Tuple
-
-import pybedtools
-
-# Strand flag constants (currently unused: the wrapper functions below pass
-# the pybedtools kwargs s=True for -s and S=True for -S directly)
-STRAND_SAME = True
-STRAND_OPPOSITE = "opposite"
-
-
-class BedtoolsError(Exception):
- """Raised when bedtools operation fails."""
-
- pass
-
-
-def create_bedtool(intervals: List[Tuple]) -> pybedtools.BedTool:
- """Create BedTool object from interval tuples.
-
- Args:
- intervals: List of tuples, each containing:
- - (chrom, start, end) for BED3 format
- - (chrom, start, end, name, score, strand) for BED6 format
-
- Returns:
- pybedtools.BedTool object
-
- Example:
- >>> intervals = [("chr1", 100, 200, "a1", 100, "+")]
- >>> bt = create_bedtool(intervals)
- """
- # Convert tuples to BED format strings
- bed_strings = []
- for interval in intervals:
- if len(interval) == 3:
- # BED3 format
- bed_strings.append(f"{interval[0]}\t{interval[1]}\t{interval[2]}")
- elif len(interval) >= 6:
- # BED6 format
- chrom, start, end, name, score, strand = interval[:6]
- # Handle None values
- name = name if name is not None else "."
- score = score if score is not None else 0
- strand = strand if strand is not None else "."
- bed_strings.append(f"{chrom}\t{start}\t{end}\t{name}\t{score}\t{strand}")
- else:
- raise ValueError(f"Invalid interval format: {interval}")
-
- bed_string = "\n".join(bed_strings)
- return pybedtools.BedTool(bed_string, from_string=True)
-
-
-def intersect(
- intervals_a: List[Tuple],
- intervals_b: List[Tuple],
- strand_mode: str | None = None,
-) -> List[Tuple]:
- """Find overlapping intervals using bedtools intersect.
-
- Args:
- intervals_a: First set of intervals
- intervals_b: Second set of intervals
- strand_mode: Strand requirement ('same', 'opposite', or None for ignore)
-
- Returns:
- List of tuples matching intervals_a format
-
- Example:
- >>> a = [("chr1", 100, 200, "a1", 100, "+")]
- >>> b = [("chr1", 150, 250, "b1", 100, "+")]
- >>> result = intersect(a, b)
- """
- try:
- bt_a = create_bedtool(intervals_a)
- bt_b = create_bedtool(intervals_b)
-
- # Build kwargs for intersect
- # Use -u (unique) to return each interval from A only once
- # This matches GIQL's DISTINCT behavior
- kwargs = {"u": True}
- if strand_mode == "same":
- kwargs["s"] = True
- elif strand_mode == "opposite":
- kwargs["S"] = True
-
- # Perform intersection
- result = bt_a.intersect(bt_b, **kwargs)
-
- # Convert to tuples
- return bedtool_to_tuples(result)
-
- except Exception as e:
- raise BedtoolsError(f"Intersect operation failed: {e}")
-
-
-def merge(intervals: List[Tuple], strand_mode: str | None = None) -> List[Tuple]:
- """Merge overlapping intervals using bedtools merge.
-
- Args:
- intervals: List of intervals to merge
- strand_mode: Strand requirement ('same' to merge per-strand, None for ignore)
-
- Returns:
- List of tuples in BED3 format (chrom, start, end)
-
- Example:
- >>> intervals = [
- ... ("chr1", 100, 200, "a1", 100, "+"),
- ... ("chr1", 180, 300, "a2", 100, "+"),
- ... ]
- >>> result = merge(intervals)
- >>> # Returns: [("chr1", 100, 300)]
- """
- try:
- bt = create_bedtool(intervals)
-
- # Sort before merging (required by bedtools merge)
- bt_sorted = bt.sort()
-
- # Build kwargs for merge
- kwargs = {}
- if strand_mode == "same":
- kwargs["s"] = True
-
- # Perform merge
- result = bt_sorted.merge(**kwargs)
-
- # Convert to tuples (merge returns BED3 format)
- return bedtool_to_tuples(result, format="bed3")
-
- except Exception as e:
- raise BedtoolsError(f"Merge operation failed: {e}")
-
-
-def closest(
- intervals_a: List[Tuple],
- intervals_b: List[Tuple],
- strand_mode: str | None = None,
- k: int = 1,
- signed: bool = False,
-) -> List[Tuple]:
- """Find closest intervals using bedtools closest.
-
- Args:
- intervals_a: Query intervals
- intervals_b: Database intervals to search
- strand_mode: Strand requirement ('same', 'opposite', or None for ignore)
- k: Number of closest intervals to report (default: 1)
- signed: If True, return signed distances (negative for upstream B,
- positive for downstream B). Uses bedtools -D ref mode.
-
- Returns:
- List of tuples with format: (a_fields..., b_fields..., distance)
-
- Example:
- >>> a = [("chr1", 100, 200, "a1", 100, "+")]
- >>> b = [("chr1", 300, 400, "b1", 100, "+")]
- >>> result = closest(a, b)
- >>> # Returns intervals from a and b with distance
- """
- try:
- bt_a = create_bedtool(intervals_a)
- bt_b = create_bedtool(intervals_b)
-
- # Sort inputs (required for -t flag)
- bt_a = bt_a.sort()
- bt_b = bt_b.sort()
-
- # Build kwargs for closest
- # -d reports unsigned distance, -D ref reports signed distance
- if signed:
- # Use -D ref for signed distance relative to reference (A)
- # Negative = B is upstream of A, Positive = B is downstream of A
- kwargs = {"D": "ref", "t": "first"}
- else:
- kwargs = {"d": True, "t": "first"}
-
- if k > 1:
- kwargs["k"] = k
- if strand_mode == "same":
- kwargs["s"] = True
- elif strand_mode == "opposite":
- kwargs["S"] = True
-
- # Perform closest
- result = bt_a.closest(bt_b, **kwargs)
-
- # Convert to tuples (closest returns concatenated fields + distance)
- return bedtool_to_tuples(result, format="closest")
-
- except Exception as e:
- raise BedtoolsError(f"Closest operation failed: {e}")
-
-
-def bedtool_to_tuples(bedtool: pybedtools.BedTool, format: str = "bed6") -> List[Tuple]:
- """Convert BedTool object to list of tuples.
-
- Args:
- bedtool: pybedtools.BedTool object
- format: Expected format ('bed3', 'bed6', or 'closest')
-
- Returns:
- List of tuples matching the format
-
- Note:
- - bed3: (chrom, start, end)
- - bed6: (chrom, start, end, name, score, strand)
- - closest: (chrom_a, start_a, end_a, name_a, score_a, strand_a,
- chrom_b, start_b, end_b, name_b, score_b, strand_b, distance)
- """
- rows = []
-
- for interval in bedtool:
- fields = interval.fields
-
- if format == "bed3":
- chrom = fields[0]
- start = int(fields[1])
- end = int(fields[2])
- rows.append((chrom, start, end))
-
- elif format == "bed6":
- if len(fields) < 6:
- # Pad with defaults if needed
- while len(fields) < 6:
- if len(fields) == 3:
- fields.append(".") # name
- elif len(fields) == 4:
- fields.append("0") # score
- elif len(fields) == 5:
- fields.append(".") # strand
-
- chrom = fields[0]
- start = int(fields[1])
- end = int(fields[2])
- name = fields[3] if fields[3] != "." else None
- score = int(fields[4]) if fields[4] != "." else None
- strand = fields[5] if fields[5] != "." else None
-
- rows.append((chrom, start, end, name, score, strand))
-
- elif format == "closest":
- # Closest returns: a_fields + b_fields + distance
- # For BED6: 6 fields for a, 6 fields for b, 1 distance = 13 total
- if len(fields) >= 13:
- # Parse all fields as-is, converting appropriate ones to int
- row = []
- for i, field in enumerate(fields):
- # Positions (1, 2, 7, 8) and distance (12) should be int
- if i in (1, 2, 7, 8, 12):
- row.append(int(field))
- # Scores (4, 10) should be int if not "."
- elif i in (4, 10):
- row.append(int(field) if field != "." else None)
- # Names (3, 9) and strands (5, 11) should be None if "."
- elif i in (3, 5, 9, 11):
- row.append(field if field != "." else None)
- else:
- row.append(field)
- rows.append(tuple(row))
- else:
- raise ValueError(
- f"Unexpected number of fields for closest: {len(fields)}"
- )
-
- else:
- raise ValueError(f"Unsupported format: {format}")
-
- return rows
-
-
-def add_strand_flag(kwargs: dict, strand_mode: str | None) -> dict:
- """Add strand flag to bedtools kwargs.
-
- Args:
- kwargs: Base kwargs dictionary
- strand_mode: Strand requirement ('same', 'opposite', or None for ignore)
-
- Returns:
- Updated kwargs dictionary with strand flag
-
- Example:
- >>> kwargs = add_strand_flag({}, "same")
- >>> # Returns: {"s": True}
- """
- updated_kwargs = kwargs.copy()
-
- if strand_mode == "same":
- updated_kwargs["s"] = True
- elif strand_mode == "opposite":
- updated_kwargs["S"] = True
- # None or other values = ignore strand (no flag added)
-
- return updated_kwargs
diff --git a/tests/integration/bedtools/utils/comparison.py b/tests/integration/bedtools/utils/comparison.py
deleted file mode 100644
index caa4bd2..0000000
--- a/tests/integration/bedtools/utils/comparison.py
+++ /dev/null
@@ -1,134 +0,0 @@
-"""Result comparison logic for GIQL vs bedtools outputs.
-
-This module provides functions for:
-- Comparing GIQL and bedtools results with appropriate tolerance
-- Order-independent row sorting
-- Epsilon-based float comparison
-"""
-
-from typing import Any
-from typing import List
-from typing import Tuple
-
-from .data_models import ComparisonResult
-
-
-def _sort_key(row: Tuple) -> Tuple:
- """Generate sort key for order-independent comparison.
-
- Args:
- row: Result row tuple
-
- Returns:
- Sortable tuple (values are stringified, with a None flag, so
- mixed None/int/str columns compare without TypeError)
- """
- # (is-None flag, stringified value) keeps every column comparable
- return tuple((v is None, str(v)) for v in row)
-
-
-def _values_match(val1: Any, val2: Any, epsilon: float = 1e-9) -> bool:
- """Compare two values with appropriate tolerance.
-
- Args:
- val1: First value
- val2: Second value
- epsilon: Tolerance for floating-point comparisons
-
- Returns:
- True if values match within tolerance
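-
- Example (illustrative):
- >>> _values_match(1.0, 1.0 + 5e-10)
- True
- >>> _values_match("chr1", "chr2")
- False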
- """
- # Handle None values
- if val1 is None and val2 is None:
- return True
- if val1 is None or val2 is None:
- return False
-
- # Float comparison with epsilon
- if isinstance(val1, float) or isinstance(val2, float):
- try:
- return abs(float(val1) - float(val2)) <= epsilon
- except (ValueError, TypeError):
- return False
-
- # Exact match for other types
- return val1 == val2
-
-
-def compare_results(
- giql_rows: List[Tuple], bedtools_rows: List[Tuple], epsilon: float = 1e-9
-) -> ComparisonResult:
- """Compare GIQL and bedtools results with appropriate tolerance.
-
- Comparison rules:
- - Integer positions/counts: exact match required
- - Floating-point values: epsilon tolerance
- - Row ordering: order-independent (sorts both result sets)
-
- Args:
- giql_rows: Rows from GIQL query execution
- bedtools_rows: Rows from bedtools output
- epsilon: Tolerance for floating-point comparisons
-
- Returns:
- ComparisonResult with match status and differences
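-
- Example (illustrative; identical single-row inputs):
- >>> result = compare_results([("chr1", 100)], [("chr1", 100)])
- >>> bool(result)
- True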
- """
- giql_count = len(giql_rows)
- bedtools_count = len(bedtools_rows)
-
- # Sort both result sets for order-independent comparison
- giql_sorted = sorted(giql_rows, key=_sort_key)
- bedtools_sorted = sorted(bedtools_rows, key=_sort_key)
-
- differences = []
-
- # Check row counts
- if giql_count != bedtools_count:
- differences.append(
- f"Row count mismatch: GIQL has {giql_count} rows, "
- f"bedtools has {bedtools_count} rows"
- )
-
- # Compare rows
- max_rows = max(giql_count, bedtools_count)
- for i in range(max_rows):
- # Check if row exists in both
- if i >= giql_count:
- differences.append(
- f"Row {i}: Missing in GIQL, present in bedtools: {bedtools_sorted[i]}"
- )
- continue
- if i >= bedtools_count:
- differences.append(
- f"Row {i}: Present in GIQL, missing in bedtools: {giql_sorted[i]}"
- )
- continue
-
- giql_row = giql_sorted[i]
- bedtools_row = bedtools_sorted[i]
-
- # Check column counts
- if len(giql_row) != len(bedtools_row):
- differences.append(
- f"Row {i}: Column count mismatch "
- f"(GIQL: {len(giql_row)} cols, bedtools: {len(bedtools_row)} cols)"
- )
- continue
-
- # Compare each column
- for col_idx, (giql_val, bedtools_val) in enumerate(zip(giql_row, bedtools_row)):
- if not _values_match(giql_val, bedtools_val, epsilon):
- differences.append(
- f"Row {i}, col {col_idx}: "
- f"GIQL={giql_val!r} != bedtools={bedtools_val!r}"
- )
-
- # Determine match status
- match = len(differences) == 0
-
- return ComparisonResult(
- match=match,
- giql_row_count=giql_count,
- bedtools_row_count=bedtools_count,
- differences=differences,
- comparison_metadata={"epsilon": epsilon, "sorted": True},
- )
diff --git a/tests/integration/bedtools/utils/data_models.py b/tests/integration/bedtools/utils/data_models.py
deleted file mode 100644
index dad0832..0000000
--- a/tests/integration/bedtools/utils/data_models.py
+++ /dev/null
@@ -1,259 +0,0 @@
-"""Data models for bedtools integration testing.
-
-This module defines the core data structures used throughout the test suite:
-- GenomicInterval: Represents a single genomic interval
-- SimulatedDataset: Collection of intervals for testing
-- ComparisonResult: Result of comparing GIQL vs bedtools outputs
-- IntervalGeneratorConfig: Configuration for dataset generation
-- BedtoolsVersion: Bedtools version information
-"""
-
-import re
-from dataclasses import dataclass
-from dataclasses import field
-from pathlib import Path
-from typing import Dict
-from typing import List
-
-
-@dataclass
-class GenomicInterval:
- """Represents a single genomic interval with all BED file fields.
-
- Attributes:
- chrom: Chromosome name (e.g., "chr1", "chr2", "chrX")
- start: Start position (0-based, inclusive)
- end: End position (0-based, exclusive)
- name: Optional interval name/identifier
- score: Optional score value (0-1000)
- strand: Optional strand ("+", "-", or ".")
- """
-
- chrom: str
- start: int
- end: int
- name: str | None = None
- score: int | None = None
- strand: str | None = None
-
- def __post_init__(self):
- """Validate interval fields."""
- if self.start >= self.end:
- raise ValueError(
- f"Invalid interval: start ({self.start}) >= end ({self.end})"
- )
- if self.start < 0:
- raise ValueError(f"Invalid interval: start ({self.start}) < 0")
- if self.strand and self.strand not in ["+", "-", "."]:
- raise ValueError(f"Invalid strand: {self.strand}")
- if self.score is not None and not (0 <= self.score <= 1000):
- raise ValueError(f"Invalid score: {self.score}")
-
- def to_bed_line(self, format="bed6") -> str:
- """Convert to BED format line.
-
- Args:
- format: Output format ('bed3' or 'bed6')
-
- Returns:
- Tab-separated BED format string
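-
- Example (illustrative):
- >>> GenomicInterval("chr1", 100, 200).to_bed_line("bed3")
- 'chr1\t100\t200'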
- """
- if format == "bed3":
- return f"{self.chrom}\t{self.start}\t{self.end}"
- elif format == "bed6":
- name = self.name or "."
- score = self.score if self.score is not None else 0
- strand = self.strand or "."
- return f"{self.chrom}\t{self.start}\t{self.end}\t{name}\t{score}\t{strand}"
- else:
- raise ValueError(f"Unsupported BED format: {format}")
-
-
-@dataclass
-class SimulatedDataset:
- """Collection of genomic intervals with controlled properties for testing.
-
- Attributes:
- name: Dataset identifier (e.g., "intervals_a", "intervals_b")
- intervals: List of genomic intervals
- scenario_type: Scenario descriptor (e.g., "overlapping", "adjacent")
- metadata: Generation parameters (seed, chromosome_count, etc.)
- """
-
- name: str
- intervals: List[GenomicInterval]
- scenario_type: str
- metadata: dict = field(default_factory=dict)
-
- def __post_init__(self):
- """Validate dataset has at least one interval."""
- if len(self.intervals) == 0:
- raise ValueError("Dataset must contain at least one interval")
-
- def to_bed_file(self, path: Path, format="bed6"):
- """Export to BED file.
-
- Args:
- path: Output file path
- format: BED format ('bed3' or 'bed6')
- """
- with open(path, "w") as f:
- for interval in self.intervals:
- f.write(interval.to_bed_line(format) + "\n")
-
- def to_duckdb_table(self, conn, table_name: str):
- """Load into DuckDB table.
-
- Args:
- conn: DuckDB connection
- table_name: Name of table to create
- """
- rows = [
- (i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in self.intervals
- ]
- # "start" and "end" are quoted below: END is a reserved word in DuckDB
- conn.execute(f"""
- CREATE TABLE {table_name} (
- chrom VARCHAR,
- "start" INTEGER,
- "end" INTEGER,
- name VARCHAR,
- score INTEGER,
- strand VARCHAR
- )
- """)
- conn.executemany(f"INSERT INTO {table_name} VALUES (?,?,?,?,?,?)", rows)
-
-
-@dataclass
-class ComparisonResult:
- """Result of comparing GIQL and bedtools outputs.
-
- Attributes:
- match: Whether results match
- giql_row_count: Number of rows from GIQL query
- bedtools_row_count: Number of rows from bedtools output
- differences: Specific differences found (if match=False)
- comparison_metadata: Epsilon used, sort order, etc.
- """
-
- match: bool
- giql_row_count: int
- bedtools_row_count: int
- differences: List[str] = field(default_factory=list)
- comparison_metadata: dict = field(default_factory=dict)
-
- def __bool__(self) -> bool:
- """Allow direct boolean evaluation in assertions."""
- return self.match
-
- def failure_message(self) -> str:
- """Generate detailed failure message for test output.
-
- Returns:
- Formatted failure message with differences
- """
- if self.match:
- return "✓ Results match"
-
- msg = [
- f"✗ Results do not match",
- f" GIQL rows: {self.giql_row_count}",
- f" Bedtools rows: {self.bedtools_row_count}",
- ]
-
- if self.differences:
- msg.append(" Differences:")
- for diff in self.differences[:10]: # Limit to first 10
- msg.append(f" - {diff}")
- if len(self.differences) > 10:
- msg.append(f" ... and {len(self.differences) - 10} more")
-
- return "\n".join(msg)
-
-
-@dataclass
-class IntervalGeneratorConfig:
- """Configuration for simulated dataset generation.
-
- Attributes:
- chromosome_count: Number of chromosomes to generate
- intervals_per_chromosome: Intervals per chromosome
- min_interval_size: Minimum interval length
- max_interval_size: Maximum interval length
- overlap_probability: Probability of overlap (0.0-1.0)
- strand_distribution: Proportions of +/-/. strands
- seed: Random seed for reproducibility
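-
- Example (illustrative; defaults apply to unset fields):
- >>> cfg = IntervalGeneratorConfig(chromosome_count=2, overlap_probability=0.5)
- >>> cfg.seed
- 42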
- """
-
- chromosome_count: int = 3
- intervals_per_chromosome: int = 100
- min_interval_size: int = 100
- max_interval_size: int = 1000
- overlap_probability: float = 0.3
- strand_distribution: dict = field(
- default_factory=lambda: {"+": 0.45, "-": 0.45, ".": 0.1}
- )
- seed: int = 42
-
- def __post_init__(self):
- """Validate configuration parameters."""
- if self.chromosome_count <= 0:
- raise ValueError("chromosome_count must be > 0")
- if self.intervals_per_chromosome <= 0:
- raise ValueError("intervals_per_chromosome must be > 0")
- if self.min_interval_size < 1:
- raise ValueError("min_interval_size must be >= 1")
- if self.max_interval_size < self.min_interval_size:
- raise ValueError("max_interval_size must be >= min_interval_size")
- if not (0.0 <= self.overlap_probability <= 1.0):
- raise ValueError("overlap_probability must be in [0.0, 1.0]")
- if abs(sum(self.strand_distribution.values()) - 1.0) > 1e-6:
- raise ValueError("strand_distribution must sum to 1.0")
-
-
-@dataclass
-class BedtoolsVersion:
- """Represents bedtools version information.
-
- Attributes:
- major: Major version number
- minor: Minor version number
- patch: Patch version number
- raw_version_string: Original version string from bedtools
- """
-
- major: int
- minor: int
- patch: int
- raw_version_string: str
-
- def is_compatible(self) -> bool:
- """Check if version meets minimum requirement (2.30.0).
-
- Returns:
- True if version >= 2.30.0
- """
- return (self.major, self.minor, self.patch) >= (2, 30, 0)
-
- def __str__(self) -> str:
- """Return version as string."""
- return f"{self.major}.{self.minor}.{self.patch}"
-
- @classmethod
- def from_string(cls, version_str: str) -> "BedtoolsVersion":
- """Parse version from bedtools --version output.
-
- Args:
- version_str: Version string from bedtools (e.g., "bedtools v2.30.0")
-
- Returns:
- BedtoolsVersion instance
-
- Raises:
- ValueError: If version string cannot be parsed
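-
- Example (illustrative):
- >>> v = BedtoolsVersion.from_string("bedtools v2.31.1")
- >>> (str(v), v.is_compatible())
- ('2.31.1', True)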
- """
- match = re.search(r"v?(\d+)\.(\d+)\.(\d+)", version_str)
- if not match:
- raise ValueError(f"Could not parse version from: {version_str}")
- major, minor, patch = map(int, match.groups())
- return cls(major, minor, patch, version_str)
diff --git a/tests/integration/bedtools/utils/interval_generator.py b/tests/integration/bedtools/utils/interval_generator.py
deleted file mode 100644
index 05df214..0000000
--- a/tests/integration/bedtools/utils/interval_generator.py
+++ /dev/null
@@ -1,425 +0,0 @@
-"""Interval generator for creating simulated genomic datasets.
-
-This module provides the IntervalGenerator class for creating test datasets
-with controlled properties (overlap density, strand distribution, etc.).
-"""
-
-import random
-from typing import List
-from typing import Tuple
-
-from .data_models import GenomicInterval
-from .data_models import IntervalGeneratorConfig
-from .data_models import SimulatedDataset
-
-
-class IntervalGenerator:
- """Generate simulated genomic intervals for testing.
-
- Provides methods for generating intervals with various patterns:
- - Overlapping intervals
- - Adjacent intervals
- - Separated intervals
- - Multi-chromosome datasets
- - Strand-specific datasets
- """
-
- def __init__(self, config: IntervalGeneratorConfig | None = None):
- """Initialize interval generator.
-
- Args:
- config: Generator configuration (uses defaults if None)
- """
- self.config = config or IntervalGeneratorConfig()
- self.rng = random.Random(self.config.seed)
-
- def _choose_strand(self) -> str:
- """Choose strand based on configured distribution.
-
- Returns:
- Strand ('+', '-', or '.')
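-
- Note (illustrative, with the default distribution {"+": 0.45, "-": 0.45, ".": 0.1}):
- r <= 0.45 -> '+', 0.45 < r <= 0.90 -> '-', otherwise '.'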
- """
- r = self.rng.random()
- cumulative = 0.0
- for strand, prob in self.config.strand_distribution.items():
- cumulative += prob
- if r <= cumulative:
- return strand
- return "." # Fallback
-
- def _generate_interval_size(self) -> int:
- """Generate random interval size within configured range.
-
- Returns:
- Interval size in base pairs
- """
- return self.rng.randint(
- self.config.min_interval_size, self.config.max_interval_size
- )
-
- def generate_basic(
- self, chromosome: str, count: int, max_position: int = 1000000
- ) -> List[GenomicInterval]:
- """Generate basic random intervals.
-
- Args:
- chromosome: Chromosome name
- count: Number of intervals to generate
- max_position: Maximum chromosome position
-
- Returns:
- List of genomic intervals
- """
- intervals = []
- for i in range(count):
- size = self._generate_interval_size()
- start = self.rng.randint(0, max_position - size)
- end = start + size
- strand = self._choose_strand()
-
- intervals.append(
- GenomicInterval(
- chrom=chromosome,
- start=start,
- end=end,
- name=f"interval_{i}",
- score=self.rng.randint(0, 1000),
- strand=strand,
- )
- )
-
- return intervals
-
- def generate_dataset(
- self,
- name: str,
- scenario_type: str = "basic",
- chromosome_count: int | None = None,
- intervals_per_chrom: int | None = None,
- ) -> SimulatedDataset:
- """Generate a complete simulated dataset.
-
- Args:
- name: Dataset identifier
- scenario_type: Type of scenario ("basic", "overlapping", etc.)
- chromosome_count: Number of chromosomes (uses config default if None)
- intervals_per_chrom: Intervals per chromosome (uses config default if None)
-
- Returns:
- SimulatedDataset with generated intervals
- """
- chrom_count = chromosome_count or self.config.chromosome_count
- interval_count = intervals_per_chrom or self.config.intervals_per_chromosome
-
- all_intervals = []
- for i in range(chrom_count):
- chrom_name = f"chr{i + 1}"
- intervals = self.generate_basic(chrom_name, interval_count)
- all_intervals.extend(intervals)
-
- return SimulatedDataset(
- name=name,
- intervals=all_intervals,
- scenario_type=scenario_type,
- metadata={
- "chromosome_count": chrom_count,
- "intervals_per_chromosome": interval_count,
- "seed": self.config.seed,
- "total_intervals": len(all_intervals),
- },
- )
-
- def generate_overlapping_scenarios(
- self, chromosome: str, count: int, overlap_size: int = 50
- ) -> List[GenomicInterval]:
- """Generate overlapping intervals with controlled overlap.
-
- Args:
- chromosome: Chromosome name
- count: Number of intervals to generate
- overlap_size: Size of overlap between adjacent intervals
-
- Returns:
- List of overlapping genomic intervals
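-
- Example layout (illustrative; base_size=100, overlap_size=50):
- [100, 200], [150, 250], [200, 300], ...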
- """
- intervals = []
- base_size = self.config.min_interval_size
- current_start = 100
-
- for i in range(count):
- start = current_start
- end = start + base_size
- strand = self._choose_strand()
-
- intervals.append(
- GenomicInterval(
- chrom=chromosome,
- start=start,
- end=end,
- name=f"overlap_{i}",
- score=self.rng.randint(0, 1000),
- strand=strand,
- )
- )
-
- # Next interval starts before current ends (creating overlap)
- current_start = end - overlap_size
-
- return intervals
-
- def generate_adjacent_scenarios(
- self, chromosome: str, count: int
- ) -> List[GenomicInterval]:
- """Generate adjacent intervals (touching but not overlapping).
-
- Args:
- chromosome: Chromosome name
- count: Number of intervals to generate
-
- Returns:
- List of adjacent genomic intervals
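-
- Example layout (illustrative; base_size=100):
- [100, 200], [200, 300], [300, 400], ...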
- """
- intervals = []
- base_size = self.config.min_interval_size
- current_start = 100
-
- for i in range(count):
- start = current_start
- end = start + base_size
- strand = self._choose_strand()
-
- intervals.append(
- GenomicInterval(
- chrom=chromosome,
- start=start,
- end=end,
- name=f"adjacent_{i}",
- score=self.rng.randint(0, 1000),
- strand=strand,
- )
- )
-
- # Next interval starts exactly where current ends
- current_start = end
-
- return intervals
-
- def generate_separated_scenarios(
- self, chromosome: str, count: int, gap_size: int = 100
- ) -> List[GenomicInterval]:
- """Generate separated intervals with gaps between them.
-
- Args:
- chromosome: Chromosome name
- count: Number of intervals to generate
- gap_size: Size of gap between intervals
-
- Returns:
- List of separated genomic intervals
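-
- Example layout (illustrative; base_size=100, gap_size=100):
- [100, 200], [300, 400], [500, 600], ...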
- """
- intervals = []
- base_size = self.config.min_interval_size
- current_start = 100
-
- for i in range(count):
- start = current_start
- end = start + base_size
- strand = self._choose_strand()
-
- intervals.append(
- GenomicInterval(
- chrom=chromosome,
- start=start,
- end=end,
- name=f"separated_{i}",
- score=self.rng.randint(0, 1000),
- strand=strand,
- )
- )
-
- # Next interval starts after a gap
- current_start = end + gap_size
-
- return intervals
-
- def generate_multi_chromosome_scenarios(
- self,
- chromosome_count: int,
- intervals_per_chrom: int,
- scenario_func: str = "basic",
- ) -> List[GenomicInterval]:
- """Generate intervals across multiple chromosomes.
-
- Args:
- chromosome_count: Number of chromosomes
- intervals_per_chrom: Number of intervals per chromosome
- scenario_func: Scenario type ("basic", "overlapping", "adjacent",
- "separated")
-
- Returns:
- List of genomic intervals across multiple chromosomes
- """
- all_intervals = []
-
- for i in range(chromosome_count):
- chrom_name = f"chr{i + 1}"
-
- if scenario_func == "overlapping":
- intervals = self.generate_overlapping_scenarios(
- chrom_name, intervals_per_chrom
- )
- elif scenario_func == "adjacent":
- intervals = self.generate_adjacent_scenarios(
- chrom_name, intervals_per_chrom
- )
- elif scenario_func == "separated":
- intervals = self.generate_separated_scenarios(
- chrom_name, intervals_per_chrom
- )
- else: # basic
- intervals = self.generate_basic(chrom_name, intervals_per_chrom)
-
- all_intervals.extend(intervals)
-
- return all_intervals
-
- def generate_same_strand_pairs(
- self, chromosome: str, pair_count: int, strand: str = "+"
- ) -> Tuple[List[GenomicInterval], List[GenomicInterval]]:
- """Generate two sets of intervals on the same strand.
-
- Args:
- chromosome: Chromosome name
- pair_count: Number of interval pairs to generate
- strand: Strand to use for all intervals ('+' or '-')
-
- Returns:
- Tuple of (intervals_a, intervals_b) on same strand
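-
- Example layout (illustrative; base_size=100):
- a: [100, 200], [350, 450], ... b: [150, 250], [400, 500], ...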
- """
- intervals_a = []
- intervals_b = []
- base_size = self.config.min_interval_size
- current_start = 100
-
- for i in range(pair_count):
- # Interval A
- start_a = current_start
- end_a = start_a + base_size
- intervals_a.append(
- GenomicInterval(
- chrom=chromosome,
- start=start_a,
- end=end_a,
- name=f"a{i}",
- score=self.rng.randint(0, 1000),
- strand=strand,
- )
- )
-
- # Interval B - overlaps A, same strand
- start_b = start_a + (base_size // 2)
- end_b = start_b + base_size
- intervals_b.append(
- GenomicInterval(
- chrom=chromosome,
- start=start_b,
- end=end_b,
- name=f"b{i}",
- score=self.rng.randint(0, 1000),
- strand=strand,
- )
- )
-
- # Move to next region
- current_start = end_b + 100
-
- return intervals_a, intervals_b
-
- def generate_opposite_strand_pairs(
- self, chromosome: str, pair_count: int
- ) -> Tuple[List[GenomicInterval], List[GenomicInterval]]:
- """Generate two sets of intervals on opposite strands.
-
- Args:
- chromosome: Chromosome name
- pair_count: Number of interval pairs to generate
-
- Returns:
- Tuple of (intervals_a, intervals_b) on opposite strands
- """
- intervals_a = []
- intervals_b = []
- base_size = self.config.min_interval_size
- current_start = 100
-
- for i in range(pair_count):
- # Interval A on + strand
- start_a = current_start
- end_a = start_a + base_size
- intervals_a.append(
- GenomicInterval(
- chrom=chromosome,
- start=start_a,
- end=end_a,
- name=f"a{i}",
- score=self.rng.randint(0, 1000),
- strand="+",
- )
- )
-
- # Interval B - overlaps A, opposite strand (-)
- start_b = start_a + (base_size // 2)
- end_b = start_b + base_size
- intervals_b.append(
- GenomicInterval(
- chrom=chromosome,
- start=start_b,
- end=end_b,
- name=f"b{i}",
- score=self.rng.randint(0, 1000),
- strand="-",
- )
- )
-
- # Move to next region
- current_start = end_b + 100
-
- return intervals_a, intervals_b
-
- def generate_mixed_strand_intervals(
- self, chromosome: str, count: int
- ) -> List[GenomicInterval]:
- """Generate intervals with mixed strand assignments.
-
- Args:
- chromosome: Chromosome name
- count: Number of intervals to generate
-
- Returns:
- List of intervals with randomly assigned strands (+, -, .)
- """
- intervals = []
- base_size = self.config.min_interval_size
- strands = ["+", "-", "."]
- current_start = 100
-
- for i in range(count):
- start = current_start
- end = start + base_size
- # Randomly choose strand from +, -, .
- strand = self.rng.choice(strands)
-
- intervals.append(
- GenomicInterval(
- chrom=chromosome,
- start=start,
- end=end,
- name=f"mixed_{i}",
- score=self.rng.randint(0, 1000),
- strand=strand,
- )
- )
-
- current_start = end + 50 # Small gap
-
- return intervals
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
deleted file mode 100644
index c359608..0000000
--- a/tests/test_cluster.py
+++ /dev/null
@@ -1,441 +0,0 @@
-"""Tests for CLUSTER and MERGE operations."""
-
-import textwrap
-
-import pytest
-
-from giql import GIQLEngine
-
-
-@pytest.fixture
-def cluster_test_data_csv(tmp_path):
- """Create sample data for cluster testing."""
- csv_content = """
- id,chromosome,start_pos,end_pos,name
- 1,chr1,100,200,f1
- 2,chr1,180,250,f2
- 3,chr1,250,500,f3
- 4,chr1,501,1000,f4
- 5,chr2,100,200,f5
- 6,chr2,300,400,f6
- """
- csv_path = tmp_path / "features.csv"
- csv_path.write_text(csv_content.strip())
- return str(csv_path)
-
-
-@pytest.fixture
-def stranded_test_data_csv(tmp_path):
- """Create stranded data for cluster testing."""
- csv_content = """
- id,chromosome,start_pos,end_pos,strand,name
- 1,chr1,100,200,+,f1
- 2,chr1,180,250,+,f2
- 3,chr1,200,300,-,f3
- 4,chr1,250,350,-,f4
- 5,chr1,400,500,+,f5
- """
- csv_path = tmp_path / "stranded_features.csv"
- csv_path.write_text(csv_content.strip())
- return str(csv_path)
-
-
-@pytest.fixture
-def duckdb_cluster_engine(cluster_test_data_csv):
- """DuckDB engine with cluster test data loaded."""
- engine = GIQLEngine(target_dialect="duckdb", verbose=True)
- engine.load_csv("features", cluster_test_data_csv)
- engine.register_table_schema(
- "features",
- {
- "id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "name": "VARCHAR",
- },
- genomic_column="interval",
- )
- yield engine
- engine.close()
-
-
-@pytest.fixture
-def duckdb_stranded_engine(stranded_test_data_csv):
- """DuckDB engine with stranded test data loaded."""
- engine = GIQLEngine(target_dialect="duckdb", verbose=True)
- engine.load_csv("stranded_features", stranded_test_data_csv)
- engine.register_table_schema(
- "stranded_features",
- {
- "id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "BIGINT",
- "end_pos": "BIGINT",
- "strand": "VARCHAR",
- "name": "VARCHAR",
- },
- genomic_column="interval",
- strand_col="strand",
- )
- yield engine
- engine.close()
-
-
-class TestCluster:
- """Tests for CLUSTER window function."""
-
- def test_basic_cluster(self, duckdb_cluster_engine, to_df):
- """Test basic CLUSTER operation."""
- result = to_df(
- duckdb_cluster_engine.execute("""
- SELECT
- id,
- chromosome,
- start_pos,
- end_pos,
- name,
- CLUSTER(interval) AS cluster_id
- FROM features
- ORDER BY chromosome, start_pos
- """)
- )
-
- # Expected clusters:
- # chr1: features 1,2,3 are cluster 1 (overlapping/bookended)
- # chr1: feature 4 is cluster 2 (gap at 501)
- # chr2: feature 5 is cluster 1
- # chr2: feature 6 is cluster 2 (gap at 300)
-
- assert len(result) == 6
-
- # Check chr1 clusters
- chr1_results = result[result["chromosome"] == "chr1"]
- assert chr1_results.iloc[0]["cluster_id"] == chr1_results.iloc[1]["cluster_id"]
- assert chr1_results.iloc[1]["cluster_id"] == chr1_results.iloc[2]["cluster_id"]
- assert chr1_results.iloc[2]["cluster_id"] != chr1_results.iloc[3]["cluster_id"]
-
- # Check chr2 clusters
- chr2_results = result[result["chromosome"] == "chr2"]
- assert chr2_results.iloc[0]["cluster_id"] != chr2_results.iloc[1]["cluster_id"]
-
- def test_cluster_with_distance(self, duckdb_cluster_engine, to_df):
- """Test CLUSTER with distance parameter."""
- result = to_df(
- duckdb_cluster_engine.execute("""
- SELECT
- id,
- chromosome,
- start_pos,
- end_pos,
- name,
- CLUSTER(interval, 100) AS cluster_id
- FROM features
- ORDER BY chromosome, start_pos
- """)
- )
-
- # With distance=100, chr1 features 1,2,3,4 should all be in same cluster
- # (gap of 1bp at position 501 is within 100bp tolerance)
- chr1_results = result[result["chromosome"] == "chr1"]
- cluster_ids = chr1_results["cluster_id"].tolist()
- assert len(set(cluster_ids)) == 1 # All in same cluster
-
- def test_stranded_cluster(self, duckdb_stranded_engine, to_df):
- """Test CLUSTER with stranded=true."""
- result = to_df(
- duckdb_stranded_engine.execute("""
- SELECT
- id,
- chromosome,
- start_pos,
- end_pos,
- strand,
- name,
- CLUSTER(interval, stranded=true) AS cluster_id
- FROM stranded_features
- ORDER BY chromosome, start_pos
- """)
- )
-
- # Features should cluster only within the same strand:
- # + strand: f1,f2 overlap -> cluster 1, f5 is separate -> cluster 2
- # - strand: f3,f4 overlap -> cluster 1
- # Note: cluster_id numbering restarts for each partition (strand)
-
- assert len(result) == 5
-
- # Extract features
- f1 = result[result["id"] == 1].iloc[0]
- f2 = result[result["id"] == 2].iloc[0]
- f3 = result[result["id"] == 3].iloc[0]
- f4 = result[result["id"] == 4].iloc[0]
- f5 = result[result["id"] == 5].iloc[0]
-
- # Check that f1 and f2 (both +, overlapping) have same cluster_id
- assert f1["cluster_id"] == f2["cluster_id"]
- assert f1["strand"] == "+"
- assert f2["strand"] == "+"
-
- # Check that f3 and f4 (both -, overlapping) have same cluster_id
- assert f3["cluster_id"] == f4["cluster_id"]
- assert f3["strand"] == "-"
- assert f4["strand"] == "-"
-
- # Check that f5 (+ strand, separated) has different cluster from f1/f2
- assert f5["cluster_id"] != f1["cluster_id"]
- assert f5["strand"] == "+"
-
- # Verify stranded clustering works: compare with non-stranded
- result_nonstranded = to_df(
- duckdb_stranded_engine.execute("""
- SELECT
- id,
- CLUSTER(interval) AS cluster_id
- FROM stranded_features
- ORDER BY id
- """)
- )
-
- # Without stranded, f1-f4 should all be in same cluster (overlapping)
- ns_f1 = result_nonstranded[result_nonstranded["id"] == 1].iloc[0]
- ns_f2 = result_nonstranded[result_nonstranded["id"] == 2].iloc[0]
- ns_f3 = result_nonstranded[result_nonstranded["id"] == 3].iloc[0]
- ns_f4 = result_nonstranded[result_nonstranded["id"] == 4].iloc[0]
-
- assert ns_f1["cluster_id"] == ns_f2["cluster_id"]
- assert ns_f2["cluster_id"] == ns_f3["cluster_id"]
- assert ns_f3["cluster_id"] == ns_f4["cluster_id"]
-
- def test_cluster_in_cte(self, duckdb_cluster_engine, to_df):
- """Test CLUSTER operation inside a CTE."""
- result = to_df(
- duckdb_cluster_engine.execute("""
- WITH clustered_features AS (
- SELECT
- id,
- chromosome,
- start_pos,
- end_pos,
- name,
- CLUSTER(interval) AS cluster_id
- FROM features
- )
- SELECT *
- FROM clustered_features
- WHERE cluster_id = 1
- ORDER BY chromosome, start_pos
- """)
- )
-
- # Should return features in cluster 1 from each chromosome
- assert len(result) > 0
- assert all("cluster_id" in row for _, row in result.iterrows())
-
- def test_cluster_in_cte_with_aggregation(self, duckdb_cluster_engine, to_df):
- """Test CLUSTER in CTE with aggregation in outer query."""
- result = to_df(
- duckdb_cluster_engine.execute("""
- WITH clustered_features AS (
- SELECT
- chromosome,
- start_pos,
- end_pos,
- CLUSTER(interval) AS cluster_id
- FROM features
- )
- SELECT
- chromosome,
- cluster_id,
- COUNT(*) as interval_count,
- MIN(start_pos) as min_start,
- MAX(end_pos) as max_end
- FROM clustered_features
- GROUP BY chromosome, cluster_id
- ORDER BY chromosome, cluster_id
- """)
- )
-
- # chr1 should have 2 clusters, chr2 should have 2 clusters
- assert len(result) == 4
-
- chr1_results = result[result["chromosome"] == "chr1"]
- assert len(chr1_results) == 2
- # First cluster should have 3 intervals (f1, f2, f3)
- assert chr1_results.iloc[0]["interval_count"] == 3
- # Second cluster should have 1 interval (f4)
- assert chr1_results.iloc[1]["interval_count"] == 1
-
-
-class TestMerge:
- """Tests for MERGE aggregate function."""
-
- def test_basic_merge(self, duckdb_cluster_engine, to_df):
- """Test basic MERGE operation."""
- result = to_df(
- duckdb_cluster_engine.execute("""
- SELECT MERGE(interval)
- FROM features
- """)
- )
-
- # Expected merged intervals:
- # chr1: features 1,2,3 merge into [100, 500]
- # chr1: feature 4 stays as [501, 1000]
- # chr2: feature 5 stays as [100, 200]
- # chr2: feature 6 stays as [300, 400]
-
- assert len(result) == 4
-
- # Check chr1 merged intervals
- chr1_results = result[result["chromosome"] == "chr1"].sort_values("start_pos")
- assert len(chr1_results) == 2
- assert chr1_results.iloc[0]["start_pos"] == 100
- assert chr1_results.iloc[0]["end_pos"] == 500
- assert chr1_results.iloc[1]["start_pos"] == 501
- assert chr1_results.iloc[1]["end_pos"] == 1000
-
- # Check chr2 stays separate
- chr2_results = result[result["chromosome"] == "chr2"].sort_values("start_pos")
- assert len(chr2_results) == 2
- assert chr2_results.iloc[0]["start_pos"] == 100
- assert chr2_results.iloc[0]["end_pos"] == 200
- assert chr2_results.iloc[1]["start_pos"] == 300
- assert chr2_results.iloc[1]["end_pos"] == 400
-
- def test_merge_with_distance(self, duckdb_cluster_engine, to_df):
- """Test MERGE with distance parameter."""
- result = to_df(
- duckdb_cluster_engine.execute("""
- SELECT MERGE(interval, 100)
- FROM features
- """)
- )
-
- # With distance=100, chr1 features 1-4 should merge into one interval
- chr1_results = result[result["chromosome"] == "chr1"]
- assert len(chr1_results) == 1
- assert chr1_results.iloc[0]["start_pos"] == 100
- assert chr1_results.iloc[0]["end_pos"] == 1000
-
- def test_merge_with_aggregation(self, duckdb_cluster_engine, to_df):
- """Test MERGE with additional aggregation columns."""
- result = to_df(
- duckdb_cluster_engine.execute("""
- SELECT MERGE(interval), COUNT(*) as feature_count
- FROM features
- """)
- )
-
- # chr1 should have 2 merged intervals with counts
- chr1_results = result[result["chromosome"] == "chr1"].sort_values("start_pos")
- assert len(chr1_results) == 2
- assert chr1_results.iloc[0]["feature_count"] == 3 # f1, f2, f3 merged
- assert chr1_results.iloc[1]["feature_count"] == 1 # f4 alone
-
- def test_stranded_merge(self, duckdb_stranded_engine, to_df):
- """Test MERGE with stranded=true."""
- result = to_df(
- duckdb_stranded_engine.execute("""
- SELECT MERGE(interval, stranded=true)
- FROM stranded_features
- """)
- )
-
- # + strand: f1,f2 merge -> [100,250], f5 stays -> [400,500]
- # - strand: f3,f4 merge -> [200,350]
- assert len(result) == 3
-
- plus_strand = result[result["strand"] == "+"].sort_values("start_pos")
- assert len(plus_strand) == 2
- assert plus_strand.iloc[0]["start_pos"] == 100
- assert plus_strand.iloc[0]["end_pos"] == 250
- assert plus_strand.iloc[1]["start_pos"] == 400
- assert plus_strand.iloc[1]["end_pos"] == 500
-
- minus_strand = result[result["strand"] == "-"]
- assert len(minus_strand) == 1
- assert minus_strand.iloc[0]["start_pos"] == 200
- assert minus_strand.iloc[0]["end_pos"] == 350
-
- def test_merge_in_cte(self, duckdb_cluster_engine, to_df):
- """Test MERGE operation inside a CTE."""
- result = to_df(
- duckdb_cluster_engine.execute("""
- WITH merged_intervals AS (
- SELECT MERGE(interval)
- FROM features
- )
- SELECT *
- FROM merged_intervals
- ORDER BY chromosome, start_pos
- """)
- )
-
- # Should have same results as basic merge
- assert len(result) == 4
-
- chr1_results = result[result["chromosome"] == "chr1"].sort_values("start_pos")
- assert len(chr1_results) == 2
- assert chr1_results.iloc[0]["start_pos"] == 100
- assert chr1_results.iloc[0]["end_pos"] == 500
-
- def test_merge_in_cte_with_aggregation_and_filter(
- self, duckdb_cluster_engine, to_df
- ):
- """Test MERGE in CTE with aggregation and filtering in outer query."""
- result = to_df(
- duckdb_cluster_engine.execute("""
- WITH merged_intervals AS (
- SELECT
- MERGE(interval),
- COUNT(*) as interval_count
- FROM features
- )
- SELECT *
- FROM merged_intervals
- WHERE interval_count > 1
- ORDER BY chromosome, start_pos
- """)
- )
-
- # Only chr1's first merged interval has count > 1 (3 intervals merged)
- assert len(result) == 1
- assert result.iloc[0]["chromosome"] == "chr1"
- assert result.iloc[0]["start_pos"] == 100
- assert result.iloc[0]["end_pos"] == 500
- assert result.iloc[0]["interval_count"] == 3
-
- def test_merge_in_cte_with_distance_and_aggregation(
- self, duckdb_cluster_engine, to_df
- ):
- """Test MERGE with distance parameter in CTE with aggregation."""
- result = to_df(
- duckdb_cluster_engine.execute("""
- WITH merged_intervals AS (
- SELECT
- MERGE(interval, 100),
- COUNT(*) as interval_count,
- AVG(id) as avg_id
- FROM features
- )
- SELECT *
- FROM merged_intervals
- WHERE interval_count >= 2
- ORDER BY chromosome, start_pos
- """)
- )
-
- # With distance=100, chr1 all 4 features merge, chr2 features also merge
- # (gap between chr2 features is exactly 100bp)
- assert len(result) == 2
-
- # Check chr1 merged interval
- chr1_result = result[result["chromosome"] == "chr1"].iloc[0]
- assert chr1_result["interval_count"] == 4
- assert chr1_result["start_pos"] == 100
- assert chr1_result["end_pos"] == 1000
-
- # Check chr2 merged interval
- chr2_result = result[result["chromosome"] == "chr2"].iloc[0]
- assert chr2_result["interval_count"] == 2
- assert chr2_result["start_pos"] == 100
- assert chr2_result["end_pos"] == 400
diff --git a/tests/test_distance_transpilation.py b/tests/test_distance_transpilation.py
index 77405cc..7b79011 100644
--- a/tests/test_distance_transpilation.py
+++ b/tests/test_distance_transpilation.py
@@ -1,14 +1,12 @@
"""Transpilation tests for DISTANCE operator SQL generation.
-Tests verify that DISTANCE() is correctly transpiled to SQL CASE expressions
-across different SQL dialects (DuckDB, SQLite, PostgreSQL).
+Tests verify that DISTANCE() is correctly transpiled to SQL CASE expressions.
"""
from sqlglot import parse_one
from giql.dialect import GIQLDialect
from giql.generators import BaseGIQLGenerator
-from giql.generators import GIQLDuckDBGenerator
class TestDistanceTranspilation:
@@ -26,10 +24,10 @@ def test_distance_transpilation_duckdb(self):
"""
ast = parse_one(sql, dialect=GIQLDialect)
- generator = GIQLDuckDBGenerator()
+ generator = BaseGIQLGenerator()
output = generator.generate(ast)
- expected = """SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" THEN 0 WHEN a."end_pos" <= b."start_pos" THEN (b."start_pos" - a."end_pos") ELSE (a."start_pos" - b."end_pos") END AS dist FROM features_a AS a CROSS JOIN features_b AS b"""
+ expected = """SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL WHEN a."start" < b."end" AND a."end" > b."start" THEN 0 WHEN a."end" <= b."start" THEN (b."start" - a."end") ELSE (a."start" - b."end") END AS dist FROM features_a AS a CROSS JOIN features_b AS b"""
assert output == expected, f"Expected:\n{expected}\n\nGot:\n{output}"
@@ -48,7 +46,7 @@ def test_distance_transpilation_sqlite(self):
generator = BaseGIQLGenerator()
output = generator.generate(ast)
- expected = """SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" THEN 0 WHEN a."end_pos" <= b."start_pos" THEN (b."start_pos" - a."end_pos") ELSE (a."start_pos" - b."end_pos") END AS dist FROM features_a AS a, features_b AS b"""
+ expected = """SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL WHEN a."start" < b."end" AND a."end" > b."start" THEN 0 WHEN a."end" <= b."start" THEN (b."start" - a."end") ELSE (a."start" - b."end") END AS dist FROM features_a AS a, features_b AS b"""
assert output == expected, f"Expected:\n{expected}\n\nGot:\n{output}"
@@ -67,7 +65,7 @@ def test_distance_transpilation_postgres(self):
generator = BaseGIQLGenerator()
output = generator.generate(ast)
- expected = """SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" THEN 0 WHEN a."end_pos" <= b."start_pos" THEN (b."start_pos" - a."end_pos") ELSE (a."start_pos" - b."end_pos") END AS dist FROM features_a AS a CROSS JOIN features_b AS b"""
+ expected = """SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL WHEN a."start" < b."end" AND a."end" > b."start" THEN 0 WHEN a."end" <= b."start" THEN (b."start" - a."end") ELSE (a."start" - b."end") END AS dist FROM features_a AS a CROSS JOIN features_b AS b"""
assert output == expected, f"Expected:\n{expected}\n\nGot:\n{output}"
@@ -84,16 +82,16 @@ def test_distance_transpilation_signed_duckdb(self):
"""
ast = parse_one(sql, dialect=GIQLDialect)
- generator = GIQLDuckDBGenerator()
+ generator = BaseGIQLGenerator()
output = generator.generate(ast)
# Signed distance: upstream (B before A) returns negative,
# downstream (B after A) returns positive
expected = (
- 'SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL '
- 'WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" THEN 0 '
- 'WHEN a."end_pos" <= b."start_pos" THEN (b."start_pos" - a."end_pos") '
- 'ELSE -(a."start_pos" - b."end_pos") END AS dist '
+ 'SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL '
+ 'WHEN a."start" < b."end" AND a."end" > b."start" THEN 0 '
+ 'WHEN a."end" <= b."start" THEN (b."start" - a."end") '
+ 'ELSE -(a."start" - b."end") END AS dist '
"FROM features_a AS a CROSS JOIN features_b AS b"
)
diff --git a/tests/test_distance_udf.py b/tests/test_distance_udf.py
index 3048c33..ee8f624 100644
--- a/tests/test_distance_udf.py
+++ b/tests/test_distance_udf.py
@@ -26,9 +26,9 @@ def test_overlapping_intervals_return_zero(self):
SELECT
DISTANCE(a.interval, b.interval) as distance
FROM
- (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos) a
+ (SELECT 'chr1' as chrom, 100 as start, 200 as end) a
CROSS JOIN
- (SELECT 'chr1' as chromosome, 150 as start_pos, 250 as end_pos) b
+ (SELECT 'chr1' as chrom, 150 as start, 250 as end) b
"""
# Parse and generate SQL
@@ -64,9 +64,9 @@ def test_non_overlapping_intervals_return_positive_distance(self):
SELECT
DISTANCE(a.interval, b.interval) as distance
FROM
- (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos) a
+ (SELECT 'chr1' as chrom, 100 as start, 200 as end) a
CROSS JOIN
- (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos) b
+ (SELECT 'chr1' as chrom, 300 as start, 400 as end) b
"""
ast = parse_one(sql, dialect=GIQLDialect)
@@ -91,9 +91,9 @@ def test_different_chromosomes_return_null(self):
SELECT
DISTANCE(a.interval, b.interval) as distance
FROM
- (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos) a
+ (SELECT 'chr1' as chrom, 100 as start, 200 as end) a
CROSS JOIN
- (SELECT 'chr2' as chromosome, 150 as start_pos, 250 as end_pos) b
+ (SELECT 'chr2' as chrom, 150 as start, 250 as end) b
"""
ast = parse_one(sql, dialect=GIQLDialect)
@@ -122,9 +122,9 @@ def test_adjacent_bookended_intervals_return_zero(self):
SELECT
DISTANCE(a.interval, b.interval) as distance
FROM
- (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos) a
+ (SELECT 'chr1' as chrom, 100 as start, 200 as end) a
CROSS JOIN
- (SELECT 'chr1' as chromosome, 200 as start_pos, 300 as end_pos) b
+ (SELECT 'chr1' as chrom, 200 as start, 300 as end) b
"""
ast = parse_one(sql, dialect=GIQLDialect)
@@ -154,9 +154,9 @@ def test_zero_width_intervals_point_features(self):
SELECT
DISTANCE(a.interval, b.interval) as distance
FROM
- (SELECT 'chr1' as chromosome, 150 as start_pos, 150 as end_pos) a
+ (SELECT 'chr1' as chrom, 150 as start, 150 as end) a
CROSS JOIN
- (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos) b
+ (SELECT 'chr1' as chrom, 300 as start, 400 as end) b
"""
ast = parse_one(sql, dialect=GIQLDialect)
@@ -185,9 +185,9 @@ def test_stranded_same_strand_plus(self):
SELECT
DISTANCE(a.interval, b.interval, stranded=true) as distance
FROM
- (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '+' as strand) a
+ (SELECT 'chr1' as chrom, 100 as start, 200 as end, '+' as strand) a
CROSS JOIN
- (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '+' as strand) b
+ (SELECT 'chr1' as chrom, 300 as start, 400 as end, '+' as strand) b
"""
ast = parse_one(sql, dialect=GIQLDialect)
@@ -212,9 +212,9 @@ def test_stranded_same_strand_minus(self):
SELECT
DISTANCE(a.interval, b.interval, stranded=true) as distance
FROM
- (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '-' as strand) a
+ (SELECT 'chr1' as chrom, 100 as start, 200 as end, '-' as strand) a
CROSS JOIN
- (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '-' as strand) b
+ (SELECT 'chr1' as chrom, 300 as start, 400 as end, '-' as strand) b
"""
ast = parse_one(sql, dialect=GIQLDialect)
@@ -239,9 +239,9 @@ def test_stranded_different_strands_calculates_distance(self):
SELECT
DISTANCE(a.interval, b.interval, stranded=true) as distance
FROM
- (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '+' as strand) a
+ (SELECT 'chr1' as chrom, 100 as start, 200 as end, '+' as strand) a
CROSS JOIN
- (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '-' as strand) b
+ (SELECT 'chr1' as chrom, 300 as start, 400 as end, '-' as strand) b
"""
ast = parse_one(sql, dialect=GIQLDialect)
@@ -266,9 +266,9 @@ def test_stranded_different_strands_minus_first(self):
SELECT
DISTANCE(a.interval, b.interval, stranded=true) as distance
FROM
- (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '-' as strand) a
+ (SELECT 'chr1' as chrom, 100 as start, 200 as end, '-' as strand) a
CROSS JOIN
- (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '+' as strand) b
+ (SELECT 'chr1' as chrom, 300 as start, 400 as end, '+' as strand) b
"""
ast = parse_one(sql, dialect=GIQLDialect)
@@ -293,9 +293,9 @@ def test_stranded_dot_strand_returns_null(self):
SELECT
DISTANCE(a.interval, b.interval, stranded=true) as distance
FROM
- (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '.' as strand) a
+ (SELECT 'chr1' as chrom, 100 as start, 200 as end, '.' as strand) a
CROSS JOIN
- (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '.' as strand) b
+ (SELECT 'chr1' as chrom, 300 as start, 400 as end, '.' as strand) b
"""
ast = parse_one(sql, dialect=GIQLDialect)
@@ -320,9 +320,9 @@ def test_stranded_question_mark_strand_returns_null(self):
SELECT
DISTANCE(a.interval, b.interval, stranded=true) as distance
FROM
- (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '?' as strand) a
+ (SELECT 'chr1' as chrom, 100 as start, 200 as end, '?' as strand) a
CROSS JOIN
- (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '+' as strand) b
+ (SELECT 'chr1' as chrom, 300 as start, 400 as end, '+' as strand) b
"""
ast = parse_one(sql, dialect=GIQLDialect)
@@ -347,9 +347,9 @@ def test_stranded_null_strand_returns_null(self):
SELECT
DISTANCE(a.interval, b.interval, stranded=true) as distance
FROM
- (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, NULL as strand) a
+ (SELECT 'chr1' as chrom, 100 as start, 200 as end, NULL as strand) a
CROSS JOIN
- (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '+' as strand) b
+ (SELECT 'chr1' as chrom, 300 as start, 400 as end, '+' as strand) b
"""
ast = parse_one(sql, dialect=GIQLDialect)
@@ -374,9 +374,9 @@ def test_stranded_overlapping_intervals_minus_strand(self):
SELECT
DISTANCE(a.interval, b.interval, stranded=true) as distance
FROM
- (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '-' as strand) a
+ (SELECT 'chr1' as chrom, 100 as start, 200 as end, '-' as strand) a
CROSS JOIN
- (SELECT 'chr1' as chromosome, 150 as start_pos, 250 as end_pos, '-' as strand) b
+ (SELECT 'chr1' as chrom, 150 as start, 250 as end, '-' as strand) b
"""
ast = parse_one(sql, dialect=GIQLDialect)
diff --git a/tests/test_engine.py b/tests/test_engine.py
deleted file mode 100644
index 2cff3a1..0000000
--- a/tests/test_engine.py
+++ /dev/null
@@ -1,480 +0,0 @@
-import tempfile
-
-from hypothesis import given
-from hypothesis import settings
-from hypothesis import strategies as st
-
-from giql import GIQLEngine
-
-
-class TestGIQLEngine:
- def test_engine_initialization_duckdb(self):
- """
- GIVEN GIQLEngine with duckdb dialect
- WHEN initializing engine
- THEN should create connection successfully
- """
- engine = GIQLEngine(target_dialect="duckdb")
- assert engine.target_dialect == "duckdb"
- assert engine.conn is not None
- engine.close()
-
- def test_engine_initialization_sqlite(self):
- """
- GIVEN GIQLEngine with sqlite dialect
- WHEN initializing engine
- THEN should create connection successfully
- """
- with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
- engine = GIQLEngine(target_dialect="sqlite", db_path=tmp.name)
- assert engine.target_dialect == "sqlite"
- assert engine.conn is not None
- engine.close()
-
- def test_engine_context_manager(self):
- """
- GIVEN GIQLEngine used as context manager
- WHEN exiting context
- THEN should close connection automatically
- """
- with GIQLEngine() as engine:
- assert engine.conn is not None
-
- def test_load_csv_and_query_duckdb(self, tmp_path, to_df):
- """
- GIVEN CSV data loaded into DuckDB
- WHEN executing GIQL query
- THEN should return correct results
- """
- # Create sample CSV
- csv_content = """id,chromosome,start_pos,end_pos,ref,alt
-1,chr1,1500,1600,A,T
-2,chr1,10500,10600,G,C
-3,chr2,500,600,C,G
-"""
- csv_path = tmp_path / "variants.csv"
- csv_path.write_text(csv_content)
-
- with GIQLEngine(target_dialect="duckdb") as engine:
- engine.load_csv("variants", str(csv_path))
-
- # Query using INTERSECTS
- cursor = engine.execute(
- "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'"
- )
- result = to_df(cursor)
-
- assert len(result) == 1
- assert result.iloc[0]["id"] == 1
-
- def test_load_csv_and_query_sqlite(self, tmp_path, to_df):
- """
- GIVEN CSV data loaded into SQLite
- WHEN executing GIQL query
- THEN should return correct results
- """
- # Create sample CSV
- csv_content = """id,chromosome,start_pos,end_pos,ref,alt
-1,chr1,1500,1600,A,T
-2,chr1,10500,10600,G,C
-3,chr2,500,600,C,G
-"""
- csv_path = tmp_path / "variants.csv"
- csv_path.write_text(csv_content)
-
- with GIQLEngine(target_dialect="sqlite") as engine:
- engine.load_csv("variants", str(csv_path))
-
- # Query using INTERSECTS
- result = to_df(
- engine.execute(
- "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'"
- )
- )
-
- assert len(result) == 1
- assert result.iloc[0]["id"] == 1
-
- def test_intersects_any_query(self, tmp_path, to_df):
- """
- GIVEN variants data
- WHEN querying with INTERSECTS ANY
- THEN should return variants overlapping any range
- """
- csv_content = """id,chromosome,start_pos,end_pos
-1,chr1,1500,1600
-2,chr1,10500,10600
-3,chr2,500,600
-"""
- csv_path = tmp_path / "variants.csv"
- csv_path.write_text(csv_content)
-
- with GIQLEngine(target_dialect="duckdb") as engine:
- engine.load_csv("variants", str(csv_path))
-
- result = to_df(
- engine.execute(
- "SELECT * FROM variants "
- "WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:400-700')"
- )
- )
-
- assert len(result) == 2
- assert set(result["id"]) == {1, 3}
-
- def test_contains_query(self, tmp_path, to_df):
- """
- GIVEN variants data
- WHEN querying with CONTAINS
- THEN should return variants containing the point
- """
- csv_content = """id,chromosome,start_pos,end_pos
-1,chr1,1500,1600
-2,chr1,10500,10600
-"""
- csv_path = tmp_path / "variants.csv"
- csv_path.write_text(csv_content)
-
- with GIQLEngine(target_dialect="duckdb") as engine:
- engine.load_csv("variants", str(csv_path))
-
- result = to_df(
- engine.execute(
- "SELECT * FROM variants WHERE interval CONTAINS 'chr1:1550'"
- )
- )
-
- assert len(result) == 1
- assert result.iloc[0]["id"] == 1
-
- def test_within_query(self, tmp_path, to_df):
- """
- GIVEN variants data
- WHEN querying with WITHIN
- THEN should return variants within the range
- """
- csv_content = """id,chromosome,start_pos,end_pos
-1,chr1,1500,1600
-2,chr1,10500,10600
-3,chr1,15000,15100
-"""
- csv_path = tmp_path / "variants.csv"
- csv_path.write_text(csv_content)
-
- with GIQLEngine(target_dialect="duckdb") as engine:
- engine.load_csv("variants", str(csv_path))
-
- result = to_df(
- engine.execute(
- "SELECT * FROM variants WHERE interval WITHIN 'chr1:1000-11000'"
- )
- )
-
- assert len(result) == 2
- assert set(result["id"]) == {1, 2}
-
- def test_verbose_mode(self, tmp_path, to_df):
- """
- GIVEN engine with verbose mode
- WHEN executing query
- THEN should print transpiled SQL
- """
- csv_content = """id,chromosome,start_pos,end_pos
-1,chr1,1500,1600
-"""
- csv_path = tmp_path / "variants.csv"
- csv_path.write_text(csv_content)
-
- with GIQLEngine(target_dialect="duckdb", verbose=True) as engine:
- engine.load_csv("variants", str(csv_path))
- result = to_df(
- engine.execute(
- "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'"
- )
- )
- assert len(result) == 1
-
- @given(
- chrom_col=st.sampled_from(["chromosome", "chr", "chrom", "contig", "seqname"]),
- start_col=st.sampled_from(["start_pos", "start", "begin", "pos", "chromStart"]),
- end_col=st.sampled_from(["end_pos", "end", "stop", "chromEnd"]),
- strand_col=st.sampled_from(["strand", "str", "orientation", "direction"]),
- )
- def test_custom_genomic_columns(
- self, chrom_col, start_col, end_col, strand_col, to_df
- ):
- """
- GIVEN CSV data with custom genomic column names
- WHEN registering schema with custom column mappings
- THEN queries should work correctly with any valid column names
- """
- # Create temporary directory and CSV with custom column names
- with tempfile.TemporaryDirectory() as tmp_dir:
- csv_content = f"""id,{chrom_col},{start_col},{end_col},{strand_col},name
-1,chr1,1500,1600,+,variant1
-2,chr1,10500,10600,-,variant2
-3,chr2,500,600,+,variant3
-4,chr1,1400,1700,+,variant4
-"""
- csv_path = f"{tmp_dir}/custom_variants.csv"
- with open(csv_path, "w") as f:
- f.write(csv_content)
-
- with GIQLEngine(target_dialect="duckdb", verbose=False) as engine:
- engine.load_csv("variants", csv_path)
-
- # Register schema with custom column names
- engine.register_table_schema(
- "variants",
- {
- "id": "INTEGER",
- chrom_col: "VARCHAR",
- start_col: "BIGINT",
- end_col: "BIGINT",
- strand_col: "VARCHAR",
- "name": "VARCHAR",
- },
- genomic_column="interval",
- chrom_col=chrom_col,
- start_col=start_col,
- end_col=end_col,
- strand_col=strand_col,
- )
-
- # Test INTERSECTS query
- result = to_df(
- engine.execute(
- "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'"
- )
- )
- assert len(result) == 2
- assert set(result["id"]) == {1, 4}
-
- # Test CLUSTER query (uses genomic columns internally)
- result = to_df(
- engine.execute(
- "SELECT *, CLUSTER(interval) AS cluster_id FROM variants ORDER BY id"
- )
- )
- assert len(result) == 4
- # Variants 1 and 4 should cluster together (overlapping on chr1)
- assert result.iloc[0]["cluster_id"] == result.iloc[3]["cluster_id"]
- # Variant 2 should be in different cluster (no overlap)
- assert result.iloc[1]["cluster_id"] != result.iloc[0]["cluster_id"]
-
- # Test stranded CLUSTER query
- result = to_df(
- engine.execute("""SELECT *, CLUSTER(interval, stranded=TRUE) AS cluster_id
- FROM variants ORDER BY id""")
- )
- assert len(result) == 4
- # With stranded=TRUE, variants 1 and 4 should cluster together (both + and overlapping)
- assert result.iloc[0]["cluster_id"] == result.iloc[3]["cluster_id"]
- # Note: cluster_ids are independent per (chromosome, strand) partition
- # So variants on different strands CAN have the same cluster_id number
- assert "cluster_id" in result.columns
-
- # Test MERGE query
- result = to_df(engine.execute("SELECT MERGE(interval) FROM variants"))
- # Should merge overlapping intervals
- assert len(result) >= 1
-
- @given(
- # Table 1 (variants) column names
- v_chrom_col=st.sampled_from(["chromosome", "chr", "chrom"]),
- v_start_col=st.sampled_from(["start_pos", "start", "begin"]),
- v_end_col=st.sampled_from(["end_pos", "end", "stop"]),
- # Table 2 (features) column names (use different names to ensure they're distinct)
- f_chrom_col=st.sampled_from(["seqname", "contig", "chr_name"]),
- f_start_col=st.sampled_from(["pos", "chromStart", "feature_start"]),
- f_end_col=st.sampled_from(["chromEnd", "feature_end", "terminus"]),
- )
- @settings(deadline=None)
- def test_join_with_different_schemas(
- self,
- v_chrom_col,
- v_start_col,
- v_end_col,
- f_chrom_col,
- f_start_col,
- f_end_col,
- to_df,
- ):
- """
- GIVEN two tables with different custom genomic column schemas
- WHEN joining them using INTERSECTS
- THEN queries should correctly use each table's custom column names
- """
- with tempfile.TemporaryDirectory() as tmp_dir:
- # Create variants table CSV
- variants_csv = f"""id,{v_chrom_col},{v_start_col},{v_end_col},name
-1,chr1,1500,1600,var1
-2,chr1,10500,10600,var2
-3,chr2,500,600,var3
-"""
- variants_path = f"{tmp_dir}/variants.csv"
- with open(variants_path, "w") as f:
- f.write(variants_csv)
-
- # Create features table CSV with DIFFERENT column names
- features_csv = f"""id,{f_chrom_col},{f_start_col},{f_end_col},type
-1,chr1,1000,2000,exon
-2,chr1,10000,11000,intron
-3,chr2,400,700,promoter
-"""
- features_path = f"{tmp_dir}/features.csv"
- with open(features_path, "w") as f:
- f.write(features_csv)
-
- with GIQLEngine(target_dialect="duckdb", verbose=False) as engine:
- # Load both tables
- engine.load_csv("variants", variants_path)
- engine.load_csv("features", features_path)
-
- # Register schemas with different column names
- engine.register_table_schema(
- "variants",
- {
- "id": "INTEGER",
- v_chrom_col: "VARCHAR",
- v_start_col: "BIGINT",
- v_end_col: "BIGINT",
- "name": "VARCHAR",
- },
- genomic_column="interval",
- chrom_col=v_chrom_col,
- start_col=v_start_col,
- end_col=v_end_col,
- )
-
- engine.register_table_schema(
- "features",
- {
- "id": "INTEGER",
- f_chrom_col: "VARCHAR",
- f_start_col: "BIGINT",
- f_end_col: "BIGINT",
- "type": "VARCHAR",
- },
- genomic_column="region",
- chrom_col=f_chrom_col,
- start_col=f_start_col,
- end_col=f_end_col,
- )
-
- # Test JOIN with INTERSECTS on both tables
- result = to_df(
- engine.execute("""
- SELECT v.name, f.type
- FROM variants v
- JOIN features f ON v.interval INTERSECTS f.region
- ORDER BY v.id
- """)
- )
-
- # Variant 1 (chr1:1500-1600) intersects Feature 1 (chr1:1000-2000)
- # Variant 2 (chr1:10500-10600) intersects Feature 2 (chr1:10000-11000)
- # Variant 3 (chr2:500-600) intersects Feature 3 (chr2:400-700)
- assert len(result) == 3
- assert list(result["name"]) == ["var1", "var2", "var3"]
- assert list(result["type"]) == ["exon", "intron", "promoter"]
-
- # Test LEFT JOIN to verify schema resolution works
- result = to_df(
- engine.execute("""
- SELECT v.id, v.name, f.type
- FROM variants v
- LEFT JOIN features f ON v.interval INTERSECTS f.region
- WHERE v.id = 1
- """)
- )
- assert len(result) == 1
- assert result.iloc[0]["name"] == "var1"
- assert result.iloc[0]["type"] == "exon"
-
- # Test WHERE clause with INTERSECTS on specific table
- result = to_df(
- engine.execute("""
- SELECT v.id, v.name
- FROM variants v, features f
- WHERE v.interval INTERSECTS f.region
- AND v.interval INTERSECTS 'chr1:1000-2000'
- """)
- )
- # Only variant 1 intersects both feature and the specified range
- assert len(result) == 1
- assert result.iloc[0]["name"] == "var1"
-
- def test_transpile_returns_sql_string(self):
- """
- GIVEN GIQLEngine with a GIQL query
- WHEN calling transpile()
- THEN should return SQL string without executing it
- """
- with GIQLEngine(target_dialect="duckdb") as engine:
- sql = engine.transpile(
- "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'"
- )
-
- assert isinstance(sql, str)
- assert len(sql) > 0
- assert "SELECT" in sql.upper()
- # Should contain genomic comparison logic
- assert "chromosome" in sql or "start_pos" in sql or "end_pos" in sql
-
- def test_transpile_different_dialects(self):
- """
- GIVEN GIQLEngine with different SQL dialects
- WHEN calling transpile()
- THEN should return SQL appropriate for each dialect
- """
- query = "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'"
-
- for dialect in ["duckdb", "sqlite"]:
- with GIQLEngine(target_dialect=dialect) as engine:
- sql = engine.transpile(query)
- assert isinstance(sql, str)
- assert len(sql) > 0
- assert "SELECT" in sql.upper()
-
- def test_transpile_verbose_mode(self, tmp_path, capsys):
- """
- GIVEN GIQLEngine with verbose mode enabled
- WHEN calling transpile()
- THEN should print transpilation details
- """
- with GIQLEngine(target_dialect="duckdb", verbose=True) as engine:
- sql = engine.transpile(
- "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'"
- )
-
- captured = capsys.readouterr()
- assert "Target Dialect: duckdb" in captured.out
- assert "Original GIQL:" in captured.out
- assert "Transpiled SQL:" in captured.out
- assert isinstance(sql, str)
-
- def test_execute_uses_transpile(self, tmp_path, to_df):
- """
- GIVEN GIQLEngine after refactoring
- WHEN calling execute()
- THEN should use transpile() internally and execute correctly
- """
- csv_content = """id,chromosome,start_pos,end_pos
-1,chr1,1500,1600
-2,chr1,10500,10600
-"""
- csv_path = tmp_path / "variants.csv"
- csv_path.write_text(csv_content)
-
- with GIQLEngine(target_dialect="duckdb") as engine:
- engine.load_csv("variants", str(csv_path))
-
- # execute() should internally call transpile()
- cursor = engine.execute(
- "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'"
- )
- result = to_df(cursor)
-
- assert len(result) == 1
- assert result.iloc[0]["id"] == 1
diff --git a/tests/test_nearest_edge_cases.py b/tests/test_nearest_edge_cases.py
deleted file mode 100644
index 31556da..0000000
--- a/tests/test_nearest_edge_cases.py
+++ /dev/null
@@ -1,633 +0,0 @@
-"""Edge case tests for NEAREST operator.
-
-Tests verify correct handling of boundary conditions, error cases,
-and unusual inputs for the NEAREST operator.
-"""
-
-import pytest
-from hypothesis import assume
-from hypothesis import given
-from hypothesis import strategies as st
-
-from giql import GIQLEngine
-
-
-@pytest.fixture
-def duckdb_engine_with_edge_case_data():
- """Create DuckDB engine with data designed for edge case testing."""
- engine = GIQLEngine(target_dialect="duckdb")
-
- # Create peaks table
- engine.conn.execute("""
- CREATE TABLE peaks (
- peak_id INTEGER,
- chromosome VARCHAR,
- start_pos INTEGER,
- end_pos INTEGER
- )
- """)
-
- # Create genes table
- engine.conn.execute("""
- CREATE TABLE genes (
- gene_id INTEGER,
- gene_name VARCHAR,
- chromosome VARCHAR,
- start_pos INTEGER,
- end_pos INTEGER
- )
- """)
-
- # Insert test data
- # Peak 1: chr1:1000-1100
- # Peak 2: chr2:5000-5100 (different chromosome, no genes)
- # Peak 3: chr1:10000-10100
- engine.conn.execute("""
- INSERT INTO peaks VALUES
- (1, 'chr1', 1000, 1100),
- (2, 'chr2', 5000, 5100),
- (3, 'chr1', 10000, 10100)
- """)
-
- # Genes with specific distance relationships
- # GENE_A and GENE_B are both 500bp from Peak 1 (tie scenario)
- # GENE_C overlaps Peak 1 (distance=0)
- # GENE_D, GENE_E, GENE_F on chr1 but far from Peak 3
- engine.conn.execute("""
- INSERT INTO genes VALUES
- (1, 'GENE_A', 'chr1', 1600, 1700),
- (2, 'GENE_B', 'chr1', 400, 500),
- (3, 'GENE_C', 'chr1', 1050, 1150),
- (4, 'GENE_D', 'chr1', 10500, 10600),
- (5, 'GENE_E', 'chr1', 11000, 11100),
- (6, 'GENE_F', 'chr1', 12000, 12100)
- """)
-
- # Register schema
- engine.register_table_schema(
- "peaks",
- {
- "peak_id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "INTEGER",
- "end_pos": "INTEGER",
- },
- genomic_column="interval",
- )
- engine.register_table_schema(
- "genes",
- {
- "gene_id": "INTEGER",
- "gene_name": "VARCHAR",
- "chromosome": "VARCHAR",
- "start_pos": "INTEGER",
- "end_pos": "INTEGER",
- },
- genomic_column="interval",
- )
-
- return engine
-
-
-class TestNearestEdgeCases:
- """Edge case tests for NEAREST operator."""
-
- def test_k_equals_zero(self, duckdb_engine_with_edge_case_data):
- """
- GIVEN a NEAREST query with k=0
- WHEN executing the query
- THEN should return no results (LIMIT 0)
- """
- engine = duckdb_engine_with_edge_case_data
-
- cursor = engine.execute("""
- SELECT
- peaks.peak_id,
- nearest.gene_name
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=0) AS nearest
- WHERE peaks.peak_id = 1
- """)
-
- rows = cursor.fetchall()
- assert len(rows) == 0, "k=0 should return no results"
-
- def test_ties_multiple_features_same_distance(
- self, duckdb_engine_with_edge_case_data
- ):
- """
- GIVEN multiple genes at the same distance from a peak
- WHEN querying for k=1 nearest
- THEN should return at least 1 result (behavior may vary for ties)
- """
- engine = duckdb_engine_with_edge_case_data
-
- cursor = engine.execute("""
- SELECT
- peaks.peak_id,
- nearest.gene_name,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest
- WHERE peaks.peak_id = 1
- ORDER BY nearest.distance, nearest.gene_name
- """)
-
- rows = cursor.fetchall()
-
- # Should have at least 1 result
- assert len(rows) >= 1, "Should return at least one result for k=1"
-
- # All results should be at the same distance (ties)
- # Note: GENE_A and GENE_B are both 500bp away, GENE_C overlaps (0bp)
- # So the closest should be GENE_C at distance 0
- assert rows[0][1] == "GENE_C", (
- f"Closest gene should be GENE_C (overlapping), got {rows[0][1]}"
- )
- assert rows[0][2] == 0, f"Distance should be 0 (overlap), got {rows[0][2]}"
-
- def test_empty_result_set_different_chromosome(
- self, duckdb_engine_with_edge_case_data
- ):
- """
- GIVEN a peak on a chromosome with no genes
- WHEN querying for nearest genes
- THEN should return empty result set
- """
- engine = duckdb_engine_with_edge_case_data
-
- cursor = engine.execute("""
- SELECT
- peaks.peak_id,
- nearest.gene_name,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=10) AS nearest
- WHERE peaks.peak_id = 2
- """)
-
- rows = cursor.fetchall()
-
- # Peak 2 is on chr2, but all genes are on chr1
- # Should return empty result set
- assert len(rows) == 0, (
- "Should return empty result for peak on chromosome with no genes"
- )
-
- def test_overlapping_features_distance_zero(self, duckdb_engine_with_edge_case_data):
- """
- GIVEN a gene that overlaps a peak
- WHEN querying for nearest genes
- THEN should return distance=0 for overlapping gene
- """
- engine = duckdb_engine_with_edge_case_data
-
- cursor = engine.execute("""
- SELECT
- peaks.peak_id,
- nearest.gene_name,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=5) AS nearest
- WHERE peaks.peak_id = 1
- ORDER BY nearest.distance
- """)
-
- rows = cursor.fetchall()
-
- # GENE_C (chr1:1050-1150) overlaps Peak 1 (chr1:1000-1100)
- assert len(rows) > 0, "Should find genes"
-
- # First result should be the overlapping gene with distance 0
- assert rows[0][1] == "GENE_C", (
- f"First result should be GENE_C (overlapping), got {rows[0][1]}"
- )
- assert rows[0][2] == 0, (
- f"Distance should be 0 for overlapping gene, got {rows[0][2]}"
- )
-
- def test_missing_reference_in_standalone_mode(
- self, duckdb_engine_with_edge_case_data
- ):
- """
- GIVEN a standalone NEAREST query without reference parameter
- WHEN parsing/executing the query
- THEN should raise an error (reference is required in standalone mode)
- """
- engine = duckdb_engine_with_edge_case_data
-
- # Standalone mode (FROM NEAREST(...)) without reference parameter
- # This should fail because we can't determine the reference point
- with pytest.raises(Exception) as exc_info:
- engine.execute("""
- SELECT *
- FROM NEAREST(genes, k=3)
- """)
-
- # Should get an error about missing reference
- # The exact error message may vary, but it should mention reference
- error_msg = str(exc_info.value).lower()
- # Could be a ValueError, AttributeError, or SQL error depending on where it fails
- # Just verify it fails - the specific error type will be improved in T065
-
- def test_missing_target_table_in_schema(self, duckdb_engine_with_edge_case_data):
- """
- GIVEN a NEAREST query referencing a non-existent table
- WHEN executing the query
- THEN should raise an error about missing table
- """
- engine = duckdb_engine_with_edge_case_data
-
- # Query references 'nonexistent_table' which doesn't exist
- with pytest.raises(Exception) as exc_info:
- engine.execute("""
- SELECT *
- FROM peaks
- CROSS JOIN LATERAL NEAREST(nonexistent_table, reference=peaks.interval, k=3) AS nearest
- """)
-
- # Should get an error about the missing table
- error_msg = str(exc_info.value).lower()
- # DuckDB should raise an error about the table not existing
-
- def test_invalid_literal_range_format(self, duckdb_engine_with_edge_case_data):
- """
- GIVEN a NEAREST query with invalid literal range format
- WHEN parsing/executing the query
- THEN should raise an error about invalid range format
- """
- engine = duckdb_engine_with_edge_case_data
-
- # Invalid range formats
- # Note: "chr1:1000" is valid (point format), so not included
- invalid_ranges = [
- "chr1:not-a-number", # Non-numeric coordinates
- "invalid-format", # No colon separator
- "chr1:2000-1000", # End before start (start >= end)
- ]
-
- for invalid_range in invalid_ranges:
- with pytest.raises(ValueError) as exc_info:
- engine.execute(f"""
- SELECT *
- FROM NEAREST(genes, reference='{invalid_range}', k=3)
- """)
-
- # Should get a ValueError about invalid range format
- error_msg = str(exc_info.value).lower()
- assert "invalid" in error_msg or "must be less" in error_msg, (
- f"Error message should mention invalid format or start/end issue: {exc_info.value}"
- )
-
- def test_nearest_with_additional_where_clause(
- self, duckdb_engine_with_edge_case_data
- ):
- """
- GIVEN a NEAREST query with additional WHERE clause filtering
- WHEN executing the query
- THEN should apply both NEAREST and WHERE filters
- """
- engine = duckdb_engine_with_edge_case_data
-
- cursor = engine.execute("""
- SELECT
- peaks.peak_id,
- nearest.gene_name,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=10) AS nearest
- WHERE peaks.peak_id = 1 AND nearest.distance < 600
- ORDER BY nearest.distance
- """)
-
- rows = cursor.fetchall()
-
- # Should find genes within 600bp of Peak 1
- # GENE_C overlaps (0bp) and GENE_A/GENE_B are 500bp away
- assert len(rows) >= 1, "Should find genes within 600bp"
-
- # All returned genes should have distance < 600
- for row in rows:
- assert row[2] < 600, f"All distances should be < 600bp, got {row[2]}"
-
- def test_nearest_with_cte(self, duckdb_engine_with_edge_case_data):
- """
- GIVEN a NEAREST query using a CTE for multiple query points
- WHEN executing the query
- THEN should correctly handle NEAREST within CTE
- """
- engine = duckdb_engine_with_edge_case_data
-
- cursor = engine.execute("""
- WITH selected_peaks AS (
- SELECT * FROM peaks WHERE peak_id IN (1, 3)
- )
- SELECT
- selected_peaks.peak_id,
- nearest.gene_name,
- nearest.distance
- FROM selected_peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=selected_peaks.interval, k=2) AS nearest
- ORDER BY selected_peaks.peak_id, nearest.distance
- """)
-
- rows = cursor.fetchall()
-
- # Should find 2 nearest genes for each of 2 peaks = up to 4 results
- assert len(rows) > 0, "Should find genes for peaks in CTE"
-
- # Check that we have results for both peaks
- peak_ids = set(row[0] for row in rows)
- assert 1 in peak_ids, "Should have results for peak 1"
- assert 3 in peak_ids, "Should have results for peak 3"
-
- def test_k_greater_than_total_features_all_chromosomes(
- self, duckdb_engine_with_edge_case_data
- ):
- """
- GIVEN k greater than total number of features on the same chromosome
- WHEN querying for nearest genes
- THEN should return all available features on that chromosome
- """
- engine = duckdb_engine_with_edge_case_data
-
- cursor = engine.execute("""
- SELECT
- peaks.peak_id,
- nearest.gene_name
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1000) AS nearest
- WHERE peaks.peak_id = 1
- """)
-
- rows = cursor.fetchall()
-
- # Peak 1 is on chr1, and there are 6 genes on chr1
- # Should return all 6 genes, not 1000
- assert len(rows) == 6, f"Should return all 6 genes on chr1, got {len(rows)}"
-
- def test_ties_with_k_greater_than_one(self, duckdb_engine_with_edge_case_data):
- """
- GIVEN multiple features at the same distance (ties)
- WHEN querying with k that includes tied features
- THEN should handle ties consistently
- """
- engine = duckdb_engine_with_edge_case_data
-
- cursor = engine.execute("""
- SELECT
- peaks.peak_id,
- nearest.gene_name,
- nearest.distance
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest
- WHERE peaks.peak_id = 1
- ORDER BY nearest.distance, nearest.gene_name
- """)
-
- rows = cursor.fetchall()
-
- # Peak 1 has:
- # - GENE_C at 0bp (overlap)
- # - GENE_A and GENE_B both at 500bp (tie)
- # With k=3, should get all 3
-
- assert len(rows) == 3, f"Should return 3 nearest genes, got {len(rows)}"
-
- # First should be GENE_C (distance 0)
- assert rows[0][1] == "GENE_C"
- assert rows[0][2] == 0
-
- # Next two should be GENE_A and GENE_B (distance 500, order may vary)
- gene_names_at_500 = [rows[1][1], rows[2][1]]
- assert set(gene_names_at_500) == {"GENE_A", "GENE_B"}, (
- f"Should have GENE_A and GENE_B at 500bp"
- )
- assert rows[1][2] == 500
- assert rows[2][2] == 500
-
-
-class TestNearestPropertyBased:
- """Property-based tests for NEAREST operator using Hypothesis."""
-
- @given(
- start1=st.integers(min_value=0, max_value=100000),
- length1=st.integers(min_value=1, max_value=1000),
- start2=st.integers(min_value=0, max_value=100000),
- length2=st.integers(min_value=1, max_value=1000),
- )
- def test_distance_non_negative_for_non_overlapping(
- self, start1, length1, start2, length2
- ):
- """
- PROPERTY: Distance between non-overlapping intervals is always non-negative
- GIVEN two non-overlapping genomic intervals
- WHEN calculating distance using NEAREST
- THEN distance should be >= 0
- """
- end1 = start1 + length1
- end2 = start2 + length2
-
- # Skip if intervals overlap
- assume(not (start1 < end2 and end1 > start2))
-
- engine = GIQLEngine(target_dialect="duckdb")
-
- # Create tables
- engine.conn.execute("""
- CREATE TABLE ref (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER)
- """)
- engine.conn.execute("""
- CREATE TABLE target (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER)
- """)
-
- # Insert test data
- engine.conn.execute(f"""
- INSERT INTO ref VALUES (1, 'chr1', {start1}, {end1})
- """)
- engine.conn.execute(f"""
- INSERT INTO target VALUES (1, 'chr1', {start2}, {end2})
- """)
-
- # Register schema
- engine.register_table_schema(
- "ref",
- {
- "id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "INTEGER",
- "end_pos": "INTEGER",
- },
- genomic_column="interval",
- )
- engine.register_table_schema(
- "target",
- {
- "id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "INTEGER",
- "end_pos": "INTEGER",
- },
- genomic_column="interval",
- )
-
- # Query for nearest
- cursor = engine.execute("""
- SELECT nearest.distance
- FROM ref
- CROSS JOIN LATERAL NEAREST(target, reference=ref.interval, k=1) AS nearest
- """)
-
- rows = cursor.fetchall()
- if len(rows) > 0:
- distance = rows[0][0]
- assert distance >= 0, f"Distance should be non-negative, got {distance}"
-
- @given(
- start1=st.integers(min_value=0, max_value=100000),
- length1=st.integers(min_value=1, max_value=1000),
- overlap_start=st.integers(min_value=1, max_value=500),
- )
- def test_overlapping_intervals_have_zero_distance(
- self, start1, length1, overlap_start
- ):
- """
- PROPERTY: Overlapping intervals have distance 0
- GIVEN two genomic intervals that overlap
- WHEN calculating distance using NEAREST
- THEN distance should be 0
- """
- end1 = start1 + length1
- # Create overlapping interval
- start2 = start1 + overlap_start
- end2 = start2 + length1
-
- # Ensure they actually overlap
- assume(start1 < end2 and end1 > start2)
-
- engine = GIQLEngine(target_dialect="duckdb")
-
- # Create tables
- engine.conn.execute("""
- CREATE TABLE ref (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER)
- """)
- engine.conn.execute("""
- CREATE TABLE target (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER)
- """)
-
- # Insert test data
- engine.conn.execute(f"""
- INSERT INTO ref VALUES (1, 'chr1', {start1}, {end1})
- """)
- engine.conn.execute(f"""
- INSERT INTO target VALUES (1, 'chr1', {start2}, {end2})
- """)
-
- # Register schema
- engine.register_table_schema(
- "ref",
- {
- "id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "INTEGER",
- "end_pos": "INTEGER",
- },
- genomic_column="interval",
- )
- engine.register_table_schema(
- "target",
- {
- "id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "INTEGER",
- "end_pos": "INTEGER",
- },
- genomic_column="interval",
- )
-
- # Query for nearest
- cursor = engine.execute("""
- SELECT nearest.distance
- FROM ref
- CROSS JOIN LATERAL NEAREST(target, reference=ref.interval, k=1) AS nearest
- """)
-
- rows = cursor.fetchall()
- assert len(rows) > 0, "Should find overlapping interval"
- distance = rows[0][0]
- assert distance == 0, (
- f"Overlapping intervals should have distance 0, got {distance}"
- )
-
- @given(
- k=st.integers(min_value=1, max_value=10),
- n_features=st.integers(min_value=0, max_value=15),
- )
- def test_k_parameter_returns_at_most_k_results(self, k, n_features):
- """
- PROPERTY: k parameter limits results to at most k features
- GIVEN k parameter and n available features
- WHEN querying for k nearest
- THEN should return min(k, n) results
- """
- engine = GIQLEngine(target_dialect="duckdb")
-
- # Create tables
- engine.conn.execute("""
- CREATE TABLE ref (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER)
- """)
- engine.conn.execute("""
- CREATE TABLE target (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER)
- """)
-
- # Insert reference point
- engine.conn.execute("""
- INSERT INTO ref VALUES (1, 'chr1', 1000, 1100)
- """)
-
- # Insert n_features target features
- for i in range(n_features):
- # Spread features out to avoid ties
- start = 2000 + (i * 500)
- end = start + 100
- engine.conn.execute(f"""
- INSERT INTO target VALUES ({i}, 'chr1', {start}, {end})
- """)
-
- # Register schema
- engine.register_table_schema(
- "ref",
- {
- "id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "INTEGER",
- "end_pos": "INTEGER",
- },
- genomic_column="interval",
- )
- engine.register_table_schema(
- "target",
- {
- "id": "INTEGER",
- "chromosome": "VARCHAR",
- "start_pos": "INTEGER",
- "end_pos": "INTEGER",
- },
- genomic_column="interval",
- )
-
- # Query for k nearest
- cursor = engine.execute(f"""
- SELECT COUNT(*)
- FROM ref
- CROSS JOIN LATERAL NEAREST(target, reference=ref.interval, k={k}) AS nearest
- """)
-
- rows = cursor.fetchall()
- count = rows[0][0]
-
- # Should return at most k results, but not more than available features
- expected_count = min(k, n_features)
- assert count == expected_count, (
- f"Expected {expected_count} results (min({k}, {n_features})), got {count}"
- )
diff --git a/tests/test_nearest_transpilation.py b/tests/test_nearest_transpilation.py
index 91618b6..de57c98 100644
--- a/tests/test_nearest_transpilation.py
+++ b/tests/test_nearest_transpilation.py
@@ -1,64 +1,34 @@
"""Transpilation tests for NEAREST operator SQL generation.
-Tests verify that NEAREST() is correctly transpiled to dialect-specific SQL
-(LATERAL joins for PostgreSQL/DuckDB, window functions for SQLite).
+Tests verify that NEAREST() is correctly transpiled to SQL
+(LATERAL joins for correlated queries, ORDER BY + LIMIT for standalone).
"""
import pytest
from sqlglot import parse_one
+from giql import Table
from giql.dialect import GIQLDialect
from giql.generators import BaseGIQLGenerator
-from giql.generators import GIQLDuckDBGenerator
-from giql.schema import ColumnInfo
-from giql.schema import SchemaInfo
-from giql.schema import TableSchema
+from giql.table import Tables
@pytest.fixture
-def schema_with_peaks_and_genes():
- """Schema info with peaks and genes tables."""
- schema = SchemaInfo()
+def tables_with_peaks_and_genes():
+ """Tables container with peaks and genes tables."""
+ tables = Tables()
+ tables.register("peaks", Table("peaks"))
+ tables.register("genes", Table("genes"))
+ return tables
- # Register peaks table
- peaks_table = TableSchema(name="peaks", columns={})
- peaks_table.columns["peak_id"] = ColumnInfo(name="peak_id", type="INTEGER")
- peaks_table.columns["interval"] = ColumnInfo(
- name="interval",
- type="VARCHAR",
- is_genomic=True,
- chrom_col="chromosome",
- start_col="start_pos",
- end_col="end_pos",
- strand_col="strand",
- )
- schema.tables["peaks"] = peaks_table
- # Register genes table
- genes_table = TableSchema(name="genes", columns={})
- genes_table.columns["gene_id"] = ColumnInfo(name="gene_id", type="INTEGER")
- genes_table.columns["name"] = ColumnInfo(name="name", type="VARCHAR")
- genes_table.columns["interval"] = ColumnInfo(
- name="interval",
- type="VARCHAR",
- is_genomic=True,
- chrom_col="chromosome",
- start_col="start_pos",
- end_col="end_pos",
- strand_col="strand",
- )
- schema.tables["genes"] = genes_table
+class TestNearestTranspilation:
+ """Tests for NEAREST transpilation to SQL."""
- return schema
-
-
-class TestNearestTranspilationDuckDB:
- """Tests for NEAREST transpilation to DuckDB SQL (LATERAL joins)."""
-
- def test_nearest_basic_k3_duckdb(self, schema_with_peaks_and_genes):
+ def test_nearest_basic_k3(self, tables_with_peaks_and_genes):
"""
GIVEN a GIQL query with NEAREST(genes, k=3)
- WHEN transpiling to DuckDB SQL
+ WHEN transpiling to SQL
THEN should generate LATERAL join with DISTANCE and LIMIT 3
"""
sql = """
@@ -68,7 +38,7 @@ def test_nearest_basic_k3_duckdb(self, schema_with_peaks_and_genes):
"""
ast = parse_one(sql, dialect=GIQLDialect)
- generator = GIQLDuckDBGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
output = generator.generate(ast)
# Expectations:
@@ -83,10 +53,10 @@ def test_nearest_basic_k3_duckdb(self, schema_with_peaks_and_genes):
assert "LIMIT 3" in output
assert "ORDER BY" in output
- def test_nearest_with_max_distance_duckdb(self, schema_with_peaks_and_genes):
+ def test_nearest_with_max_distance(self, tables_with_peaks_and_genes):
"""
GIVEN a GIQL query with NEAREST(genes, k=5, max_distance=100000)
- WHEN transpiling to DuckDB SQL
+ WHEN transpiling to SQL
THEN should generate LATERAL join with distance filter
"""
sql = """
@@ -96,7 +66,7 @@ def test_nearest_with_max_distance_duckdb(self, schema_with_peaks_and_genes):
"""
ast = parse_one(sql, dialect=GIQLDialect)
- generator = GIQLDuckDBGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
output = generator.generate(ast)
# Expectations:
@@ -107,10 +77,10 @@ def test_nearest_with_max_distance_duckdb(self, schema_with_peaks_and_genes):
assert "100000" in output
assert "LIMIT 5" in output
- def test_nearest_standalone_literal_duckdb(self, schema_with_peaks_and_genes):
+ def test_nearest_standalone_literal(self, tables_with_peaks_and_genes):
"""
GIVEN a GIQL query with literal reference NEAREST(genes, reference='chr1:1000-2000', k=3)
- WHEN transpiling to DuckDB SQL
+ WHEN transpiling to SQL
THEN should generate standalone query without LATERAL
"""
sql = """
@@ -119,7 +89,7 @@ def test_nearest_standalone_literal_duckdb(self, schema_with_peaks_and_genes):
"""
ast = parse_one(sql, dialect=GIQLDialect)
- generator = GIQLDuckDBGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
output = generator.generate(ast)
# Expectations:
@@ -131,10 +101,10 @@ def test_nearest_standalone_literal_duckdb(self, schema_with_peaks_and_genes):
assert "chr1" in output.lower()
assert "LIMIT 3" in output
- def test_nearest_with_stranded_duckdb(self, schema_with_peaks_and_genes):
+ def test_nearest_with_stranded(self, tables_with_peaks_and_genes):
"""
GIVEN a GIQL query with NEAREST(genes, k=3, stranded=true)
- WHEN transpiling to DuckDB SQL
+ WHEN transpiling to SQL
THEN should generate SQL with strand filtering
"""
sql = """
@@ -144,7 +114,7 @@ def test_nearest_with_stranded_duckdb(self, schema_with_peaks_and_genes):
"""
ast = parse_one(sql, dialect=GIQLDialect)
- generator = GIQLDuckDBGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
output = generator.generate(ast)
# Expectations:
@@ -155,10 +125,10 @@ def test_nearest_with_stranded_duckdb(self, schema_with_peaks_and_genes):
assert "strand" in output.lower()
assert "LIMIT 3" in output
- def test_nearest_with_signed_duckdb(self, schema_with_peaks_and_genes):
+ def test_nearest_with_signed(self, tables_with_peaks_and_genes):
"""
GIVEN a GIQL query with NEAREST(genes, k=3, signed=true)
- WHEN transpiling to DuckDB SQL
+ WHEN transpiling to SQL
THEN should generate SQL with signed distance column
(negative for upstream, positive for downstream)
"""
@@ -169,7 +139,7 @@ def test_nearest_with_signed_duckdb(self, schema_with_peaks_and_genes):
"""
ast = parse_one(sql, dialect=GIQLDialect)
- generator = GIQLDuckDBGenerator(schema_info=schema_with_peaks_and_genes)
+ generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
output = generator.generate(ast)
# Expectations:
@@ -183,114 +153,3 @@ def test_nearest_with_signed_duckdb(self, schema_with_peaks_and_genes):
assert "ELSE -(" in output, (
f"Expected signed distance with negation for upstream, got:\n{output}"
)
-
-
-# PostgreSQL uses same generator as base for now
-# class TestNearestTranspilationPostgreSQL:
-# """Tests for NEAREST transpilation to PostgreSQL SQL (LATERAL joins)."""
-# (Skipped - uses BaseGIQLGenerator for now)
-
-
-class TestNearestTranspilationSQLite:
- """Tests for NEAREST transpilation to SQLite SQL (using LATERAL for MVP)."""
-
- def test_nearest_basic_k3_sqlite(self, schema_with_peaks_and_genes):
- """
- GIVEN a GIQL query with NEAREST(genes, k=3)
- WHEN transpiling to SQLite SQL
- THEN should generate LATERAL subquery with ORDER BY and LIMIT
- (Note: Using LATERAL for MVP - window function optimization to be added later)
- """
- sql = """
- SELECT *
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3)
- """
-
- ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
- output = generator.generate(ast)
-
- # MVP expectations (LATERAL syntax):
- # - LATERAL subquery
- # - Distance calculation (CASE WHEN)
- # - ORDER BY distance
- # - LIMIT 3
- assert "LATERAL" in output.upper()
- assert "CASE" in output.upper()
- assert " AS distance" in output or " AS DISTANCE" in output
- assert "ORDER BY" in output.upper()
- assert "LIMIT 3" in output
-
- def test_nearest_with_max_distance_sqlite(self, schema_with_peaks_and_genes):
- """
- GIVEN a GIQL query with NEAREST(genes, k=5, max_distance=100000)
- WHEN transpiling to SQLite SQL
- THEN should generate LATERAL with distance filter
- (Note: Using LATERAL for MVP - window function optimization to be added later)
- """
- sql = """
- SELECT *
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=5, max_distance=100000)
- """
-
- ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
- output = generator.generate(ast)
-
- # MVP expectations (LATERAL syntax):
- # - LATERAL subquery
- # - Distance filter: <= 100000
- # - LIMIT 5
- assert "LATERAL" in output.upper()
- assert "100000" in output
- assert "LIMIT 5" in output
-
- def test_nearest_standalone_literal_sqlite(self, schema_with_peaks_and_genes):
- """
- GIVEN a GIQL query with literal reference NEAREST(genes, reference='chr1:1000-2000', k=3)
- WHEN transpiling to SQLite SQL
- THEN should generate standalone query without window functions
- """
- sql = """
- SELECT *
- FROM NEAREST(genes, reference='chr1:1000-2000', k=3)
- """
-
- ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
- output = generator.generate(ast)
-
- # Expectations:
- # - No CTE needed (standalone mode)
- # - Distance calculation with literal 'chr1', 1000, 2000
- # - ORDER BY distance
- # - LIMIT 3
- assert "chr1" in output.lower()
- assert "ORDER BY" in output.upper()
- assert "LIMIT 3" in output
-
- def test_nearest_with_stranded_sqlite(self, schema_with_peaks_and_genes):
- """
- GIVEN a GIQL query with NEAREST(genes, k=3, stranded=true)
- WHEN transpiling to SQLite SQL
- THEN should generate SQL with strand filtering
- """
- sql = """
- SELECT *
- FROM peaks
- CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3, stranded=true)
- """
-
- ast = parse_one(sql, dialect=GIQLDialect)
- generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
- output = generator.generate(ast)
-
- # Expectations:
- # - LATERAL subquery
- # - Strand filtering in WHERE clause
- # - LIMIT 3
- assert "LATERAL" in output.upper()
- assert "strand" in output.lower()
- assert "LIMIT 3" in output
diff --git a/tests/test_transpile.py b/tests/test_transpile.py
new file mode 100644
index 0000000..ea7ed8b
--- /dev/null
+++ b/tests/test_transpile.py
@@ -0,0 +1,412 @@
+"""Tests for the transpile() function."""
+
+import pytest
+
+import giql
+from giql import Table
+from giql import transpile
+
+
+class TestTranspileBasic:
+ """Tests for basic transpilation with string table names."""
+
+ def test_transpile_intersects_literal(self):
+ """
+ GIVEN a GIQL query with INTERSECTS and literal range
+ WHEN transpiling with string table name
+ THEN should return valid SQL with default column names
+ """
+ sql = transpile(
+ "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'",
+ tables=["peaks"],
+ )
+
+ assert "SELECT" in sql
+ assert "peaks" in sql
+ assert "chrom" in sql
+ assert "start" in sql
+ assert "end" in sql
+ assert "chr1" in sql
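+ # Illustrative sketch only (not asserted verbatim): with half-open
+ # intervals, the transpiled predicate is expected to resemble
+ # "chrom" = 'chr1' AND "start" < 2000 AND "end" > 1000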
+
+ def test_transpile_contains_literal(self):
+ """
+ GIVEN a GIQL query with CONTAINS and literal point
+ WHEN transpiling with string table name
+ THEN should return valid SQL for point containment
+ """
+ sql = transpile(
+ "SELECT * FROM peaks WHERE interval CONTAINS 'chr1:1500'",
+ tables=["peaks"],
+ )
+
+ assert "SELECT" in sql
+ assert "peaks" in sql
+ assert "1500" in sql
+
+ def test_transpile_within_literal(self):
+ """
+ GIVEN a GIQL query with WITHIN and literal range
+ WHEN transpiling with string table name
+ THEN should return valid SQL for interval within range
+ """
+ sql = transpile(
+ "SELECT * FROM peaks WHERE interval WITHIN 'chr1:1000-2000'",
+ tables=["peaks"],
+ )
+
+ assert "SELECT" in sql
+ assert "peaks" in sql
+
+ def test_transpile_no_tables(self):
+ """
+ GIVEN a GIQL query with INTERSECTS
+ WHEN transpiling with no tables parameter
+ THEN should return valid SQL with default column names
+ """
+ sql = transpile(
+ "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'",
+ )
+
+ assert "SELECT" in sql
+ assert "peaks" in sql
+
+
+class TestTranspileWithTableObjects:
+ """Tests for transpilation with Table objects."""
+
+ def test_transpile_custom_columns(self):
+ """
+ GIVEN a GIQL query with INTERSECTS
+ WHEN transpiling with custom column mappings
+ THEN should use custom column names in generated SQL
+ """
+ sql = transpile(
+ "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'",
+ tables=[
+ Table(
+ "peaks",
+ genomic_col="interval",
+ chrom_col="chromosome",
+ start_col="start_pos",
+ end_col="end_pos",
+ )
+ ],
+ )
+
+ assert "SELECT" in sql
+ assert "peaks" in sql
+ assert '"chromosome"' in sql
+ assert '"start_pos"' in sql
+ assert '"end_pos"' in sql
+ # Should NOT contain default column names
+ assert '"chrom"' not in sql
+ assert '"start"' not in sql
+ assert '"end"' not in sql
+
+ def test_transpile_no_strand_column(self):
+ """
+ GIVEN a Table with strand_col=None
+ WHEN transpiling a query
+ THEN should not require strand column
+ """
+ sql = transpile(
+ "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'",
+ tables=[Table("peaks", strand_col=None)],
+ )
+
+ assert "SELECT" in sql
+
+
+class TestTranspileMultipleTables:
+ """Tests for transpilation with multiple tables."""
+
+ def test_transpile_join_intersects(self):
+ """
+ GIVEN a GIQL query joining two tables with INTERSECTS
+ WHEN transpiling with both tables configured
+ THEN should generate correct join conditions
+ """
+ sql = transpile(
+ """
+ SELECT a.*, b.*
+ FROM peaks a
+ JOIN genes b ON a.interval INTERSECTS b.region
+ """,
+ tables=[
+ Table("peaks", genomic_col="interval"),
+ Table("genes", genomic_col="region"),
+ ],
+ )
+
+ assert "SELECT" in sql
+ assert "peaks" in sql
+ assert "genes" in sql
+ assert "JOIN" in sql.upper()
+
+ def test_transpile_different_schemas(self):
+ """
+ GIVEN two tables with different column schemas
+ WHEN transpiling a join query
+ THEN should use correct columns for each table
+ """
+ sql = transpile(
+ """
+ SELECT a.*, b.*
+ FROM peaks a
+ JOIN features b ON a.interval INTERSECTS b.location
+ """,
+ tables=[
+ Table(
+ "peaks",
+ genomic_col="interval",
+ chrom_col="chromosome",
+ start_col="start_pos",
+ end_col="end_pos",
+ ),
+ Table(
+ "features",
+ genomic_col="location",
+ chrom_col="seqname",
+ start_col="begin",
+ end_col="terminus",
+ ),
+ ],
+ )
+
+ assert "SELECT" in sql
+ # Both tables' column names should appear
+ assert "chromosome" in sql or "start_pos" in sql
+ assert "seqname" in sql or "begin" in sql or "terminus" in sql
+
+
+class TestTranspileSpatialOperators:
+ """Tests for all spatial operators."""
+
+ def test_intersects_any(self):
+ """
+ GIVEN a GIQL query with INTERSECTS ANY
+ WHEN transpiling
+ THEN should generate OR conditions for multiple ranges
+ """
+ sql = transpile(
+ """
+ SELECT * FROM peaks
+ WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:500-1000')
+ """,
+ tables=["peaks"],
+ )
+
+ assert "SELECT" in sql
+ assert "chr1" in sql
+ assert "chr2" in sql
+ assert " OR " in sql
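+ # Illustrative only: ANY is expected to expand to one overlap predicate
+ # per range, combined with OR.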
+
+ def test_intersects_all(self):
+ """
+ GIVEN a GIQL query with INTERSECTS ALL
+ WHEN transpiling
+ THEN should generate AND conditions for multiple ranges
+ """
+ sql = transpile(
+ """
+ SELECT * FROM peaks
+ WHERE interval INTERSECTS ALL('chr1:1000-2000', 'chr1:1500-2500')
+ """,
+ tables=["peaks"],
+ )
+
+ assert "SELECT" in sql
+ assert " AND " in sql
+
+
+class TestTranspileCluster:
+ """Tests for CLUSTER operation."""
+
+ def test_cluster_basic(self):
+ """
+ GIVEN a GIQL query with CLUSTER
+ WHEN transpiling
+ THEN should generate window function for clustering
+ """
+ sql = transpile(
+ """
+ SELECT *, CLUSTER(interval) AS cluster_id
+ FROM peaks
+ """,
+ tables=["peaks"],
+ )
+
+ assert "SELECT" in sql
+ assert "SUM" in sql.upper() or "LAG" in sql.upper()
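+ # Illustrative only: a common single-pass clustering pattern uses LAG()
+ # over (chrom, start) to flag new clusters, then a running SUM() of the
+ # flags as the cluster id; the assert accepts either building block.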
+
+ def test_cluster_with_distance(self):
+ """
+ GIVEN a GIQL query with CLUSTER and distance parameter
+ WHEN transpiling
+ THEN should include distance in clustering logic
+ """
+ sql = transpile(
+ """
+ SELECT *, CLUSTER(interval, 100) AS cluster_id
+ FROM peaks
+ """,
+ tables=["peaks"],
+ )
+
+ assert "SELECT" in sql
+ assert "100" in sql
+
+ def test_cluster_stranded(self):
+ """
+ GIVEN a GIQL query with stranded CLUSTER
+ WHEN transpiling
+ THEN should partition by strand
+ """
+ sql = transpile(
+ """
+ SELECT *, CLUSTER(interval, stranded=true) AS cluster_id
+ FROM peaks
+ """,
+ tables=["peaks"],
+ )
+
+ assert "SELECT" in sql
+ assert "strand" in sql.lower()
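+ # Illustrative only: stranded clustering is expected to add strand to
+ # the window's PARTITION BY, so clusters never span strands.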
+
+
+class TestTranspileMerge:
+ """Tests for MERGE operation."""
+
+ def test_merge_basic(self):
+ """
+ GIVEN a GIQL query with MERGE
+ WHEN transpiling
+ THEN should generate GROUP BY with MIN/MAX aggregation
+ """
+ sql = transpile(
+ "SELECT MERGE(interval) FROM peaks",
+ tables=["peaks"],
+ )
+
+ assert "SELECT" in sql
+ assert "MIN" in sql.upper()
+ assert "MAX" in sql.upper()
+ assert "GROUP BY" in sql.upper()
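+ # Illustrative only: merging typically groups rows by cluster id and
+ # emits MIN(start) / MAX(end) as the merged interval bounds.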
+
+ def test_merge_with_distance(self):
+ """
+ GIVEN a GIQL query with MERGE and distance parameter
+ WHEN transpiling
+ THEN should include distance in merge logic
+ """
+ sql = transpile(
+ "SELECT MERGE(interval, 100) FROM peaks",
+ tables=["peaks"],
+ )
+
+ assert "SELECT" in sql
+ assert "100" in sql
+
+ def test_merge_with_aggregation(self):
+ """
+ GIVEN a GIQL query with MERGE and additional aggregation
+ WHEN transpiling
+ THEN should include both merge and custom aggregation
+ """
+ sql = transpile(
+ "SELECT MERGE(interval), COUNT(*) as count FROM peaks",
+ tables=["peaks"],
+ )
+
+ assert "SELECT" in sql
+ assert "COUNT" in sql.upper()
+
+
+class TestTranspileNearest:
+ """Tests for NEAREST operation."""
+
+ def test_nearest_standalone(self):
+ """
+ GIVEN a GIQL query with standalone NEAREST
+ WHEN transpiling
+ THEN should generate a standalone query with ORDER BY and LIMIT
+ """
+ sql = transpile(
+ "SELECT * FROM NEAREST(genes, reference='chr1:1000-2000', k=3)",
+ tables=["genes"],
+ )
+
+ assert "SELECT" in sql
+ assert "ORDER BY" in sql.upper()
+ assert "LIMIT 3" in sql
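+ # Illustrative only: with a literal reference no LATERAL join is
+ # needed; the query computes a gap distance (0 for overlaps), orders
+ # by it, and applies LIMIT k.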
+
+ def test_nearest_with_max_distance(self):
+ """
+ GIVEN a GIQL query with NEAREST and max_distance
+ WHEN transpiling
+ THEN should include distance filter
+ """
+ sql = transpile(
+ """
+ SELECT * FROM NEAREST(genes, reference='chr1:1000-2000', k=5, max_distance=100000)
+ """,
+ tables=["genes"],
+ )
+
+ assert "SELECT" in sql
+ assert "100000" in sql
+ assert "LIMIT 5" in sql
+
+ def test_nearest_lateral(self):
+ """
+ GIVEN a GIQL query with NEAREST in LATERAL join
+ WHEN transpiling
+ THEN should generate LATERAL subquery
+ """
+ sql = transpile(
+ """
+ SELECT *
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3)
+ """,
+ tables=["peaks", "genes"],
+ )
+
+ assert "SELECT" in sql
+ assert "LATERAL" in sql.upper()
+ assert "LIMIT 3" in sql
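+ # Illustrative only: a column reference makes the query correlated, so
+ # it is expected to become a LATERAL subquery ordered by distance and
+ # limited to k per outer row.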
+
+
+class TestTranspileErrors:
+ """Tests for error handling."""
+
+ def test_invalid_syntax(self):
+ """
+ GIVEN an invalid GIQL query
+ WHEN transpiling
+ THEN should raise ValueError with parse error
+ """
+ with pytest.raises(ValueError, match="Parse error"):
+ transpile("SELECT * FORM peaks") # typo: FORM instead of FROM
+
+
+class TestModuleExports:
+ """Tests for module-level exports."""
+
+ def test_transpile_exported(self):
+ """
+ GIVEN the giql module
+ WHEN accessing transpile
+ THEN should be available at module level
+ """
+ assert hasattr(giql, "transpile")
+ assert callable(giql.transpile)
+
+ def test_table_exported(self):
+ """
+ GIVEN the giql module
+ WHEN accessing Table
+ THEN should be available at module level
+ """
+ assert hasattr(giql, "Table")
+ assert giql.Table is Table
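+
+
+# Minimal usage sketch, mirroring only the public API exercised above
+# (illustrative; the column names are examples, not required defaults):
+#
+#   from giql import Table, transpile
+#
+#   sql = transpile(
+#       "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'",
+#       tables=[Table("peaks", chrom_col="chromosome",
+#                     start_col="start_pos", end_col="end_pos")],
+#   )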