diff --git a/.coveragerc b/.coveragerc
index e68a7a0..78af58b 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -6,4 +6,4 @@ omit =
[report]
show_missing = true
precision = 2
-fail_under = 73.40
+fail_under = 70
diff --git a/.github/workflows/validate-pr.yaml b/.github/workflows/validate-pr.yaml
deleted file mode 100644
index 7ebceec..0000000
--- a/.github/workflows/validate-pr.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-name: Validate pull request
-
-on:
- pull_request:
- branches:
- - master
- - main
- - release
- types:
- - edited
- - opened
- - reopened
- - synchronize
- paths:
- - src/**
- - pyproject.toml
-
-jobs:
- merge-forbidden:
- name: Merge forbidden
- runs-on: ubuntu-latest
- if: ${{ github.base_ref == 'master' && github.head_ref != 'release' }}
- steps:
- - run: |
- echo "ERROR: Merging code changes directly into master forbidden."
- exit 1
diff --git a/demo.ipynb b/demo.ipynb
index 4ab1d96..e7f02c3 100644
--- a/demo.ipynb
+++ b/demo.ipynb
@@ -143,9 +143,9 @@
"# Create GIQL engine with DuckDB backend\n",
"engine = GIQLEngine(target_dialect=\"duckdb\", verbose=False)\n",
"\n",
- "# Register DataFrames\n",
- "engine.conn.register(\"features_a\", features_a.to_pandas())\n",
- "engine.conn.register(\"features_b\", features_b.to_pandas())\n",
+ "# Register DataFrames using Arrow (preferred for DuckDB)\n",
+ "engine.conn.register(\"features_a\", features_a.to_arrow())\n",
+ "engine.conn.register(\"features_b\", features_b.to_arrow())\n",
"\n",
"# Register schemas with genomic columns\n",
"schema_dict = {\n",
@@ -162,7 +162,7 @@
"}\n",
"\n",
"for table in [\"features_a\", \"features_b\"]:\n",
- " engine.register_table_schema(table, schema_dict, genomic_column=\"position\")\n",
+ " engine.register_table_schema(table, schema_dict, genomic_column=\"interval\")\n",
"\n",
"print(\"GIQL engine configured successfully\")"
]
@@ -177,7 +177,7 @@
"\n",
"Find features in A that **overlap** with features in B.\n",
"\n",
- "**GIQL Query**: `SELECT a.* FROM features_a a, features_b b WHERE a.position INTERSECTS b.position`"
+ "**GIQL Query**: `SELECT a.* FROM features_a a, features_b b WHERE a.interval INTERSECTS b.interval`"
]
},
{
@@ -239,133 +239,133 @@
"
\n",
" \n",
" | 0 | \n",
- " chr22 | \n",
- " 19953701 | \n",
- " 19953957 | \n",
+ " chr11 | \n",
+ " 34359322 | \n",
+ " 34359560 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 390.93901 | \n",
+ " 220.19056 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 131 | \n",
+ " 117 | \n",
"
\n",
" \n",
" | 1 | \n",
- " chr13 | \n",
- " 75549275 | \n",
- " 75549497 | \n",
+ " chr17 | \n",
+ " 18625675 | \n",
+ " 18625936 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 314.02895 | \n",
+ " 218.02293 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 111 | \n",
+ " 133 | \n",
"
\n",
" \n",
" | 2 | \n",
- " chr20 | \n",
- " 52937930 | \n",
- " 52938139 | \n",
+ " chr16 | \n",
+ " 67883801 | \n",
+ " 67884000 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 242.96853 | \n",
+ " 219.30369 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 110 | \n",
+ " 102 | \n",
"
\n",
" \n",
" | 3 | \n",
- " chr16 | \n",
- " 77584318 | \n",
- " 77584523 | \n",
+ " chr14 | \n",
+ " 94036611 | \n",
+ " 94036828 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 237.00123 | \n",
+ " 220.16568 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 107 | \n",
+ " 116 | \n",
"
\n",
" \n",
" | 4 | \n",
- " chr16 | \n",
- " 81403612 | \n",
- " 81403816 | \n",
+ " chr8 | \n",
+ " 110060410 | \n",
+ " 110060629 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 244.89039 | \n",
+ " 219.42024 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 103 | \n",
+ " 106 | \n",
"
\n",
" \n",
" | 5 | \n",
" chr9 | \n",
- " 97411765 | \n",
- " 97412044 | \n",
+ " 135967630 | \n",
+ " 135967829 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 328.74736 | \n",
+ " 218.66102 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 129 | \n",
+ " 92 | \n",
"
\n",
" \n",
" | 6 | \n",
- " chr9 | \n",
- " 137236518 | \n",
- " 137236806 | \n",
+ " chr14 | \n",
+ " 94136789 | \n",
+ " 94136976 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 323.80804 | \n",
+ " 218.30192 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 138 | \n",
+ " 99 | \n",
"
\n",
" \n",
" | 7 | \n",
" chr20 | \n",
- " 63537989 | \n",
- " 63538200 | \n",
+ " 24918423 | \n",
+ " 24918598 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 272.24062 | \n",
+ " 218.92314 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 98 | \n",
+ " 94 | \n",
"
\n",
" \n",
" | 8 | \n",
- " chr10 | \n",
- " 86377275 | \n",
- " 86377511 | \n",
+ " chr2 | \n",
+ " 202178449 | \n",
+ " 202178624 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 256.93284 | \n",
+ " 217.36688 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 125 | \n",
+ " 90 | \n",
"
\n",
" \n",
" | 9 | \n",
- " chr15 | \n",
- " 77633500 | \n",
- " 77633787 | \n",
+ " chr7 | \n",
+ " 120987697 | \n",
+ " 120987871 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 380.81693 | \n",
+ " 217.34347 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 140 | \n",
+ " 85 | \n",
"
\n",
" \n",
"\n",
@@ -373,28 +373,28 @@
],
"text/plain": [
" chromosome start_pos end_pos name score strand signal_value p_value \\\n",
- "0 chr22 19953701 19953957 . 1000 . 390.93901 -1.0 \n",
- "1 chr13 75549275 75549497 . 1000 . 314.02895 -1.0 \n",
- "2 chr20 52937930 52938139 . 1000 . 242.96853 -1.0 \n",
- "3 chr16 77584318 77584523 . 1000 . 237.00123 -1.0 \n",
- "4 chr16 81403612 81403816 . 1000 . 244.89039 -1.0 \n",
- "5 chr9 97411765 97412044 . 1000 . 328.74736 -1.0 \n",
- "6 chr9 137236518 137236806 . 1000 . 323.80804 -1.0 \n",
- "7 chr20 63537989 63538200 . 1000 . 272.24062 -1.0 \n",
- "8 chr10 86377275 86377511 . 1000 . 256.93284 -1.0 \n",
- "9 chr15 77633500 77633787 . 1000 . 380.81693 -1.0 \n",
+ "0 chr11 34359322 34359560 . 1000 . 220.19056 -1.0 \n",
+ "1 chr17 18625675 18625936 . 1000 . 218.02293 -1.0 \n",
+ "2 chr16 67883801 67884000 . 1000 . 219.30369 -1.0 \n",
+ "3 chr14 94036611 94036828 . 1000 . 220.16568 -1.0 \n",
+ "4 chr8 110060410 110060629 . 1000 . 219.42024 -1.0 \n",
+ "5 chr9 135967630 135967829 . 1000 . 218.66102 -1.0 \n",
+ "6 chr14 94136789 94136976 . 1000 . 218.30192 -1.0 \n",
+ "7 chr20 24918423 24918598 . 1000 . 218.92314 -1.0 \n",
+ "8 chr2 202178449 202178624 . 1000 . 217.36688 -1.0 \n",
+ "9 chr7 120987697 120987871 . 1000 . 217.34347 -1.0 \n",
"\n",
" q_value peak \n",
- "0 5.08726 131 \n",
- "1 5.08726 111 \n",
- "2 5.08726 110 \n",
- "3 5.08726 107 \n",
- "4 5.08726 103 \n",
- "5 5.08726 129 \n",
- "6 5.08726 138 \n",
- "7 5.08726 98 \n",
- "8 5.08726 125 \n",
- "9 5.08726 140 "
+ "0 5.08726 117 \n",
+ "1 5.08726 133 \n",
+ "2 5.08726 102 \n",
+ "3 5.08726 116 \n",
+ "4 5.08726 106 \n",
+ "5 5.08726 92 \n",
+ "6 5.08726 99 \n",
+ "7 5.08726 94 \n",
+ "8 5.08726 90 \n",
+ "9 5.08726 85 "
]
},
"execution_count": 4,
@@ -407,7 +407,7 @@
"giql_query = \"\"\"\n",
" SELECT DISTINCT a.*\n",
" FROM features_a a, features_b b\n",
- " WHERE a.position INTERSECTS b.position\n",
+ " WHERE a.interval INTERSECTS b.interval\n",
"\"\"\"\n",
"\n",
"# Transpile to SQL\n",
@@ -453,10 +453,12 @@
" a.end_pos,\n",
" a.signal_value,\n",
" COUNT(*) AS depth\n",
- " FROM features_b AS a\n",
- " INNER JOIN features_a AS b ON (a.\"chromosome\" = b.\"chromosome\"\n",
- " AND a.\"start_pos\" < b.\"end_pos\"\n",
- " AND a.\"end_pos\" > b.\"start_pos\")\n",
+ " FROM\n",
+ " features_a AS a,\n",
+ " features_b AS b\n",
+ " WHERE (a.\"chromosome\" = b.\"chromosome\"\n",
+ " AND a.\"start_pos\" < b.\"end_pos\"\n",
+ " AND a.\"end_pos\" > b.\"start_pos\")\n",
" GROUP BY\n",
" a.chromosome,\n",
" a.start_pos,\n",
@@ -464,13 +466,13 @@
" a.signal_value)\n",
"SELECT *\n",
"FROM overlap_counts\n",
- "WHERE depth >= 2\n",
+ "WHERE depth > 2\n",
"ORDER BY depth DESC\n",
"\n",
"================================================================================\n",
"\n",
- "Result: 109 intervals with overlap depth >= 5\n",
- "Max overlap depth: 2\n"
+ "Result: 2 intervals with overlap depth >= 5\n",
+ "Max overlap depth: 3\n"
]
},
{
@@ -504,83 +506,19 @@
" \n",
" \n",
" | 0 | \n",
- " chr2 | \n",
- " 219541562 | \n",
- " 219541786 | \n",
- " 66.90432 | \n",
- " 2 | \n",
+ " chr14 | \n",
+ " 105999349 | \n",
+ " 105999944 | \n",
+ " 240.51649 | \n",
+ " 3 | \n",
"
\n",
" \n",
" | 1 | \n",
- " chr17 | \n",
- " 45971661 | \n",
- " 45971971 | \n",
- " 18.47366 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " chr15 | \n",
- " 76059040 | \n",
- " 76059350 | \n",
- " 3.48873 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " chrX | \n",
- " 107272374 | \n",
- " 107272684 | \n",
- " 21.60152 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " chr6 | \n",
- " 132279569 | \n",
- " 132280035 | \n",
- " 151.44008 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " chr11 | \n",
- " 65932320 | \n",
- " 65932630 | \n",
- " 11.78241 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 6 | \n",
" chr14 | \n",
- " 105999441 | \n",
- " 105999603 | \n",
- " 37.33685 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 7 | \n",
- " chr2 | \n",
- " 170928770 | \n",
- " 170929192 | \n",
- " 167.70019 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 8 | \n",
- " chr7 | \n",
- " 47475079 | \n",
- " 47475389 | \n",
- " 17.00878 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 9 | \n",
- " chr2 | \n",
- " 176140701 | \n",
- " 176141011 | \n",
- " 5.54295 | \n",
- " 2 | \n",
+ " 105999349 | \n",
+ " 105999944 | \n",
+ " 108.06722 | \n",
+ " 3 | \n",
"
\n",
" \n",
"\n",
@@ -588,16 +526,8 @@
],
"text/plain": [
" chromosome start_pos end_pos signal_value depth\n",
- "0 chr2 219541562 219541786 66.90432 2\n",
- "1 chr17 45971661 45971971 18.47366 2\n",
- "2 chr15 76059040 76059350 3.48873 2\n",
- "3 chrX 107272374 107272684 21.60152 2\n",
- "4 chr6 132279569 132280035 151.44008 2\n",
- "5 chr11 65932320 65932630 11.78241 2\n",
- "6 chr14 105999441 105999603 37.33685 2\n",
- "7 chr2 170928770 170929192 167.70019 2\n",
- "8 chr7 47475079 47475389 17.00878 2\n",
- "9 chr2 176140701 176141011 5.54295 2"
+ "0 chr14 105999349 105999944 240.51649 3\n",
+ "1 chr14 105999349 105999944 108.06722 3"
]
},
"execution_count": 5,
@@ -615,13 +545,13 @@
" a.end_pos,\n",
" a.signal_value,\n",
" COUNT(*) as depth\n",
- " FROM features_b a\n",
- " INNER JOIN features_a b ON a.position INTERSECTS b.position\n",
+ " FROM features_a a, features_b b\n",
+ " WHERE a.interval INTERSECTS b.interval\n",
" GROUP BY a.chromosome, a.start_pos, a.end_pos, a.signal_value\n",
" )\n",
" SELECT *\n",
" FROM overlap_counts\n",
- " WHERE depth >= 2\n",
+ " WHERE depth > 2\n",
" ORDER BY depth DESC\n",
"\"\"\"\n",
"\n",
@@ -653,7 +583,7 @@
"\n",
"Find features in A that are **completely contained within** features in B.\n",
"\n",
- "**GIQL Query**: `SELECT a.* FROM features_a a, features_b b WHERE a.position WITHIN b.position`"
+ "**GIQL Query**: `SELECT a.* FROM features_a a, features_b b WHERE a.interval WITHIN b.interval`"
]
},
{
@@ -666,7 +596,9 @@
"output_type": "stream",
"text": [
"Transpiled SQL:\n",
- "SELECT DISTINCT a.*\n",
+ "SELECT DISTINCT\n",
+ " a.*,\n",
+ " b.*\n",
"FROM\n",
" features_a AS a,\n",
" features_b AS b\n",
@@ -676,7 +608,7 @@
"\n",
"================================================================================\n",
"\n",
- "Result: 30806 intervals from A contained within B\n"
+ "Result: 30809 intervals from A contained within B\n"
]
},
{
@@ -710,138 +642,248 @@
" p_value | \n",
" q_value | \n",
" peak | \n",
+ " chromosome_1 | \n",
+ " start_pos_1 | \n",
+ " end_pos_1 | \n",
+ " name_1 | \n",
+ " score_1 | \n",
+ " strand_1 | \n",
+ " signal_value_1 | \n",
+ " p_value_1 | \n",
+ " q_value_1 | \n",
+ " peak_1 | \n",
" \n",
" \n",
" \n",
" \n",
" | 0 | \n",
- " chr20 | \n",
- " 52937930 | \n",
- " 52938139 | \n",
+ " chr14 | \n",
+ " 24216011 | \n",
+ " 24216203 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 242.96853 | \n",
+ " 251.87452 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 110 | \n",
+ " 95 | \n",
+ " chr14 | \n",
+ " 24215977 | \n",
+ " 24216234 | \n",
+ " . | \n",
+ " 1000 | \n",
+ " . | \n",
+ " 209.90753 | \n",
+ " -1.0 | \n",
+ " 5.10518 | \n",
+ " 125 | \n",
"
\n",
" \n",
" | 1 | \n",
- " chr16 | \n",
- " 81403612 | \n",
- " 81403816 | \n",
+ " chr14 | \n",
+ " 33792757 | \n",
+ " 33792965 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 244.89039 | \n",
+ " 252.08812 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 103 | \n",
+ " 105 | \n",
+ " chr14 | \n",
+ " 33792735 | \n",
+ " 33793013 | \n",
+ " . | \n",
+ " 1000 | \n",
+ " . | \n",
+ " 232.27970 | \n",
+ " -1.0 | \n",
+ " 5.10518 | \n",
+ " 134 | \n",
"
\n",
" \n",
" | 2 | \n",
- " chr20 | \n",
- " 3250135 | \n",
- " 3250335 | \n",
+ " chr6 | \n",
+ " 43769540 | \n",
+ " 43769748 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 274.74729 | \n",
+ " 250.99653 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 95 | \n",
+ " 104 | \n",
+ " chr6 | \n",
+ " 43769499 | \n",
+ " 43769760 | \n",
+ " . | \n",
+ " 1000 | \n",
+ " . | \n",
+ " 154.83893 | \n",
+ " -1.0 | \n",
+ " 5.10518 | \n",
+ " 141 | \n",
"
\n",
" \n",
" | 3 | \n",
- " chr9 | \n",
- " 92634770 | \n",
- " 92634990 | \n",
+ " chr17 | \n",
+ " 49861200 | \n",
+ " 49861405 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 313.09355 | \n",
+ " 250.16995 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 113 | \n",
+ " 93 | \n",
+ " chr17 | \n",
+ " 49861195 | \n",
+ " 49861409 | \n",
+ " . | \n",
+ " 1000 | \n",
+ " . | \n",
+ " 78.03419 | \n",
+ " -1.0 | \n",
+ " 5.10518 | \n",
+ " 86 | \n",
"
\n",
" \n",
" | 4 | \n",
- " chr9 | \n",
- " 33722162 | \n",
- " 33722428 | \n",
+ " chr2 | \n",
+ " 219476345 | \n",
+ " 219476531 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 391.35401 | \n",
+ " 248.18364 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 132 | \n",
+ " 102 | \n",
+ " chr2 | \n",
+ " 219476292 | \n",
+ " 219476557 | \n",
+ " . | \n",
+ " 1000 | \n",
+ " . | \n",
+ " 160.44163 | \n",
+ " -1.0 | \n",
+ " 5.10518 | \n",
+ " 125 | \n",
"
\n",
" \n",
" | 5 | \n",
- " chr8 | \n",
- " 22909069 | \n",
- " 22909276 | \n",
+ " chr11 | \n",
+ " 57527230 | \n",
+ " 57527419 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 245.13028 | \n",
+ " 247.38897 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 107 | \n",
+ " 104 | \n",
+ " chr11 | \n",
+ " 57527209 | \n",
+ " 57527456 | \n",
+ " . | \n",
+ " 1000 | \n",
+ " . | \n",
+ " 121.89660 | \n",
+ " -1.0 | \n",
+ " 5.10518 | \n",
+ " 134 | \n",
"
\n",
" \n",
" | 6 | \n",
- " chr4 | \n",
- " 182925334 | \n",
- " 182925586 | \n",
+ " chr2 | \n",
+ " 152025859 | \n",
+ " 152026122 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 284.15090 | \n",
+ " 247.10786 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 135 | \n",
+ " 146 | \n",
+ " chr2 | \n",
+ " 152025676 | \n",
+ " 152026161 | \n",
+ " . | \n",
+ " 1000 | \n",
+ " . | \n",
+ " 181.47229 | \n",
+ " -1.0 | \n",
+ " 5.10518 | \n",
+ " 304 | \n",
"
\n",
" \n",
" | 7 | \n",
- " chr17 | \n",
- " 51322469 | \n",
- " 51322723 | \n",
+ " chr2 | \n",
+ " 112259523 | \n",
+ " 112259722 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 383.45857 | \n",
+ " 249.77072 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 127 | \n",
+ " 95 | \n",
+ " chr2 | \n",
+ " 112259464 | \n",
+ " 112259743 | \n",
+ " . | \n",
+ " 1000 | \n",
+ " . | \n",
+ " 219.16901 | \n",
+ " -1.0 | \n",
+ " 5.10518 | \n",
+ " 141 | \n",
"
\n",
" \n",
" | 8 | \n",
- " chr5 | \n",
- " 43627096 | \n",
- " 43627315 | \n",
+ " chr21 | \n",
+ " 43810840 | \n",
+ " 43811034 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 304.51285 | \n",
+ " 201.96817 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 115 | \n",
+ " 102 | \n",
+ " chr21 | \n",
+ " 43810814 | \n",
+ " 43811039 | \n",
+ " . | \n",
+ " 1000 | \n",
+ " . | \n",
+ " 98.62279 | \n",
+ " -1.0 | \n",
+ " 5.10518 | \n",
+ " 131 | \n",
"
\n",
" \n",
" | 9 | \n",
- " chr11 | \n",
- " 2419724 | \n",
- " 2419926 | \n",
+ " chr17 | \n",
+ " 44331388 | \n",
+ " 44331569 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 247.25473 | \n",
+ " 200.30254 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 104 | \n",
+ " 100 | \n",
+ " chr17 | \n",
+ " 44331387 | \n",
+ " 44331617 | \n",
+ " . | \n",
+ " 1000 | \n",
+ " . | \n",
+ " 100.15563 | \n",
+ " -1.0 | \n",
+ " 5.10518 | \n",
+ " 116 | \n",
"
\n",
" \n",
"\n",
@@ -849,28 +891,40 @@
],
"text/plain": [
" chromosome start_pos end_pos name score strand signal_value p_value \\\n",
- "0 chr20 52937930 52938139 . 1000 . 242.96853 -1.0 \n",
- "1 chr16 81403612 81403816 . 1000 . 244.89039 -1.0 \n",
- "2 chr20 3250135 3250335 . 1000 . 274.74729 -1.0 \n",
- "3 chr9 92634770 92634990 . 1000 . 313.09355 -1.0 \n",
- "4 chr9 33722162 33722428 . 1000 . 391.35401 -1.0 \n",
- "5 chr8 22909069 22909276 . 1000 . 245.13028 -1.0 \n",
- "6 chr4 182925334 182925586 . 1000 . 284.15090 -1.0 \n",
- "7 chr17 51322469 51322723 . 1000 . 383.45857 -1.0 \n",
- "8 chr5 43627096 43627315 . 1000 . 304.51285 -1.0 \n",
- "9 chr11 2419724 2419926 . 1000 . 247.25473 -1.0 \n",
+ "0 chr14 24216011 24216203 . 1000 . 251.87452 -1.0 \n",
+ "1 chr14 33792757 33792965 . 1000 . 252.08812 -1.0 \n",
+ "2 chr6 43769540 43769748 . 1000 . 250.99653 -1.0 \n",
+ "3 chr17 49861200 49861405 . 1000 . 250.16995 -1.0 \n",
+ "4 chr2 219476345 219476531 . 1000 . 248.18364 -1.0 \n",
+ "5 chr11 57527230 57527419 . 1000 . 247.38897 -1.0 \n",
+ "6 chr2 152025859 152026122 . 1000 . 247.10786 -1.0 \n",
+ "7 chr2 112259523 112259722 . 1000 . 249.77072 -1.0 \n",
+ "8 chr21 43810840 43811034 . 1000 . 201.96817 -1.0 \n",
+ "9 chr17 44331388 44331569 . 1000 . 200.30254 -1.0 \n",
"\n",
- " q_value peak \n",
- "0 5.08726 110 \n",
- "1 5.08726 103 \n",
- "2 5.08726 95 \n",
- "3 5.08726 113 \n",
- "4 5.08726 132 \n",
- "5 5.08726 107 \n",
- "6 5.08726 135 \n",
- "7 5.08726 127 \n",
- "8 5.08726 115 \n",
- "9 5.08726 104 "
+ " q_value peak chromosome_1 start_pos_1 end_pos_1 name_1 score_1 \\\n",
+ "0 5.08726 95 chr14 24215977 24216234 . 1000 \n",
+ "1 5.08726 105 chr14 33792735 33793013 . 1000 \n",
+ "2 5.08726 104 chr6 43769499 43769760 . 1000 \n",
+ "3 5.08726 93 chr17 49861195 49861409 . 1000 \n",
+ "4 5.08726 102 chr2 219476292 219476557 . 1000 \n",
+ "5 5.08726 104 chr11 57527209 57527456 . 1000 \n",
+ "6 5.08726 146 chr2 152025676 152026161 . 1000 \n",
+ "7 5.08726 95 chr2 112259464 112259743 . 1000 \n",
+ "8 5.08726 102 chr21 43810814 43811039 . 1000 \n",
+ "9 5.08726 100 chr17 44331387 44331617 . 1000 \n",
+ "\n",
+ " strand_1 signal_value_1 p_value_1 q_value_1 peak_1 \n",
+ "0 . 209.90753 -1.0 5.10518 125 \n",
+ "1 . 232.27970 -1.0 5.10518 134 \n",
+ "2 . 154.83893 -1.0 5.10518 141 \n",
+ "3 . 78.03419 -1.0 5.10518 86 \n",
+ "4 . 160.44163 -1.0 5.10518 125 \n",
+ "5 . 121.89660 -1.0 5.10518 134 \n",
+ "6 . 181.47229 -1.0 5.10518 304 \n",
+ "7 . 219.16901 -1.0 5.10518 141 \n",
+ "8 . 98.62279 -1.0 5.10518 131 \n",
+ "9 . 100.15563 -1.0 5.10518 116 "
]
},
"execution_count": 6,
@@ -881,9 +935,9 @@
"source": [
"# Define GIQL query\n",
"giql_query = \"\"\"\n",
- " SELECT DISTINCT a.*\n",
+ " SELECT DISTINCT a.*, b.*\n",
" FROM features_a a, features_b b\n",
- " WHERE a.position WITHIN b.position\n",
+ " WHERE a.interval WITHIN b.interval\n",
"\"\"\"\n",
"\n",
"# Transpile to SQL\n",
@@ -913,7 +967,7 @@
"\n",
"Find features in A that **completely contain** features in B.\n",
"\n",
- "**GIQL Query**: `SELECT a.* FROM features_a a, features_b b WHERE a.position CONTAINS b.position`"
+ "**GIQL Query**: `SELECT a.* FROM features_a a, features_b b WHERE a.interval CONTAINS b.interval`"
]
},
{
@@ -975,162 +1029,162 @@
" \n",
" \n",
" | 0 | \n",
- " chr13 | \n",
- " 48095092 | \n",
- " 48095340 | \n",
+ " chr4 | \n",
+ " 102431434 | \n",
+ " 102431659 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 376.88412 | \n",
+ " 299.13707 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 119 | \n",
+ " 111 | \n",
"
\n",
" \n",
" | 1 | \n",
- " chr20 | \n",
- " 12838346 | \n",
- " 12838555 | \n",
+ " chr14 | \n",
+ " 49586912 | \n",
+ " 49587154 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 275.52141 | \n",
+ " 303.26119 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 98 | \n",
+ " 126 | \n",
"
\n",
" \n",
" | 2 | \n",
- " chrX | \n",
- " 11533197 | \n",
- " 11533459 | \n",
+ " chr5 | \n",
+ " 134039564 | \n",
+ " 134039867 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 233.20172 | \n",
+ " 317.39854 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 102 | \n",
+ " 157 | \n",
"
\n",
" \n",
" | 3 | \n",
- " chr20 | \n",
- " 59222580 | \n",
- " 59222781 | \n",
+ " chr18 | \n",
+ " 77900138 | \n",
+ " 77900404 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 248.00196 | \n",
+ " 314.56958 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 108 | \n",
+ " 127 | \n",
"
\n",
" \n",
" | 4 | \n",
- " chr17 | \n",
- " 29180317 | \n",
- " 29180565 | \n",
+ " chrX | \n",
+ " 123960372 | \n",
+ " 123960620 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 323.60178 | \n",
+ " 291.07538 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 125 | \n",
+ " 123 | \n",
"
\n",
" \n",
" | 5 | \n",
- " chr20 | \n",
- " 39050143 | \n",
- " 39050364 | \n",
+ " chr1 | \n",
+ " 157311233 | \n",
+ " 157311386 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 286.34142 | \n",
+ " 188.57350 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 110 | \n",
+ " 80 | \n",
"
\n",
" \n",
" | 6 | \n",
- " chr17 | \n",
- " 35764130 | \n",
- " 35764337 | \n",
+ " chr21 | \n",
+ " 32403252 | \n",
+ " 32403390 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 262.41330 | \n",
+ " 132.27311 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 105 | \n",
+ " 80 | \n",
"
\n",
" \n",
" | 7 | \n",
- " chrX | \n",
- " 9995826 | \n",
- " 9996089 | \n",
+ " chr14 | \n",
+ " 104564131 | \n",
+ " 104564266 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 367.15975 | \n",
+ " 131.97784 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 126 | \n",
+ " 68 | \n",
"
\n",
" \n",
" | 8 | \n",
- " chrX | \n",
- " 12946826 | \n",
- " 12947077 | \n",
+ " chr10 | \n",
+ " 132204836 | \n",
+ " 132204945 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 371.63833 | \n",
+ " 114.77888 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 126 | \n",
+ " 59 | \n",
"
\n",
" \n",
" | 9 | \n",
- " chr17 | \n",
- " 29148927 | \n",
- " 29149149 | \n",
+ " chrX | \n",
+ " 54453024 | \n",
+ " 54453130 | \n",
" . | \n",
" 1000 | \n",
" . | \n",
- " 233.84271 | \n",
+ " 102.02848 | \n",
" -1.0 | \n",
" 5.08726 | \n",
- " 107 | \n",
+ " 65 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " chromosome start_pos end_pos name score strand signal_value p_value \\\n",
- "0 chr13 48095092 48095340 . 1000 . 376.88412 -1.0 \n",
- "1 chr20 12838346 12838555 . 1000 . 275.52141 -1.0 \n",
- "2 chrX 11533197 11533459 . 1000 . 233.20172 -1.0 \n",
- "3 chr20 59222580 59222781 . 1000 . 248.00196 -1.0 \n",
- "4 chr17 29180317 29180565 . 1000 . 323.60178 -1.0 \n",
- "5 chr20 39050143 39050364 . 1000 . 286.34142 -1.0 \n",
- "6 chr17 35764130 35764337 . 1000 . 262.41330 -1.0 \n",
- "7 chrX 9995826 9996089 . 1000 . 367.15975 -1.0 \n",
- "8 chrX 12946826 12947077 . 1000 . 371.63833 -1.0 \n",
- "9 chr17 29148927 29149149 . 1000 . 233.84271 -1.0 \n",
+ " chromosome start_pos end_pos name score strand signal_value p_value \\\n",
+ "0 chr4 102431434 102431659 . 1000 . 299.13707 -1.0 \n",
+ "1 chr14 49586912 49587154 . 1000 . 303.26119 -1.0 \n",
+ "2 chr5 134039564 134039867 . 1000 . 317.39854 -1.0 \n",
+ "3 chr18 77900138 77900404 . 1000 . 314.56958 -1.0 \n",
+ "4 chrX 123960372 123960620 . 1000 . 291.07538 -1.0 \n",
+ "5 chr1 157311233 157311386 . 1000 . 188.57350 -1.0 \n",
+ "6 chr21 32403252 32403390 . 1000 . 132.27311 -1.0 \n",
+ "7 chr14 104564131 104564266 . 1000 . 131.97784 -1.0 \n",
+ "8 chr10 132204836 132204945 . 1000 . 114.77888 -1.0 \n",
+ "9 chrX 54453024 54453130 . 1000 . 102.02848 -1.0 \n",
"\n",
" q_value peak \n",
- "0 5.08726 119 \n",
- "1 5.08726 98 \n",
- "2 5.08726 102 \n",
- "3 5.08726 108 \n",
- "4 5.08726 125 \n",
- "5 5.08726 110 \n",
- "6 5.08726 105 \n",
- "7 5.08726 126 \n",
- "8 5.08726 126 \n",
- "9 5.08726 107 "
+ "0 5.08726 111 \n",
+ "1 5.08726 126 \n",
+ "2 5.08726 157 \n",
+ "3 5.08726 127 \n",
+ "4 5.08726 123 \n",
+ "5 5.08726 80 \n",
+ "6 5.08726 80 \n",
+ "7 5.08726 68 \n",
+ "8 5.08726 59 \n",
+ "9 5.08726 65 "
]
},
"execution_count": 7,
@@ -1143,7 +1197,7 @@
"giql_query = \"\"\"\n",
" SELECT DISTINCT a.*\n",
" FROM features_a a, features_b b\n",
- " WHERE a.position CONTAINS b.position\n",
+ " WHERE a.interval CONTAINS b.interval\n",
"\"\"\"\n",
"\n",
"# Transpile to SQL\n",
@@ -1175,7 +1229,7 @@
"\n",
"Similar to `bedtools merge`, this collapses overlapping genomic intervals.\n",
"\n",
- "**GIQL Query**: `SELECT MERGE(position) FROM features_a`"
+ "**GIQL Query**: `SELECT MERGE(interval) FROM features_a`"
]
},
{
@@ -1331,7 +1385,7 @@
"source": [
"# Define GIQL query - basic merge\n",
"giql_query = \"\"\"\n",
- " SELECT MERGE(position)\n",
+ " SELECT MERGE(interval)\n",
" FROM features_b\n",
"\"\"\"\n",
"\n",
@@ -1403,7 +1457,13 @@
"WHERE interval_count > 1\n",
"\n",
"================================================================================\n",
- "\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
"Result: 559 merged intervals with more than 1 original interval\n"
]
},
@@ -1544,7 +1604,7 @@
"giql_query = \"\"\"\n",
" WITH merged_intervals AS (\n",
" SELECT\n",
- " MERGE(position),\n",
+ " MERGE(interval),\n",
" COUNT(*) as interval_count,\n",
" AVG(signal_value) as avg_signal\n",
" FROM features_b\n",
@@ -1734,7 +1794,7 @@
"source": [
"# Define GIQL query - merge with distance parameter\n",
"giql_query = \"\"\"\n",
- " SELECT MERGE(position, 10000)\n",
+ " SELECT MERGE(interval, 10000)\n",
" FROM features_b\n",
"\"\"\"\n",
"\n",
@@ -1768,7 +1828,7 @@
"\n",
"Similar to `bedtools cluster`, this groups overlapping genomic intervals.\n",
"\n",
- "**GIQL Query**: `SELECT *, CLUSTER(position) AS cluster_id FROM features_a`"
+ "**GIQL Query**: `SELECT *, CLUSTER(interval) AS cluster_id FROM features_a`"
]
},
{
@@ -1950,7 +2010,7 @@
" start_pos,\n",
" end_pos,\n",
" signal_value,\n",
- " CLUSTER(position) AS cluster_id\n",
+ " CLUSTER(interval) AS cluster_id\n",
" FROM features_a\n",
" ORDER BY chromosome, start_pos\n",
"\"\"\"\n",
@@ -2007,7 +2067,7 @@
" signal_value,\n",
" CASE\n",
" WHEN LAG(\"end_pos\") OVER (PARTITION BY \"chromosome\"\n",
- " ORDER BY \"start_pos\" NULLS LAST) + 10000 >= \"start_pos\" THEN 0\n",
+ " ORDER BY \"start_pos\" NULLS LAST) + 50000 >= \"start_pos\" THEN 0\n",
" ELSE 1\n",
" END AS is_new_cluster\n",
" FROM features_a) AS lag_calc\n",
@@ -2018,7 +2078,7 @@
"================================================================================\n",
"\n",
"Result: 60893 intervals with cluster assignments (1000bp distance)\n",
- "Number of unique clusters: 3692\n"
+ "Number of unique clusters: 1376\n"
]
},
{
@@ -2096,7 +2156,7 @@
" 778812 | \n",
" 779008 | \n",
" 56.97663 | \n",
- " 5.0 | \n",
+ " 4.0 | \n",
" \n",
" \n",
" | 6 | \n",
@@ -2104,7 +2164,7 @@
" 850473 | \n",
" 850669 | \n",
" 27.08243 | \n",
- " 6.0 | \n",
+ " 5.0 | \n",
"
\n",
" \n",
" | 7 | \n",
@@ -2112,7 +2172,7 @@
" 858056 | \n",
" 858252 | \n",
" 15.55869 | \n",
- " 6.0 | \n",
+ " 5.0 | \n",
"
\n",
" \n",
" | 8 | \n",
@@ -2120,7 +2180,7 @@
" 869860 | \n",
" 869991 | \n",
" 168.87294 | \n",
- " 7.0 | \n",
+ " 5.0 | \n",
"
\n",
" \n",
" | 9 | \n",
@@ -2128,7 +2188,7 @@
" 904689 | \n",
" 904883 | \n",
" 167.56897 | \n",
- " 8.0 | \n",
+ " 5.0 | \n",
"
\n",
" \n",
"\n",
@@ -2141,11 +2201,11 @@
"2 chr1 267909 268105 88.00863 2.0\n",
"3 chr1 586106 586302 35.02027 3.0\n",
"4 chr1 729261 729457 23.29415 4.0\n",
- "5 chr1 778812 779008 56.97663 5.0\n",
- "6 chr1 850473 850669 27.08243 6.0\n",
- "7 chr1 858056 858252 15.55869 6.0\n",
- "8 chr1 869860 869991 168.87294 7.0\n",
- "9 chr1 904689 904883 167.56897 8.0"
+ "5 chr1 778812 779008 56.97663 4.0\n",
+ "6 chr1 850473 850669 27.08243 5.0\n",
+ "7 chr1 858056 858252 15.55869 5.0\n",
+ "8 chr1 869860 869991 168.87294 5.0\n",
+ "9 chr1 904689 904883 167.56897 5.0"
]
},
"execution_count": 12,
@@ -2161,7 +2221,7 @@
" start_pos,\n",
" end_pos,\n",
" signal_value,\n",
- " CLUSTER(position, 10000) AS cluster_id\n",
+ " CLUSTER(interval, 50000) AS cluster_id\n",
" FROM features_a\n",
" ORDER BY chromosome, start_pos\n",
"\"\"\"\n",
@@ -2196,7 +2256,7 @@
"\n",
"Similar to `bedtools closest -d`, this calculates the distance in base pairs between genomic intervals.\n",
"\n",
- "**GIQL Query**: `SELECT a.*, b.*, DISTANCE(a.position, b.position) AS distance FROM features_a a, features_b b`"
+ "**GIQL Query**: `SELECT a.*, b.*, DISTANCE(a.interval, b.interval) AS distance FROM features_a a, features_b b`"
]
},
{
@@ -2351,10 +2411,10 @@
" a.end_pos AS a_end,\n",
" b.start_pos AS b_start,\n",
" b.end_pos AS b_end,\n",
- " DISTANCE(a.position, b.position, signed=true) AS signed_distance,\n",
+ " DISTANCE(a.interval, b.interval, signed=true) AS signed_distance,\n",
" ROW_NUMBER() OVER (\n",
" PARTITION BY a.chromosome, a.start_pos, a.end_pos\n",
- " ORDER BY ABS(DISTANCE(a.position, b.position, signed=true))\n",
+ " ORDER BY ABS(DISTANCE(a.interval, b.interval, signed=true))\n",
" ) AS rank\n",
" FROM features_a a\n",
" CROSS JOIN features_b b\n",
@@ -2431,8 +2491,9 @@
" WHEN a.\"end_pos\" <= b.\"start_pos\" THEN (b.\"start_pos\" - a.\"end_pos\")\n",
" ELSE (a.\"start_pos\" - b.\"end_pos\")\n",
" END AS distance\n",
- "FROM features_a AS a\n",
- "CROSS JOIN features_b AS b\n",
+ "FROM\n",
+ " features_a AS a,\n",
+ " features_b AS b\n",
"WHERE a.chromosome = b.chromosome\n",
" AND a.chromosome = 'chr1'\n",
" AND a.start_pos BETWEEN 180000 AND 190000\n",
@@ -2522,9 +2583,8 @@
" a.end_pos AS a_end,\n",
" b.start_pos AS b_start,\n",
" b.end_pos AS b_end,\n",
- " DISTANCE(a.position, b.position) AS distance\n",
- " FROM features_a a\n",
- " CROSS JOIN features_b b\n",
+ " DISTANCE(a.interval, b.interval) AS distance\n",
+ " FROM features_a a, features_b b\n",
" WHERE a.chromosome = b.chromosome \n",
" AND a.chromosome = 'chr1'\n",
" AND a.start_pos BETWEEN 180000 AND 190000\n",
@@ -2567,7 +2627,7 @@
"- Directional queries with `signed=true` (upstream vs downstream)\n",
"- Distance-constrained queries with `max_distance`\n",
"- Strand-specific queries with `stranded=true`\n",
- "- Implicit reference resolution in LATERAL joins (automatically uses outer table's position)\n",
+ "- Implicit reference resolution in LATERAL joins (automatically uses outer table's interval)\n",
"\n",
"**Database Support**:\n",
"- **DuckDB**: Full support (LATERAL joins)\n",
@@ -2798,7 +2858,7 @@
"source": [
"# Define GIQL query - find closest feature in B for each feature in A\n",
"# This replicates bedtools closest -d functionality using NEAREST(target, k=1)\n",
- "# The reference position is automatically inferred from the outer table (features_a)\n",
+ "# The reference interval is automatically inferred from the outer table (features_a)\n",
"giql_query = \"\"\"\n",
" SELECT \n",
" a.chromosome,\n",
@@ -3148,6 +3208,7 @@
" LIMIT 3) AS b\n",
"WHERE a.chromosome = 'chr1'\n",
" AND a.start_pos < 500000\n",
+ " AND direction = 'upstream'\n",
"ORDER BY\n",
" a.chromosome,\n",
" a.start_pos,\n",
@@ -3155,10 +3216,10 @@
"\n",
"================================================================================\n",
"\n",
- "Result: 9 total nearest features\n",
+ "Result: 0 total nearest features\n",
"Upstream features (distance < 0): 0\n",
- "Downstream features (distance > 0): 7\n",
- "Overlapping features (distance = 0): 2\n"
+ "Downstream features (distance > 0): 0\n",
+ "Overlapping features (distance = 0): 0\n"
]
},
{
@@ -3192,111 +3253,14 @@
" \n",
" \n",
" \n",
- " \n",
- " | 0 | \n",
- " chr1 | \n",
- " 181368 | \n",
- " 181564 | \n",
- " 186654 | \n",
- " 186964 | \n",
- " 5090 | \n",
- " downstream | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " chr1 | \n",
- " 181368 | \n",
- " 181564 | \n",
- " 267979 | \n",
- " 268128 | \n",
- " 86415 | \n",
- " downstream | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " chr1 | \n",
- " 181368 | \n",
- " 181564 | \n",
- " 850424 | \n",
- " 850734 | \n",
- " 668860 | \n",
- " downstream | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " chr1 | \n",
- " 186650 | \n",
- " 186846 | \n",
- " 186654 | \n",
- " 186964 | \n",
- " 0 | \n",
- " overlap | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " chr1 | \n",
- " 186650 | \n",
- " 186846 | \n",
- " 267979 | \n",
- " 268128 | \n",
- " 81133 | \n",
- " downstream | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " chr1 | \n",
- " 186650 | \n",
- " 186846 | \n",
- " 850424 | \n",
- " 850734 | \n",
- " 663578 | \n",
- " downstream | \n",
- "
\n",
- " \n",
- " | 6 | \n",
- " chr1 | \n",
- " 267909 | \n",
- " 268105 | \n",
- " 267979 | \n",
- " 268128 | \n",
- " 0 | \n",
- " overlap | \n",
- "
\n",
- " \n",
- " | 7 | \n",
- " chr1 | \n",
- " 267909 | \n",
- " 268105 | \n",
- " 186654 | \n",
- " 186964 | \n",
- " 80945 | \n",
- " downstream | \n",
- "
\n",
- " \n",
- " | 8 | \n",
- " chr1 | \n",
- " 267909 | \n",
- " 268105 | \n",
- " 850424 | \n",
- " 850734 | \n",
- " 582319 | \n",
- " downstream | \n",
- "
\n",
" \n",
"\n",
""
],
"text/plain": [
- " chromosome a_start a_end b_start b_end distance direction\n",
- "0 chr1 181368 181564 186654 186964 5090 downstream\n",
- "1 chr1 181368 181564 267979 268128 86415 downstream\n",
- "2 chr1 181368 181564 850424 850734 668860 downstream\n",
- "3 chr1 186650 186846 186654 186964 0 overlap\n",
- "4 chr1 186650 186846 267979 268128 81133 downstream\n",
- "5 chr1 186650 186846 850424 850734 663578 downstream\n",
- "6 chr1 267909 268105 267979 268128 0 overlap\n",
- "7 chr1 267909 268105 186654 186964 80945 downstream\n",
- "8 chr1 267909 268105 850424 850734 582319 downstream"
+ "Empty DataFrame\n",
+ "Columns: [chromosome, a_start, a_end, b_start, b_end, distance, direction]\n",
+ "Index: []"
]
},
"execution_count": 17,
@@ -3323,6 +3287,7 @@
" FROM features_a a, NEAREST(features_b, k=3, signed=true) b\n",
" WHERE a.chromosome = 'chr1'\n",
" AND a.start_pos < 500000\n",
+ " AND direction = 'upstream'\n",
" ORDER BY a.chromosome, a.start_pos, ABS(distance)\n",
"\"\"\"\n",
"\n",
@@ -3365,7 +3330,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {},
"outputs": [
{
@@ -3696,6 +3661,69 @@
"# Clean up\n",
"# engine.close()"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "9q2y97xvfyj",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Pandas dtypes:\n",
+ "chromosome str\n",
+ "start_pos int64\n",
+ "end_pos int64\n",
+ "name str\n",
+ "score int64\n",
+ "strand str\n",
+ "signal_value float64\n",
+ "p_value float64\n",
+ "q_value float64\n",
+ "peak int64\n",
+ "dtype: object\n",
+ "\n",
+ "First few rows:\n",
+ " chromosome start_pos end_pos name score strand signal_value p_value \\\n",
+ "0 chr17 27381032 27381306 . 1000 . 461.10586 -1.0 \n",
+ "1 chr7 25565932 25566199 . 1000 . 457.60096 -1.0 \n",
+ "\n",
+ " q_value peak \n",
+ "0 5.08726 136 \n",
+ "1 5.08726 133 \n"
+ ]
+ }
+ ],
+ "source": [
+ "import polars as pl\n",
+ "\n",
+ "# Load first BED file to check dtypes\n",
+ "bed_schema = {\n",
+ " \"chromosome\": pl.Utf8,\n",
+ " \"start_pos\": pl.Int64,\n",
+ " \"end_pos\": pl.Int64,\n",
+ " \"name\": pl.Utf8,\n",
+ " \"score\": pl.Int64,\n",
+ " \"strand\": pl.Utf8,\n",
+ " \"signal_value\": pl.Float64,\n",
+ " \"p_value\": pl.Float64,\n",
+ " \"q_value\": pl.Float64,\n",
+ " \"peak\": pl.Int64,\n",
+ "}\n",
+ "\n",
+ "features_a = pl.read_csv(\n",
+ " \".ENCFF199YFA.bed\", separator=\"\\t\", has_header=False, schema=bed_schema\n",
+ ")\n",
+ "\n",
+ "# Check pandas conversion\n",
+ "adf = features_a.to_pandas()\n",
+ "print(\"Pandas dtypes:\")\n",
+ "print(adf.dtypes)\n",
+ "print(\"\\nFirst few rows:\")\n",
+ "print(adf.head(2))"
+ ]
}
],
"metadata": {
diff --git a/src/giql/generators/base.py b/src/giql/generators/base.py
index 8cf0923..3cb446e 100644
--- a/src/giql/generators/base.py
+++ b/src/giql/generators/base.py
@@ -151,6 +151,9 @@ def giqlnearest_sql(self, expression: GIQLNearest) -> str:
stranded = expression.args.get("stranded")
is_stranded = stranded and str(stranded).lower() in ("true", "1")
+ signed = expression.args.get("signed")
+ is_signed = signed and str(signed).lower() in ("true", "1")
+
# Resolve strand columns if stranded mode
ref_strand = None
target_strand = None
@@ -227,6 +230,7 @@ def giqlnearest_sql(self, expression: GIQLNearest) -> str:
target_strand,
stranded=is_stranded,
add_one_for_gap=add_one,
+ signed=is_signed,
)
# Use absolute distance for ordering and filtering
@@ -305,6 +309,17 @@ def giqldistance_sql(self, expression: GIQLDistance) -> str:
else:
stranded = str(stranded_expr).upper() in ("TRUE", "1", "YES")
+ # Extract signed parameter
+ signed_expr = expression.args.get("signed")
+ signed = False
+ if signed_expr:
+ if isinstance(signed_expr, exp.Boolean):
+ signed = signed_expr.this
+ elif isinstance(signed_expr, exp.Literal):
+ signed = str(signed_expr.this).upper() == "TRUE"
+ else:
+ signed = str(signed_expr).upper() in ("TRUE", "1", "YES")
+
# Get SQL representations
interval_a_sql = self.sql(interval_a)
interval_b_sql = self.sql(interval_b)
@@ -374,6 +389,7 @@ def giqldistance_sql(self, expression: GIQLDistance) -> str:
strand_b,
stranded=stranded,
add_one_for_gap=add_one,
+ signed=signed,
)
def _generate_distance_case(
@@ -388,6 +404,7 @@ def _generate_distance_case(
strand_b: str | None,
stranded: bool = False,
add_one_for_gap: bool = False,
+ signed: bool = False,
) -> str:
"""Generate SQL CASE expression for distance calculation.
@@ -401,6 +418,7 @@ def _generate_distance_case(
:param strand_b: Strand column for interval B (None if not stranded)
:param stranded: Whether to use strand-aware distance calculation
:param add_one_for_gap: Whether to add 1 to non-overlapping distance (bedtools compatibility)
+ :param signed: Whether to return signed distance (negative for upstream, positive for downstream)
:return: SQL CASE expression
"""
# Distance adjustment for non-overlapping intervals
@@ -408,12 +426,55 @@ def _generate_distance_case(
if not stranded or strand_a is None or strand_b is None:
# Basic distance calculation without strand awareness
- return f"""CASE WHEN {chrom_a} != {chrom_b} THEN NULL WHEN {start_a} < {end_b} AND {end_a} > {start_b} THEN 0 WHEN {end_a} <= {start_b} THEN ({start_b} - {end_a}{gap_adj}) ELSE ({start_a} - {end_b}{gap_adj}) END"""
+ if signed:
+ # Signed distance: negative for upstream (B before A),
+ # positive for downstream (B after A)
+ return (
+ f"CASE WHEN {chrom_a} != {chrom_b} THEN NULL "
+ f"WHEN {start_a} < {end_b} AND {end_a} > {start_b} THEN 0 "
+ f"WHEN {end_a} <= {start_b} THEN ({start_b} - {end_a}{gap_adj}) "
+ f"ELSE -({start_a} - {end_b}{gap_adj}) END"
+ )
+ # Unsigned (absolute) distance
+ return (
+ f"CASE WHEN {chrom_a} != {chrom_b} THEN NULL "
+ f"WHEN {start_a} < {end_b} AND {end_a} > {start_b} THEN 0 "
+ f"WHEN {end_a} <= {start_b} THEN ({start_b} - {end_a}{gap_adj}) "
+ f"ELSE ({start_a} - {end_b}{gap_adj}) END"
+ )
# Stranded distance calculation
# Return NULL if either strand is '.', '?', or NULL
# Calculate distance and multiply by -1 if first interval is on '-' strand
- return f"""CASE WHEN {chrom_a} != {chrom_b} THEN NULL WHEN {strand_a} IS NULL OR {strand_b} IS NULL THEN NULL WHEN {strand_a} = '.' OR {strand_a} = '?' THEN NULL WHEN {strand_b} = '.' OR {strand_b} = '?' THEN NULL WHEN {start_a} < {end_b} AND {end_a} > {start_b} THEN 0 WHEN {end_a} <= {start_b} THEN CASE WHEN {strand_a} = '-' THEN -({start_b} - {end_a}{gap_adj}) ELSE ({start_b} - {end_a}{gap_adj}) END ELSE CASE WHEN {strand_a} = '-' THEN -({start_a} - {end_b}{gap_adj}) ELSE ({start_a} - {end_b}{gap_adj}) END END"""
+ if signed:
+ # Stranded + signed: apply strand flip AND directional sign
+ return (
+ f"CASE WHEN {chrom_a} != {chrom_b} THEN NULL "
+ f"WHEN {strand_a} IS NULL OR {strand_b} IS NULL THEN NULL "
+ f"WHEN {strand_a} = '.' OR {strand_a} = '?' THEN NULL "
+ f"WHEN {strand_b} = '.' OR {strand_b} = '?' THEN NULL "
+ f"WHEN {start_a} < {end_b} AND {end_a} > {start_b} THEN 0 "
+ f"WHEN {end_a} <= {start_b} THEN "
+ f"CASE WHEN {strand_a} = '-' THEN -({start_b} - {end_a}{gap_adj}) "
+ f"ELSE ({start_b} - {end_a}{gap_adj}) END "
+ f"ELSE "
+ f"CASE WHEN {strand_a} = '-' THEN ({start_a} - {end_b}{gap_adj}) "
+ f"ELSE -({start_a} - {end_b}{gap_adj}) END END"
+ )
+ # Stranded but not signed: apply strand flip only
+ return (
+ f"CASE WHEN {chrom_a} != {chrom_b} THEN NULL "
+ f"WHEN {strand_a} IS NULL OR {strand_b} IS NULL THEN NULL "
+ f"WHEN {strand_a} = '.' OR {strand_a} = '?' THEN NULL "
+ f"WHEN {strand_b} = '.' OR {strand_b} = '?' THEN NULL "
+ f"WHEN {start_a} < {end_b} AND {end_a} > {start_b} THEN 0 "
+ f"WHEN {end_a} <= {start_b} THEN "
+ f"CASE WHEN {strand_a} = '-' THEN -({start_b} - {end_a}{gap_adj}) "
+ f"ELSE ({start_b} - {end_a}{gap_adj}) END "
+ f"ELSE "
+ f"CASE WHEN {strand_a} = '-' THEN -({start_a} - {end_b}{gap_adj}) "
+ f"ELSE ({start_a} - {end_b}{gap_adj}) END END"
+ )
def _generate_spatial_op(self, expression: exp.Binary, op_type: str) -> str:
"""Generate SQL for a spatial operation.
diff --git a/tests/integration/bedtools/test_nearest.py b/tests/integration/bedtools/test_nearest.py
index a185993..c45775a 100644
--- a/tests/integration/bedtools/test_nearest.py
+++ b/tests/integration/bedtools/test_nearest.py
@@ -265,3 +265,202 @@ def test_nearest_boundary_cases(duckdb_connection):
f"Differences: {comparison.differences}\n"
f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}"
)
+
+
+def test_nearest_signed_distance(duckdb_connection):
+ """Test NEAREST with signed=true for directional distance.
+
+ Given:
+ Intervals in A with an upstream neighbor in B
+ When:
+ NEAREST operator is applied with signed=true
+ Then:
+ Distance is negative for upstream B intervals (B ends before A starts)
+ This matches bedtools closest -D ref behavior
+ """
+ # Arrange: a1 has an upstream neighbor (b1)
+ # a1 at [300-400], b1 at [100-200] (upstream, distance = -(300-200+1) = -101)
+ intervals_a = [
+ GenomicInterval("chr1", 300, 400, "a1", 100, "+"),
+ ]
+ intervals_b = [
+ GenomicInterval("chr1", 100, 200, "b1", 100, "+"), # Upstream of a1
+ ]
+
+ # Load into DuckDB
+ load_intervals(
+ duckdb_connection,
+ "intervals_a",
+ [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
+ )
+ load_intervals(
+ duckdb_connection,
+ "intervals_b",
+ [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
+ )
+
+ # Act: Execute bedtools operation with signed distance (-D ref)
+ bedtools_result = closest(
+ [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
+ [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
+ signed=True,
+ )
+
+ # Act: Execute GIQL query with signed=true
+ engine = _setup_giql_engine(duckdb_connection)
+ giql_query = """
+ SELECT
+ a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand,
+ b.chromosome, b.start_pos, b.end_pos, b.name, b.score, b.strand,
+ distance
+ FROM intervals_a a, NEAREST(intervals_b, k=1, signed=true) b
+ ORDER BY a.chromosome, a.start_pos
+ """
+ sql = engine.transpile(giql_query)
+ giql_result = duckdb_connection.execute(sql).fetchall()
+
+ # Assert: Both should return 1 row
+ assert len(giql_result) == len(bedtools_result) == 1
+
+ giql_distance = giql_result[0][12]
+ bedtools_distance = bedtools_result[0][12]
+
+ # Verify the distance is negative (upstream)
+ assert giql_distance < 0, f"Expected negative distance, got {giql_distance}"
+ assert bedtools_distance < 0, f"Expected negative bedtools distance, got {bedtools_distance}"
+
+ # Verify distances match
+ assert giql_distance == bedtools_distance, (
+ f"Distance mismatch: GIQL={giql_distance}, bedtools={bedtools_distance}"
+ )
+
+
+def test_nearest_signed_distance_upstream_only(duckdb_connection):
+ """Test NEAREST with signed=true filtering for upstream features only.
+
+ Given:
+ Intervals in A with neighbors in B, using signed=true
+ When:
+ Filtering for negative distance (upstream features)
+ Then:
+ Only upstream B intervals are returned (distance < 0)
+ """
+ # Arrange
+ # a1 at [500-600]
+ # b1 at [100-200]: upstream, distance = -(500 - 200 + 1) = -301 (closed interval +1)
+ # b2 at [300-400]: upstream, distance = -(500 - 400 + 1) = -101 (closed interval +1)
+ # b3 at [700-800]: downstream, distance = +(700 - 600 + 1) = +101 (closed interval +1)
+ intervals_a = [
+ GenomicInterval("chr1", 500, 600, "a1", 100, "+"),
+ ]
+ intervals_b = [
+ GenomicInterval("chr1", 100, 200, "b1", 100, "+"), # Upstream
+ GenomicInterval("chr1", 300, 400, "b2", 150, "+"), # Upstream
+ GenomicInterval("chr1", 700, 800, "b3", 200, "+"), # Downstream
+ ]
+
+ # Load into DuckDB
+ load_intervals(
+ duckdb_connection,
+ "intervals_a",
+ [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
+ )
+ load_intervals(
+ duckdb_connection,
+ "intervals_b",
+ [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
+ )
+
+ # Act: Execute GIQL query filtering for upstream only (negative distance)
+ engine = _setup_giql_engine(duckdb_connection)
+ giql_query = """
+ SELECT
+ a.name AS a_name,
+ b.name AS b_name,
+ distance
+ FROM intervals_a a, NEAREST(intervals_b, k=3, signed=true) b
+ WHERE distance < 0
+ ORDER BY distance DESC
+ """
+ sql = engine.transpile(giql_query)
+ giql_result = duckdb_connection.execute(sql).fetchall()
+
+ # Assert: Should only return upstream intervals (b1 and b2)
+ assert len(giql_result) == 2
+ # All distances should be negative
+ for row in giql_result:
+ assert row[2] < 0, f"Expected negative distance, got {row[2]}"
+ # b2 should be first (closer upstream, distance -101 with closed interval +1)
+ assert giql_result[0][1] == "b2"
+ assert giql_result[0][2] == -101
+ # b1 should be second (farther upstream, distance -301 with closed interval +1)
+ assert giql_result[1][1] == "b1"
+ assert giql_result[1][2] == -301
+
+
+def test_nearest_signed_distance_downstream(duckdb_connection):
+ """Test NEAREST with signed=true for downstream features.
+
+ Given:
+ Intervals in A with a downstream neighbor in B
+ When:
+ NEAREST operator is applied with signed=true
+ Then:
+ Distance is positive for downstream B intervals (B starts after A ends)
+ This matches bedtools closest -D ref behavior
+ """
+ # Arrange: a1 has a downstream neighbor (b1)
+ # a1 at [100-200], b1 at [300-400] (downstream, distance = 300-200+1 = 101)
+ intervals_a = [
+ GenomicInterval("chr1", 100, 200, "a1", 100, "+"),
+ ]
+ intervals_b = [
+ GenomicInterval("chr1", 300, 400, "b1", 100, "+"), # Downstream of a1
+ ]
+
+ # Load into DuckDB
+ load_intervals(
+ duckdb_connection,
+ "intervals_a",
+ [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
+ )
+ load_intervals(
+ duckdb_connection,
+ "intervals_b",
+ [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
+ )
+
+ # Act: Execute bedtools operation with signed distance (-D ref)
+ bedtools_result = closest(
+ [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a],
+ [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b],
+ signed=True,
+ )
+
+ # Act: Execute GIQL query with signed=true
+ engine = _setup_giql_engine(duckdb_connection)
+ giql_query = """
+ SELECT
+ a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand,
+ b.chromosome, b.start_pos, b.end_pos, b.name, b.score, b.strand,
+ distance
+ FROM intervals_a a, NEAREST(intervals_b, k=1, signed=true) b
+ ORDER BY a.chromosome, a.start_pos
+ """
+ sql = engine.transpile(giql_query)
+ giql_result = duckdb_connection.execute(sql).fetchall()
+
+ # Assert: Both should return 1 row
+ assert len(giql_result) == len(bedtools_result) == 1
+
+ giql_distance = giql_result[0][12]
+ bedtools_distance = bedtools_result[0][12]
+
+ # Verify the distance is positive (downstream)
+ assert giql_distance > 0, f"Expected positive distance, got {giql_distance}"
+ assert bedtools_distance > 0, "Expected positive bedtools distance"
+
+ # Verify distances match
+ assert giql_distance == bedtools_distance, (
+ f"Distance mismatch: GIQL={giql_distance}, bedtools={bedtools_distance}"
+ )
diff --git a/tests/integration/bedtools/utils/bedtools_wrapper.py b/tests/integration/bedtools/utils/bedtools_wrapper.py
index 699185b..21d201f 100644
--- a/tests/integration/bedtools/utils/bedtools_wrapper.py
+++ b/tests/integration/bedtools/utils/bedtools_wrapper.py
@@ -146,6 +146,7 @@ def closest(
intervals_b: List[Tuple],
strand_mode: str | None = None,
k: int = 1,
+ signed: bool = False,
) -> List[Tuple]:
"""Find closest intervals using bedtools closest.
@@ -154,6 +155,8 @@ def closest(
intervals_b: Database intervals to search
strand_mode: Strand requirement ('same', 'opposite', or None for ignore)
k: Number of closest intervals to report (default: 1)
+ signed: If True, return signed distances (negative for upstream B,
+ positive for downstream B). Uses bedtools -D ref mode.
Returns:
List of tuples with format: (a_fields..., b_fields..., distance)
@@ -173,7 +176,14 @@ def closest(
bt_b = bt_b.sort()
# Build kwargs for closest
- kwargs = {"d": True, "t": "first"} # Report distance, break ties by taking first
+ # -d reports unsigned distance, -D ref reports signed distance
+ if signed:
+ # Use -D ref for signed distance relative to reference (A)
+ # Negative = B is upstream of A, Positive = B is downstream of A
+ kwargs = {"D": "ref", "t": "first"}
+ else:
+ kwargs = {"d": True, "t": "first"}
+
if k > 1:
kwargs["k"] = k
if strand_mode == "same":
diff --git a/tests/test_distance_transpilation.py b/tests/test_distance_transpilation.py
index 7b1d79e..77405cc 100644
--- a/tests/test_distance_transpilation.py
+++ b/tests/test_distance_transpilation.py
@@ -70,3 +70,31 @@ def test_distance_transpilation_postgres(self):
expected = """SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" THEN 0 WHEN a."end_pos" <= b."start_pos" THEN (b."start_pos" - a."end_pos") ELSE (a."start_pos" - b."end_pos") END AS dist FROM features_a AS a CROSS JOIN features_b AS b"""
assert output == expected, f"Expected:\n{expected}\n\nGot:\n{output}"
+
+ def test_distance_transpilation_signed_duckdb(self):
+ """
+ GIVEN a GIQL query with DISTANCE(..., signed=true)
+ WHEN transpiling to DuckDB SQL
+ THEN should generate CASE expression with signed distances
+ (negative for upstream, positive for downstream)
+ """
+ sql = """
+ SELECT DISTANCE(a.interval, b.interval, signed=true) as dist
+ FROM features_a a CROSS JOIN features_b b
+ """
+
+ ast = parse_one(sql, dialect=GIQLDialect)
+ generator = GIQLDuckDBGenerator()
+ output = generator.generate(ast)
+
+ # Signed distance: upstream (B before A) returns negative,
+ # downstream (B after A) returns positive
+ expected = (
+ 'SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL '
+ 'WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" THEN 0 '
+ 'WHEN a."end_pos" <= b."start_pos" THEN (b."start_pos" - a."end_pos") '
+ 'ELSE -(a."start_pos" - b."end_pos") END AS dist '
+ "FROM features_a AS a CROSS JOIN features_b AS b"
+ )
+
+ assert output == expected, f"Expected:\n{expected}\n\nGot:\n{output}"
diff --git a/tests/test_nearest_transpilation.py b/tests/test_nearest_transpilation.py
index 0ff8abb..91618b6 100644
--- a/tests/test_nearest_transpilation.py
+++ b/tests/test_nearest_transpilation.py
@@ -155,6 +155,35 @@ def test_nearest_with_stranded_duckdb(self, schema_with_peaks_and_genes):
assert "strand" in output.lower()
assert "LIMIT 3" in output
+ def test_nearest_with_signed_duckdb(self, schema_with_peaks_and_genes):
+ """
+ GIVEN a GIQL query with NEAREST(genes, k=3, signed=true)
+ WHEN transpiling to DuckDB SQL
+ THEN should generate SQL with signed distance column
+ (negative for upstream, positive for downstream)
+ """
+ sql = """
+ SELECT *
+ FROM peaks
+ CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3, signed=true)
+ """
+
+ ast = parse_one(sql, dialect=GIQLDialect)
+ generator = GIQLDuckDBGenerator(schema_info=schema_with_peaks_and_genes)
+ output = generator.generate(ast)
+
+ # Expectations:
+ # - LATERAL subquery
+ # - Signed distance calculation (includes negation for upstream)
+ # - LIMIT 3
+ assert "LATERAL" in output.upper()
+ assert "LIMIT 3" in output
+ # Check for signed distance: the ELSE branch should have a negation
+ # for upstream features (B before A)
+ assert "ELSE -(" in output, (
+ f"Expected signed distance with negation for upstream, got:\n{output}"
+ )
+
# PostgreSQL uses same generator as base for now
# class TestNearestTranspilationPostgreSQL: