From 887bc85c140156574a98bc9711dab14fb8f1da91 Mon Sep 17 00:00:00 2001 From: conradbzura <41407350+conradbzura@users.noreply.github.com> Date: Mon, 8 Dec 2025 14:32:53 -0500 Subject: [PATCH 01/12] Implement genomic interval SQL operators * Initialize project * Implement binary predicate operators and genomic interval notation parser * Rename OVERLAPS to INTERSECTS to align with bedtools and eliminate conflict with existing OVERLAPS keyword * Implement expected schema and dialect parser * Implement GIQL to SQL transpiler * Implement GIQL query engine * Make all imports absolute * Fix bug preventing GIQL operators from being used alongside standard SQL operators and in CTEs * Add bedtools intersect-like recipes to readme * Add documentation * Implement CLUSTER and MERGE UDFs * Quote customizable field names to avoid collisions with protected SQL keywords * Fully implement custom schema support * Implement "giql intersect" CLI command mirroring "bedtools intersect" API and functionality * Replace GIQLEngine.query() with a new execute() method that returns a PEP 249-like cursor * Add transpile() method to GIQLEngine class and update docs * Update README to include installation instructions * Fix bug in MERGE and CLUSTER queries that incorporate CTEs; Add demo notebook with various usage examples * Draft GIQL's governing project constitution * Create feature specification for "DISTANCE" operator * Implement DISTANCE UDF for calculating the distance(s) between intervals * Update demo.ipynb with DISTANCE examples * Update DISTANCE operator to remove "signed" parameter and change behavior of "stranded" parameter when intervals are on opposite strands * Create feature specification for "NEAREST" operator * Implement NEAREST operator * Fix formatting * Implement integration test suite evaluating GIQL behavior against bedtools (#5) * Update core principle I Co-authored-by: Nezar Abdennur * Update rules for core principle I Co-authored-by: Nezar Abdennur * Update core principle II Co-authored-by: Nezar Abdennur * Update core principle III Co-authored-by: Nezar Abdennur * Update rationale for core principle III Co-authored-by: Nezar Abdennur * Clean up speckit artifacts we don't want to track * Refactor documentation * Change default genomic column name from "position" to "interval" * Update readme * Implement CI/CD * Update project authors and maintainers * Implement build hook for dynamic version resolution from git tag * Update readme * Add hypothesis to dev dependencies * Add pybedtools to dev dependencies * Update test workflow to use pixi * Update test workflow to install GIQL in editable mode * Lower code coverage target for now * Describe tags of head branch when tagging new version in publish-release workflow * Separate bump and tag version jobs in publish-release workflow --------- Co-authored-by: Nezar Abdennur --- .coveragerc | 9 + .github/actions/add-label/action.yaml | 22 + .github/actions/build-release/action.yaml | 31 + .github/actions/get-touched-files/action.yaml | 31 + .../publish-github-release/action.yaml | 42 + .../actions/publish-pypi-release/action.yaml | 27 + .github/scripts/bump-version.sh | 108 + .github/scripts/cut-release.sh | 76 + .github/scripts/install-tools.sh | 74 + .github/scripts/publish-distribution.sh | 72 + .github/scripts/set-secrets.sh | 34 + .github/scripts/split-version.sh | 32 + .github/workflows/cut-release.yaml | 120 + .github/workflows/label-pr.yaml | 49 + .github/workflows/publish-release.yaml | 124 + .github/workflows/run-tests.yaml | 29 +
.github/workflows/sync-branches.yaml | 80 + .github/workflows/validate-pr.yaml | 26 + .github/workflows/validate-repo.yaml | 22 + .gitignore | 66 + README.md | 154 + build-hooks/__init__.py | 0 build-hooks/_git.py | 37 + build-hooks/_version.py | 730 ++++ build-hooks/metadata.py | 24 + demo.ipynb | 3722 +++++++++++++++++ docs/.gitignore | 5 + docs/Makefile | 20 + docs/api/index.rst | 12 + docs/conf.py | 73 + docs/guides/index.rst | 32 + docs/guides/multi-backend.rst | 367 ++ docs/guides/performance.rst | 414 ++ docs/guides/schema-mapping.rst | 445 ++ docs/guides/transpilation.rst | 417 ++ docs/index.rst | 147 + docs/make.bat | 35 + docs/operators/aggregation-operators.rst | 402 ++ docs/operators/distance-operators.rst | 393 ++ docs/operators/index.rst | 114 + docs/operators/quantifiers.rst | 332 ++ docs/operators/spatial-operators.rst | 361 ++ docs/quickstart.rst | 228 + docs/recipes/advanced-queries.rst | 449 ++ docs/recipes/bedtools-migration.rst | 695 +++ docs/recipes/clustering-queries.rst | 450 ++ docs/recipes/distance-queries.rst | 376 ++ docs/recipes/index.rst | 77 + docs/recipes/intersect-queries.rst | 379 ++ docs/reference/changelog.rst | 87 + docs/reference/operator-matrix.rst | 199 + docs/reference/syntax-reference.rst | 329 ++ docs/requirements.txt | 8 + pyproject.toml | 91 + src/giql/__init__.py | 52 + src/giql/cli.py | 683 +++ src/giql/constants.py | 11 + src/giql/dialect.py | 130 + src/giql/engine.py | 370 ++ src/giql/expressions.py | 276 ++ src/giql/generators/__init__.py | 9 + src/giql/generators/base.py | 871 ++++ src/giql/generators/duckdb.py | 22 + src/giql/generators/sqlite.py | 25 + src/giql/protocols.py | 81 + src/giql/range_parser.py | 188 + src/giql/schema.py | 83 + src/giql/transformer.py | 582 +++ tests/__init__.py | 0 tests/conftest.py | 180 + tests/integration/bedtools/__init__.py | 5 + tests/integration/bedtools/conftest.py | 46 + tests/integration/bedtools/test_intersect.py | 313 ++ tests/integration/bedtools/test_merge.py | 224 + tests/integration/bedtools/test_nearest.py | 267 ++ .../integration/bedtools/test_strand_aware.py | 471 +++ tests/integration/bedtools/utils/__init__.py | 1 + .../integration/bedtools/utils/bed_export.py | 40 + .../bedtools/utils/bedtools_wrapper.py | 293 ++ .../integration/bedtools/utils/comparison.py | 134 + .../integration/bedtools/utils/data_models.py | 259 ++ .../bedtools/utils/interval_generator.py | 425 ++ tests/test_cluster.py | 441 ++ tests/test_distance_parsing.py | 60 + tests/test_distance_transpilation.py | 72 + tests/test_distance_udf.py | 394 ++ tests/test_engine.py | 480 +++ tests/test_generator.py | 165 + tests/test_nearest_edge_cases.py | 633 +++ tests/test_nearest_parsing.py | 198 + tests/test_nearest_transpilation.py | 267 ++ tests/test_parser.py | 124 + tests/test_range_parser.py | 109 + 93 files changed, 21662 insertions(+) create mode 100644 .coveragerc create mode 100644 .github/actions/add-label/action.yaml create mode 100644 .github/actions/build-release/action.yaml create mode 100644 .github/actions/get-touched-files/action.yaml create mode 100644 .github/actions/publish-github-release/action.yaml create mode 100644 .github/actions/publish-pypi-release/action.yaml create mode 100755 .github/scripts/bump-version.sh create mode 100755 .github/scripts/cut-release.sh create mode 100755 .github/scripts/install-tools.sh create mode 100755 .github/scripts/publish-distribution.sh create mode 100755 .github/scripts/set-secrets.sh create mode 100755 .github/scripts/split-version.sh create mode 100644 
.github/workflows/cut-release.yaml create mode 100644 .github/workflows/label-pr.yaml create mode 100644 .github/workflows/publish-release.yaml create mode 100644 .github/workflows/run-tests.yaml create mode 100644 .github/workflows/sync-branches.yaml create mode 100644 .github/workflows/validate-pr.yaml create mode 100644 .github/workflows/validate-repo.yaml create mode 100644 .gitignore create mode 100644 README.md create mode 100644 build-hooks/__init__.py create mode 100644 build-hooks/_git.py create mode 100644 build-hooks/_version.py create mode 100644 build-hooks/metadata.py create mode 100644 demo.ipynb create mode 100644 docs/.gitignore create mode 100644 docs/Makefile create mode 100644 docs/api/index.rst create mode 100644 docs/conf.py create mode 100644 docs/guides/index.rst create mode 100644 docs/guides/multi-backend.rst create mode 100644 docs/guides/performance.rst create mode 100644 docs/guides/schema-mapping.rst create mode 100644 docs/guides/transpilation.rst create mode 100644 docs/index.rst create mode 100644 docs/make.bat create mode 100644 docs/operators/aggregation-operators.rst create mode 100644 docs/operators/distance-operators.rst create mode 100644 docs/operators/index.rst create mode 100644 docs/operators/quantifiers.rst create mode 100644 docs/operators/spatial-operators.rst create mode 100644 docs/quickstart.rst create mode 100644 docs/recipes/advanced-queries.rst create mode 100644 docs/recipes/bedtools-migration.rst create mode 100644 docs/recipes/clustering-queries.rst create mode 100644 docs/recipes/distance-queries.rst create mode 100644 docs/recipes/index.rst create mode 100644 docs/recipes/intersect-queries.rst create mode 100644 docs/reference/changelog.rst create mode 100644 docs/reference/operator-matrix.rst create mode 100644 docs/reference/syntax-reference.rst create mode 100644 docs/requirements.txt create mode 100644 pyproject.toml create mode 100644 src/giql/__init__.py create mode 100644 src/giql/cli.py create mode 100644 src/giql/constants.py create mode 100644 src/giql/dialect.py create mode 100644 src/giql/engine.py create mode 100644 src/giql/expressions.py create mode 100644 src/giql/generators/__init__.py create mode 100644 src/giql/generators/base.py create mode 100644 src/giql/generators/duckdb.py create mode 100644 src/giql/generators/sqlite.py create mode 100644 src/giql/protocols.py create mode 100644 src/giql/range_parser.py create mode 100644 src/giql/schema.py create mode 100644 src/giql/transformer.py create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/integration/bedtools/__init__.py create mode 100644 tests/integration/bedtools/conftest.py create mode 100644 tests/integration/bedtools/test_intersect.py create mode 100644 tests/integration/bedtools/test_merge.py create mode 100644 tests/integration/bedtools/test_nearest.py create mode 100644 tests/integration/bedtools/test_strand_aware.py create mode 100644 tests/integration/bedtools/utils/__init__.py create mode 100644 tests/integration/bedtools/utils/bed_export.py create mode 100644 tests/integration/bedtools/utils/bedtools_wrapper.py create mode 100644 tests/integration/bedtools/utils/comparison.py create mode 100644 tests/integration/bedtools/utils/data_models.py create mode 100644 tests/integration/bedtools/utils/interval_generator.py create mode 100644 tests/test_cluster.py create mode 100644 tests/test_distance_parsing.py create mode 100644 tests/test_distance_transpilation.py create mode 100644 tests/test_distance_udf.py 
create mode 100644 tests/test_engine.py create mode 100644 tests/test_generator.py create mode 100644 tests/test_nearest_edge_cases.py create mode 100644 tests/test_nearest_parsing.py create mode 100644 tests/test_nearest_transpilation.py create mode 100644 tests/test_parser.py create mode 100644 tests/test_range_parser.py diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..e68a7a0 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,9 @@ +[run] +source = src/giql +omit = + src/giql/__init__.py + +[report] +show_missing = true +precision = 2 +fail_under = 73.40 diff --git a/.github/actions/add-label/action.yaml b/.github/actions/add-label/action.yaml new file mode 100644 index 0000000..aa4c02f --- /dev/null +++ b/.github/actions/add-label/action.yaml @@ -0,0 +1,22 @@ +name: Add label +description: Add specified label to pull request. +inputs: + label: + description: Label to add to the pull request. + required: true + type: string + +runs: + using: composite + steps: + - name: Verify event + if: ${{ github.event_name != 'pull_request' }} + shell: bash + run: | + echo "Error: This action can only be used on pull request events." + exit 1 + + - name: Add label + shell: bash + run: | + gh pr edit "${{ github.event.number }}" --add-label "${{ inputs.label }}" diff --git a/.github/actions/build-release/action.yaml b/.github/actions/build-release/action.yaml new file mode 100644 index 0000000..287532f --- /dev/null +++ b/.github/actions/build-release/action.yaml @@ -0,0 +1,31 @@ +name: Build release +description: Build distribution for the target version. +inputs: + version: + required: true + type: string + +runs: + using: composite + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-tags: true + persist-credentials: false + ref: ${{ inputs.version }} + + - name: Install uv & prepare python + uses: astral-sh/setup-uv@v5 + with: + enable-cache: false + + - name: Build distribution artifacts + shell: bash + run: uv build --out-dir dist-${{ inputs.version }} + + - name: Store distribution artifacts + uses: actions/upload-artifact@v4 + with: + name: dist-${{ inputs.version }} + path: dist-${{ inputs.version }}/ diff --git a/.github/actions/get-touched-files/action.yaml b/.github/actions/get-touched-files/action.yaml new file mode 100644 index 0000000..d4a65c7 --- /dev/null +++ b/.github/actions/get-touched-files/action.yaml @@ -0,0 +1,31 @@ +name: Get touched files +description: Get a list of files that have been modified in a pull request. +inputs: + pathspec: + description: 'Optional pathspec(s) to filter files. E.g., "src/**" will only get files in the src directory.' + required: false + type: string +outputs: + touched: + description: 'List of files that have been modified in a pull request.' 
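+    # Composite actions do not surface step outputs automatically; the
+    # explicit mapping below is what wires the step output through.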
+ value: ${{ steps.get-touched-files.outputs.touched }} + +runs: + using: composite + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + persist-credentials: false + + - name: Get touched files + id: get-touched-files + shell: bash + run: | + eval "pathspec=('${{ inputs.pathspec }}')" + echo "pathspec: '${pathspec[@]}'" + touched=$(git diff --name-only HEAD origin/master -- ${pathspec[@]}) + touched=$(echo "$touched" | tr '\n' ' ' | xargs) + echo "touched: '$touched'" + echo "touched=$touched" >> $GITHUB_OUTPUT diff --git a/.github/actions/publish-github-release/action.yaml b/.github/actions/publish-github-release/action.yaml new file mode 100644 index 0000000..7cdadcb --- /dev/null +++ b/.github/actions/publish-github-release/action.yaml @@ -0,0 +1,42 @@ +name: Publish release to GitHub +description: Publish the target version to GitHub. + +inputs: + version: + required: true + type: string + +runs: + using: composite + steps: + - name: Download distribution artifacts + uses: actions/download-artifact@v4 + with: + name: dist-${{ inputs.version }} + path: dist-${{ inputs.version }}/ + + - name: Sign the artifacts with Sigstore + uses: sigstore/gh-action-sigstore-python@v3.0.0 + with: + inputs: ./dist-${{ inputs.version }}/*.tar.gz ./dist-${{ inputs.version }}/*.whl + + - name: Create GitHub release + shell: bash + env: + GH_TOKEN: ${{ github.token }} + run: | + if [[ "${{ inputs.version }}" == *rc* ]]; then + prerelease_flag="--prerelease" + else + prerelease_flag="" + fi + gh release create ${{ inputs.version }} --verify-tag \ + --repo ${{ github.repository }} \ + --generate-notes \ + $prerelease_flag + + - name: Upload artifact signatures to GitHub release + shell: bash + env: + GH_TOKEN: ${{ github.token }} + run: gh release upload ${{ inputs.version }} dist-${{ inputs.version }}/** --repo ${{ github.repository }} diff --git a/.github/actions/publish-pypi-release/action.yaml b/.github/actions/publish-pypi-release/action.yaml new file mode 100644 index 0000000..bd2414a --- /dev/null +++ b/.github/actions/publish-pypi-release/action.yaml @@ -0,0 +1,27 @@ +name: Publish release to PyPI +description: Publish the target version to PyPI. 
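+# Note: assumes a dist-<version> artifact was uploaded earlier in the same
+# workflow run by the build-release action.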
+ +inputs: + version: + required: true + type: string + pypi-token: + required: true + type: string + +runs: + using: composite + steps: + - name: Download distribution artifacts + uses: actions/download-artifact@v4 + with: + name: dist-${{ inputs.version }} + path: dist-${{ inputs.version }}/ + + - name: Install uv + uses: astral-sh/setup-uv@v5 + + - name: Upload distribution to PyPI + shell: bash + run: | + .github/scripts/publish-distribution.sh --source dist-${{ inputs.version }} ${{ inputs.pypi-token }} diff --git a/.github/scripts/bump-version.sh b/.github/scripts/bump-version.sh new file mode 100755 index 0000000..8e146c1 --- /dev/null +++ b/.github/scripts/bump-version.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +USAGE="Usage: $0 major|minor|patch VERSION [short|full=short]" + +# Evaluate arguments +case $# in + 3) + case $3 in + short) + FULL=false + ;; + full) + FULL=true + ;; + *) + FULL=false + ;; + esac + ;; + 2) + FULL=false + ;; + *) + echo $USAGE + exit 1 + ;; +esac + +# Evaluate version segment +case $1 in + major|minor|patch) + SEGMENT=$1 + ;; + *) + echo "ERROR: Invalid version segment: $1" >&2 + echo $USAGE + exit 1 + ;; +esac + +# Determine release cycle +VERSION=$2 +case $VERSION in + *a*) + CYCLE="a" + PRE_RELEASE=true + ;; + *b*) + CYCLE="b" + PRE_RELEASE=true + ;; + *rc*) + CYCLE="rc" + PRE_RELEASE=true + ;; + *) + CYCLE="." + PRE_RELEASE=false + ;; +esac + +if [ "$PRE_RELEASE" = true ] && [[ "$SEGMENT" == "major" ]]; then + echo "ERROR: Cannot bump major version segment of a pre-release version" >&2 + exit 1 +fi + +#Split version +read MAJOR MINOR PATCH <<< $(.github/scripts/split-version.sh $VERSION) + +case $SEGMENT in + major) + MAJOR=$((MAJOR + 1)) + MINOR=0 + PATCH=0 + ;; + minor) + case $CYCLE in + ".") + MINOR=$((MINOR + 1)) + ;; + "a") + MINOR=$((MINOR)) + CYCLE="b" + ;; + "b") + MINOR=$((MINOR)) + CYCLE="rc" + ;; + "rc") + MINOR=$((MINOR)) + CYCLE="." + ;; + esac + PATCH=0 + ;; + patch) + if [ -z "$PATCH" ]; then + PATCH=0 + fi + PATCH=$((PATCH + 1)) + ;; +esac + +if [ "$CYCLE" == "." ] && [ "$PATCH" -eq 0 ] && [ "$FULL" == false ]; then + echo "v$MAJOR.$MINOR" +else + echo "v$MAJOR.$MINOR$CYCLE$PATCH" +fi diff --git a/.github/scripts/cut-release.sh b/.github/scripts/cut-release.sh new file mode 100755 index 0000000..f547cc8 --- /dev/null +++ b/.github/scripts/cut-release.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +USAGE="Usage: $0 major|minor [BRANCH=release]" + +# Evaluate arguments +case $1 in + major|minor) + RELEASE_TYPE=$1 + ;; + *) + echo "ERROR: Invalid release type: $1" >&2 + echo $USAGE + exit 1 + ;; +esac +case $# in + 1) + BRANCH="release" + ;; + 2) + BRANCH=$2 + ;; + *) + echo $USAGE + exit 1 + ;; +esac + +git fetch --unshallow >/dev/null 2>&1 +git checkout main >/dev/null 2>&1 +git pull >/dev/null 2>&1 + +# Check if the release branch already exists +if git show-ref --verify --quiet refs/heads/$BRANCH; then + echo "ERROR: Branch '$BRANCH' already exists." 
>&2
+    exit 1
+fi
+
+# Get the latest version tag, default to 0.0.0
+VERSION=$(git describe --tags --abbrev=0)
+
+# Verify no active release candidates exist
+if [[ $VERSION == *rc* ]]; then
+    echo "ERROR: An active release candidate already exists: $VERSION" >&2
+    exit 1
+fi
+
+read MAJOR MINOR PATCH <<< $(.github/scripts/split-version.sh $VERSION)
+
+# Bump the version
+case $RELEASE_TYPE in
+    major)
+        RELEASE_VERSION="$((MAJOR + 1)).0rc0"
+        ;;
+    minor)
+        RELEASE_VERSION="${MAJOR}.$((MINOR + 1))rc0"
+        ;;
+esac
+
+RELEASE_TAG="v$RELEASE_VERSION"
+
+# Create a new branch for the release candidate
+OUTPUT=$(git checkout -b $BRANCH 2>&1)
+if [ $? -ne 0 ]; then
+    echo "ERROR: Failed to create branch '$BRANCH'." >&2
+    echo "$OUTPUT" >&2
+    exit 1
+fi
+OUTPUT=$(git push origin $BRANCH 2>&1)
+if [ $? -ne 0 ]; then
+    echo "ERROR: Failed to push branch '$BRANCH' to origin." >&2
+    echo "$OUTPUT" >&2
+    exit 1
+fi
+
+echo $RELEASE_TAG
diff --git a/.github/scripts/install-tools.sh b/.github/scripts/install-tools.sh
new file mode 100755
index 0000000..900a73f
--- /dev/null
+++ b/.github/scripts/install-tools.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+# Check if Homebrew (brew) is installed
+if ! command -v brew &> /dev/null; then
+
+    # Prompt the user to install Homebrew (defaults to "yes")
+    read -p "Homebrew (brew) is not installed. Do you want to install it now? (Y/n): " brew_choice
+    brew_choice=${brew_choice:-Y}
+    if [[ "$brew_choice" == "y" || "$brew_choice" == "Y" ]]; then
+
+        # Install Homebrew using the official installation script
+        /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
+
+        # Confirm the installation
+        if ! command -v brew &> /dev/null; then
+            echo "Error: Homebrew installation failed."
+            exit 1
+        fi
+    else
+        echo "Homebrew is required. Please install it manually."
+        exit 1
+    fi
+else
+    echo "Homebrew is already installed."
+fi
+
+# Check if the GitHub CLI is installed
+if ! command -v gh &> /dev/null; then
+
+    # Prompt the user to install GitHub CLI (defaults to "yes")
+    read -p "GitHub CLI (gh) is required but not installed. Do you want to install it now? (Y/n): " gh_choice
+    gh_choice=${gh_choice:-Y}
+    if [[ "$gh_choice" == "y" || "$gh_choice" == "Y" ]]; then
+
+        # Install GitHub CLI using Homebrew
+        brew install gh
+
+        # Confirm the installation
+        if ! command -v gh &> /dev/null; then
+            echo "Error: GitHub CLI installation failed."
+            exit 1
+        fi
+    else
+        echo "GitHub CLI is required. Please install it manually."
+        exit 1
+    fi
+else
+    echo "GitHub CLI is already installed."
+fi
+
+# Check if the keychain secrets manager (ks) is installed
+if ! command -v ks &> /dev/null; then
+
+    # Prompt the user to install ks (defaults to "yes")
+    read -p "Keychain secrets manager is required but not installed. Do you want to install it now? (Y/n): " ks_choice
+    ks_choice=${ks_choice:-Y}
+    if [[ "$ks_choice" == "y" || "$ks_choice" == "Y" ]]; then
+
+        # Install keychain secrets manager using Homebrew
+        brew tap loteoo/formulas
+        brew install ks
+
+        # Confirm the installation
+        if ! command -v ks &> /dev/null; then
+            echo "Error: Keychain secrets manager installation failed."
+            exit 1
+        fi
+    else
+        echo "Keychain secrets manager is required. Please install it manually."
+        exit 1
+    fi
+else
+    echo "Keychain secrets manager is already installed."
+fi
diff --git a/.github/scripts/publish-distribution.sh b/.github/scripts/publish-distribution.sh
new file mode 100755
index 0000000..cad7710
--- /dev/null
+++ b/.github/scripts/publish-distribution.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+USAGE="Usage: $0 [-s|--source=dist] [[KEYCHAIN=dev SECRET=pypi-token] | [TOKEN]]"
+SOURCE="dist"
+KEYCHAIN="dev"
+SECRET="pypi-token"
+
+# Parse options
+ARGS=()
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        -s|--source)
+            # Set source directory
+            if [[ -z "$2" ]]; then
+                echo "Error: --source requires a value."
+                echo $USAGE
+                exit 1
+            fi
+            SOURCE="$2"
+            shift
+            ;;
+        *)
+            # Collect arguments
+            ARGS+=("$1")
+            ;;
+    esac
+    shift
+done
+echo "Publishing artifacts in '$SOURCE' directory..."
+
+# Parse arguments
+case ${#ARGS[@]} in
+    0)
+        # Attempt to retrieve token from default keychain
+        if command -v ks &> /dev/null; then
+            if [[ -n $(ks -k $KEYCHAIN ls | grep "\b$SECRET\b") ]]; then
+                echo "Publishing with token from keychain..."
+                TOKEN=$(ks -k $KEYCHAIN show $SECRET)
+            else
+                echo "Warning: Keychain does not contain 'pypi-token' secret."
+                echo "Publishing without token..."
+            fi
+        else
+            echo "Warning: Keychain secrets manager (ks) is not installed. Please install it to use keychain secrets."
+            echo "Publishing without token..."
+        fi
+        ;;
+    1)
+        # Use the specified token
+        echo "Publishing with provided token..."
+        TOKEN="${ARGS[0]}"
+        ;;
+    2)
+        # Attempt to retrieve token from the specified keychain
+        echo "Publishing with token from keychain..."
+        KEYCHAIN="${ARGS[0]}"
+        SECRET="${ARGS[1]}"
+        TOKEN=$(ks -k $KEYCHAIN show $SECRET)
+        ;;
+    *)
+        # Improper usage
+        echo $USAGE
+        exit 1
+        ;;
+esac
+
+# Publish the package
+if [ -n "$TOKEN" ]; then
+    uv publish --username "__token__" --password "$TOKEN" "$SOURCE"/*
+else
+    uv publish "$SOURCE"/*
+fi
diff --git a/.github/scripts/set-secrets.sh b/.github/scripts/set-secrets.sh
new file mode 100755
index 0000000..2dbdbe1
--- /dev/null
+++ b/.github/scripts/set-secrets.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+USAGE="Usage: $0 [-k|--keychain KEYCHAIN] [-l|--logout]"
+
+LOGOUT=false
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        -l|--logout) LOGOUT=true;;
+        -k|--keychain) KEYCHAIN="$2"; shift ;;
+        *) echo $USAGE; exit 1 ;;
+    esac
+    shift
+done
+
+if ! gh auth status &> /dev/null; then
+    gh auth login
+fi
+for KEY in "pypi-token" "my-token"; do
+    if [[ -n "$KEYCHAIN" && -n $(ks -k $KEYCHAIN ls | grep "\b$KEY\b") ]]; then
+        echo "Using $KEY from keychain"
+        SECRET=$(ks -k $KEYCHAIN show $KEY)
+    else
+        read -sp "Enter a value for $KEY: " SECRET
+        if [[ -z "$SECRET" ]]; then
+            echo ""
+            echo "Error: A value for $KEY is required."
+            exit 1
+        fi
+    fi
+    gh secret set $(echo $KEY | tr '-' '_' | tr '[:lower:]' '[:upper:]') --app actions --body "$SECRET"
+done
+if [[ "$LOGOUT" == true ]]; then
+    gh auth logout
+fi
diff --git a/.github/scripts/split-version.sh b/.github/scripts/split-version.sh
new file mode 100755
index 0000000..a69a46b
--- /dev/null
+++ b/.github/scripts/split-version.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+USAGE="Usage: $0 [VERSION=0.0.0]"
+
+# Evaluate arguments
+case $# in
+    1)
+        VERSION=$1
+        ;;
+    *)
+        VERSION="0.0.0"
+        ;;
+esac
+
+# Strip the leading "v" if there is one
+if [[ $VERSION == v* ]]; then
+    VERSION=${VERSION#v}
+fi
+
+# Replace pre-release identifiers with a dot
+if [[ $VERSION == *a* ]]; then
+    VERSION=${VERSION//a/.}
+elif [[ $VERSION == *b* ]]; then
+    VERSION=${VERSION//b/.}
+elif [[ $VERSION == *rc* ]]; then
+    VERSION=${VERSION//rc/.}
+fi
+
+# Split the tag into its components
+IFS='.'
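+# Setting IFS to "." makes the read below split VERSION on dots; note that
+# IFS stays modified for the remainder of the script.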
read -r -a VERSION_PARTS <<< "$VERSION"
+
+echo "${VERSION_PARTS[0]} ${VERSION_PARTS[1]} ${VERSION_PARTS[2]}"
diff --git a/.github/workflows/cut-release.yaml b/.github/workflows/cut-release.yaml
new file mode 100644
index 0000000..e140f1d
--- /dev/null
+++ b/.github/workflows/cut-release.yaml
@@ -0,0 +1,120 @@
+name: Cut release
+description: Cut a release branch and publish a release-candidate.
+
+on:
+  workflow_dispatch:
+    inputs:
+      release-type:
+        type: choice
+        options:
+          - major
+          - minor
+        default: minor
+
+jobs:
+  verify-code-changes:
+    name: Verify code has changed
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: main
+      - name: Get touched files
+        id: get-touched-files
+        uses: ./.github/actions/get-touched-files
+        with:
+          pathspec: 'src/ pyproject.toml'
+      - name: Fail if code hasn't changed
+        if: ${{ ! steps.get-touched-files.outputs.touched }}
+        run: |
+          echo "ERROR: No code changes detected."
+          exit 1
+
+  cut-release:
+    name: Cut ${{ github.event.inputs.release-type }} release
+    needs: verify-code-changes
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+    outputs:
+      version: ${{ steps.cut-release-branch.outputs.version }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          token: ${{ secrets.MY_TOKEN }}
+      - name: Verify main is up to date
+        run: |
+          git fetch origin main
+          git fetch origin master
+          if [[ $(git rev-list --count --left-only origin/master...origin/main) -gt 0 ]]; then
+            echo "Error: main is not up to date with master" >&2
+            exit 1
+          fi
+      - name: Create release branch
+        id: cut-release-branch
+        run: |
+          version=$(.github/scripts/cut-release.sh ${{ github.event.inputs.release-type }} release)
+          echo "Release candidate: $version"
+          echo "version=$version" >> $GITHUB_OUTPUT
+      - name: Checkout release
+        uses: actions/checkout@v4
+        with:
+          ref: release
+          token: ${{ secrets.MY_TOKEN }}
+      - name: Tag version
+        run: |
+          git tag ${{ steps.cut-release-branch.outputs.version }}
+          git push origin ${{ steps.cut-release-branch.outputs.version }}
+      - name: Create pull request
+        env:
+          GH_TOKEN: ${{ secrets.MY_TOKEN }}
+        run: |
+          workflow=$(echo '${{ github.workflow }}' | tr '[:upper:]' '[:lower:]')
+          body='Auto-generated by the ['$workflow'](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) workflow.'
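+          # bump-version.sh turns the rc tag into the version the release
+          # will ship as once merged (e.g. v1.3rc0 -> v1.3), for the PR title.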
+ version=$(.github/scripts/bump-version.sh minor ${{ steps.cut-release-branch.outputs.version }}) + gh pr create -B master -H release --title 'Release '$version'' --body "$body" + + build-release: + name: Build release + needs: cut-release + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/build-release + with: + version: ${{ needs.cut-release.outputs.version }} + + publish-github-release: + name: Publish release to GitHub + needs: + - cut-release + - build-release + runs-on: ubuntu-latest + permissions: + contents: write + id-token: write + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/publish-github-release + with: + version: ${{ needs.cut-release.outputs.version }} + + publish-pypi-release: + name: Publish release to PyPI + needs: + - cut-release + - build-release + runs-on: ubuntu-latest + permissions: + id-token: write + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/publish-pypi-release + with: + version: ${{ needs.cut-release.outputs.version }} + pypi-token: ${{ secrets.PYPI_TOKEN }} diff --git a/.github/workflows/label-pr.yaml b/.github/workflows/label-pr.yaml new file mode 100644 index 0000000..0c2e7d5 --- /dev/null +++ b/.github/workflows/label-pr.yaml @@ -0,0 +1,49 @@ +name: Label pull request + +on: + pull_request: + branches: + - master + - main + - release + types: + - opened + - reopened + paths: + - src/** + - pyproject.toml + +jobs: + add-code-change-label: + name: Add code-change label + runs-on: ubuntu-latest + permissions: + contents: write + id-token: write + if: ${{ github.head_ref != 'release' }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Add label + env: + GH_TOKEN: ${{ secrets.MY_TOKEN }} + uses: ./.github/actions/add-label + with: + label: code-change + + add-release-label: + name: Add release label + runs-on: ubuntu-latest + permissions: + contents: write + id-token: write + if: ${{ github.base_ref == 'master' && github.head_ref == 'release' }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Add label + env: + GH_TOKEN: ${{ secrets.MY_TOKEN }} + uses: ./.github/actions/add-label + with: + label: release diff --git a/.github/workflows/publish-release.yaml b/.github/workflows/publish-release.yaml new file mode 100644 index 0000000..478898a --- /dev/null +++ b/.github/workflows/publish-release.yaml @@ -0,0 +1,124 @@ +name: Publish release +description: Tag, build, and publish a release or release-candidate on PR merge. 
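+# Merges into the release branch bump the release-candidate number, while the
+# release -> master merge finalizes the minor version; direct code merges into
+# master are meant to be rejected by the validate-pr workflow.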
+ +on: + pull_request: + branches: + - master + - release + types: + - closed + paths: + - src/** + - pyproject.toml + +jobs: + bump-version: + name: Bump version + runs-on: ubuntu-latest + if: ${{ github.event.pull_request.merged == true }} + permissions: + contents: write + outputs: + version: ${{ steps.bump-version.outputs.version }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + token: ${{ secrets.MY_TOKEN }} + ref: ${{ github.head_ref }} + - name: Determine version segment to bump + id: determine-version-segment + run: | + case ${{ github.base_ref }} in + master) + case ${{ github.head_ref }} in + release) + echo "segment=minor" >> $GITHUB_OUTPUT + ;; + *) + echo "segment=patch" >> $GITHUB_OUTPUT + ;; + esac + ;; + release) + echo "segment=patch" >> $GITHUB_OUTPUT + ;; + *) + echo "Error: Unsupported base branch ${{ github.base_ref }}" >&2 + exit 1 + ;; + esac + - name: Bump version + id: bump-version + run: | + git fetch --unshallow + old_version=$(git describe --tags --abbrev=0) + new_version=$(.github/scripts/bump-version.sh ${{ steps.determine-version-segment.outputs.segment }} $old_version) + echo "Bumping $old_version to $new_version" + echo "version=$new_version" >> $GITHUB_OUTPUT + + tag-version: + name: Tag version + needs: bump-version + runs-on: ubuntu-latest + if: ${{ github.event.pull_request.merged == true }} + permissions: + contents: write + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + token: ${{ secrets.MY_TOKEN }} + - name: Create tag + id: create-tag + run: | + new_version=${{ needs.bump-version.outputs.version }} + echo "Tagging $new_version" + git tag $new_version + git push origin $new_version + echo "version=$new_version" >> $GITHUB_OUTPUT + + build-release: + name: Build release + needs: + - bump-version + - tag-version + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/build-release + with: + version: ${{ needs.bump-version.outputs.version }} + + publish-github-release: + name: Publish release to GitHub + needs: + - bump-version + - tag-version + - build-release + runs-on: ubuntu-latest + permissions: + contents: write + id-token: write + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/publish-github-release + with: + version: ${{ needs.bump-version.outputs.version }} + + publish-pypi-release: + name: Publish release to PyPI + needs: + - bump-version + - tag-version + - build-release + runs-on: ubuntu-latest + permissions: + id-token: write + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/publish-pypi-release + with: + version: ${{ needs.bump-version.outputs.version }} + pypi-token: ${{ secrets.PYPI_TOKEN }} diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml new file mode 100644 index 0000000..32d9bc8 --- /dev/null +++ b/.github/workflows/run-tests.yaml @@ -0,0 +1,29 @@ +name: Run tests + +on: + pull_request: + branches: + - main + - master + - release + +jobs: + run-tests: + name: Python ${{ matrix.python-version }} + runs-on: ubuntu-latest + strategy: + matrix: + python-version: + - '3.11' + - '3.12' + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Install pixi + uses: prefix-dev/setup-pixi@v0.8.1 + - name: Install pixi dependencies + run: pixi install + - name: Install package in development mode + run: pixi run pip install -e . 
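+      # The test run below exercises this editable install from within the
+      # pixi environment.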
+      - name: Run tests
+        run: pixi run pytest tests
diff --git a/.github/workflows/sync-branches.yaml b/.github/workflows/sync-branches.yaml
new file mode 100644
index 0000000..53d4adf
--- /dev/null
+++ b/.github/workflows/sync-branches.yaml
@@ -0,0 +1,80 @@
+name: Sync branches
+description: Sync main, master, and release branches on PR merge.
+
+on:
+  pull_request:
+    branches:
+      - master
+      - release
+    types:
+      - closed
+
+jobs:
+  sync-main:
+    name: Sync main
+    runs-on: ubuntu-latest
+    if: github.event.pull_request.merged == true
+    permissions:
+      contents: write
+      pull-requests: write
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          token: ${{ secrets.MY_TOKEN }}
+      - name: Create pull request
+        env:
+          GH_TOKEN: ${{ secrets.MY_TOKEN }}
+        run: |
+          workflow=$(echo '${{ github.workflow }}' | tr '[:upper:]' '[:lower:]')
+          body='Auto-generated by the ['$workflow'](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) workflow.'
+          url=$(gh pr create -B main -H ${{ github.base_ref }} --title 'Sync main with ${{ github.base_ref }}' --body "$body")
+          gh pr merge $url --auto --merge
+          # gh pr review $url --approve
+
+  sync-release:
+    name: Sync release
+    runs-on: ubuntu-latest
+    if: github.event.pull_request.merged == true && github.head_ref != 'release' && github.base_ref == 'master'
+    permissions:
+      contents: write
+      pull-requests: write
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          token: ${{ secrets.MY_TOKEN }}
+      - name: Create pull request
+        env:
+          GH_TOKEN: ${{ secrets.MY_TOKEN }}
+        run: |
+          git fetch --unshallow
+          if [[ -n $(git ls-remote --heads origin release) ]]; then
+            workflow=$(echo '${{ github.workflow }}' | tr '[:upper:]' '[:lower:]')
+            body='Auto-generated by the ['$workflow'](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) workflow.'
+            url=$(gh pr create -B release -H ${{ github.base_ref }} --title 'Sync release with ${{ github.base_ref }}' --body "$body")
+            gh pr merge $url --auto --merge
+            # gh pr review $url --approve
+          else
+            echo "Release branch not found, skipping."
+          fi
+
+  drop-release:
+    name: Drop release
+    runs-on: ubuntu-latest
+    if: github.event.pull_request.merged == true && github.head_ref == 'release' && github.base_ref == 'master'
+    permissions:
+      contents: write
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          token: ${{ secrets.MY_TOKEN }}
+      - name: Drop release branch
+        run: |
+          git fetch --unshallow
+          if [[ -n $(git ls-remote --heads origin release) ]]; then
+            git push origin --delete release
+          else
+            echo "Release branch not found, skipping."
+          fi
diff --git a/.github/workflows/validate-pr.yaml b/.github/workflows/validate-pr.yaml
new file mode 100644
index 0000000..7ebceec
--- /dev/null
+++ b/.github/workflows/validate-pr.yaml
@@ -0,0 +1,26 @@
+name: Validate pull request
+
+on:
+  pull_request:
+    branches:
+      - master
+      - main
+      - release
+    types:
+      - edited
+      - opened
+      - reopened
+      - synchronize
+    paths:
+      - src/**
+      - pyproject.toml
+
+jobs:
+  merge-forbidden:
+    name: Merge forbidden
+    runs-on: ubuntu-latest
+    if: ${{ github.base_ref == 'master' && github.head_ref != 'release' }}
+    steps:
+      - run: |
+          echo "ERROR: Merging code changes directly into master is forbidden."
+          exit 1
diff --git a/.github/workflows/validate-repo.yaml b/.github/workflows/validate-repo.yaml
new file mode 100644
index 0000000..37c9a03
--- /dev/null
+++ b/.github/workflows/validate-repo.yaml
@@ -0,0 +1,22 @@
+name: Validate repository
+description: Confirm that the required secrets are defined in the repository
+
+on:
+  workflow_dispatch:
+
+jobs:
+  validate-secrets:
+    name: Validate secrets
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        secret_name: [MY_TOKEN, PYPI_TOKEN]
+    steps:
+      - name: Check ${{ matrix.secret_name }} secret
+        env:
+          SECRET_VALUE: ${{ secrets[matrix.secret_name] }}
+        if: ${{ env.SECRET_VALUE == '' }}
+        run: |
+          echo 'The secret "${{ matrix.secret_name }}" has not been defined'
+          echo 'Go to "settings > secrets > actions" to define it'
+          exit 1
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7561275
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,66 @@
+# macOS
+.DS_Store
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+.hypothesis/
+
+# Virtual environments
+.venv/
+venv/
+ENV/
+
+# Testing
+.pytest_cache/
+.coverage
+htmlcov/
+.tox/
+.data/
+sandbox.ipynb
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# uv
+.uv/
+uv.lock
+
+# pixi
+pixi.lock
+
+# specify
+specs/
+
+# claude
+CLAUDE.md
+
+# hidden
+.*
+
+# except for...
+!.coveragerc
+!.github
+!.gitignore
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3d84ec7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,154 @@
+# GIQL - Genomic Interval Query Language
+
+A SQL dialect for genomic range queries with multi-database support.
+
+
+## Overview
+
+GIQL extends SQL with spatial operators for genomic interval queries. It transpiles to standard SQL that works across multiple database backends, including DuckDB and SQLite.
+
+GIQL provides a familiar SQL syntax for bioinformatics workflows, allowing you to express complex genomic range operations without writing intricate SQL expressions. Whether you're filtering variants by genomic region, finding overlapping features, or calculating distances between intervals, GIQL makes these operations intuitive and portable across databases.
+
+## Features
+
+- **SQL-based**: Familiar SQL syntax with genomic extensions
+- **Multi-backend**: Works with DuckDB, SQLite, and more
+- **Spatial operators**: INTERSECTS, CONTAINS, WITHIN for range relationships
+- **Distance operators**: DISTANCE, NEAREST for proximity queries
+- **Aggregation operators**: CLUSTER, MERGE for combining intervals
+- **Set quantifiers**: ANY, ALL for multi-range queries
+- **Transpilation**: Convert GIQL to standard SQL for debugging or external use
+
+## Installation
+
+### From PyPI
+
+Install the latest stable release:
+
+```bash
+pip install giql
+```
+
+Or the latest release candidate:
+
+```bash
+pip install --pre giql
+```
+
+### From Source
+
+Clone the repository and install locally:
+
+```bash
+# Clone the repository
+git clone https://github.com/abdenlab/giql.git
+cd giql
+
+# Install in development mode
+pip install -e .
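+
+# (Optional) Sanity-check the install; assumes "python" is the interpreter
+# you installed into
+python -c "import giql"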
+ +# Or with development dependencies +pip install -e ".[dev]" +``` + +### Building Documentation + +To build the documentation locally: + +```bash +cd docs + +# Install documentation dependencies +pip install -r requirements.txt + +# Build HTML documentation +make html + +# View the documentation +# The built docs will be in docs/_build/html/ +# Open docs/_build/html/index.html in your browser +``` + +## Quick Start + +```python +from giql import GIQLEngine + +# Create engine with DuckDB backend +with GIQLEngine(target_dialect="duckdb") as engine: + # Load genomic data + engine.load_csv("variants", "variants.csv") + engine.register_table_schema( + "variants", + { + "id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + }, + genomic_column="interval", + ) + + # Query with genomic operators (returns cursor for streaming) + cursor = engine.execute(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + + # Process results lazily + for row in cursor: + print(row) + + # Or just transpile to SQL without executing + sql = engine.transpile(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + print(sql) # See the generated SQL +``` + +## Operators at a Glance + +### Spatial Relationships + +| Operator | Description | +|----------|-------------| +| `INTERSECTS` | Returns true when ranges overlap by at least one base pair | +| `CONTAINS` | Returns true when one range fully contains another | +| `WITHIN` | Returns true when one range is fully within another | + +### Distance and Proximity + +| Operator | Description | +|----------|-------------| +| `DISTANCE` | Calculate genomic distance between two intervals | +| `NEAREST` | Find k-nearest genomic features | + +### Aggregation + +| Operator | Description | +|----------|-------------| +| `CLUSTER` | Assign cluster IDs to overlapping intervals | +| `MERGE` | Combine overlapping intervals into unified regions | + +### Set Quantifiers + +| Quantifier | Description | +|------------|-------------| +| `ANY` | Match if condition holds for any of the specified ranges | +| `ALL` | Match if condition holds for all of the specified ranges | + +## Documentation + +For complete documentation, build the docs locally (see above) or visit the hosted documentation. + +The documentation includes: + +- **Operator Reference**: Detailed documentation for each operator with examples +- **Recipes**: Common query patterns for intersections, distance calculations, and clustering +- **Bedtools Migration Guide**: How to replicate bedtools operations with GIQL +- **Guides**: Performance optimization, multi-backend configuration, and schema mapping + +## Development + +This project is in active development. diff --git a/build-hooks/__init__.py b/build-hooks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/build-hooks/_git.py b/build-hooks/_git.py new file mode 100644 index 0000000..5f46ae8 --- /dev/null +++ b/build-hooks/_git.py @@ -0,0 +1,37 @@ +import git +from _version import PythonicVersion, parser + + +@parser("git") +def parse() -> str: + """ + Parses the current git repository to generate a version string. + + Returns: + A version string based on the latest tag, commit hash, and uncommitted + changes. + + Raises: + RuntimeError: If the repository is empty. 
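+
+    Examples:
+        At a tag such as ``v1.2.0`` with a clean checkout this returns
+        ``v1.2.0``; with extra commits or uncommitted changes it appends a
+        local segment, e.g. ``v1.2.0+abc1234.dirty`` (hash illustrative).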
+ """ + repo = git.Repo(search_parent_directories=True) + if repo.bare: + raise RuntimeError(f"The repo at '{repo.working_dir}' cannot be empty!") + head_commit = repo.head.commit + try: + tag = max(repo.tags, key=lambda t: PythonicVersion.parse(str(t))) + except ValueError: + tag_name = "0" + tag_commit = None + else: + tag_name = tag.name + tag_commit = tag.commit + public, *local = tag_name.split("+") + if head_commit != tag_commit: + commit_label = repo.git.rev_parse(head_commit.hexsha, short=True) + local.append(commit_label) + dirty = repo.index.diff(None) or repo.untracked_files + if dirty: + local.append("dirty") + local = ".".join(local) + return f"{public}+{local}" if local else public diff --git a/build-hooks/_version.py b/build-hooks/_version.py new file mode 100644 index 0000000..7d0e90a --- /dev/null +++ b/build-hooks/_version.py @@ -0,0 +1,730 @@ +from __future__ import annotations + +import enum +import functools +import re +from typing import ( + TYPE_CHECKING, + Callable, + Generic, + MutableSequence, + Optional, + Type, + TypeVar, + overload, +) + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + +__version_parsers__: dict = {} + + +def parser(alias: str): + """ + Registers a version parser function or callable type under a given alias. + + Examples: + >>> @parser("example") + ... def example_parser(): + ... return "1.0.0" + """ + + def decorator(decorated: Callable[[], str]) -> Callable[[], str]: + __version_parsers__[alias] = decorated + return decorated + + return decorator + + +def grammatical_series(*words: str) -> str: + """ + Formats a series of words into a grammatical series. + """ + if len(words) > 2: + separator = ", " + last = f'and "{words[-1]}"' + return separator.join([*(f'"{w}"' for w in words[:-1]), last]) + elif len(words) > 1: + return " and ".join((f'"{w}"' for w in words)) + else: + assert words, "At least one word must be provided" + return f'"{words[0]}"' + + +class NonConformingVersionString(Exception): + """ + Exception raised when a version string does not conform to the expected + pattern. + """ + + def __init__(self, version: Optional[str], pattern: re.Pattern): + super().__init__( + f"Version string must match the specified pattern, cannot parse " + f"{repr(version)} with pattern r{repr(pattern.pattern)}." + ) + + +class NonConformingVersionPattern(Exception): + """ + Exception raised when a version pattern does not define all required + capture groups. + """ + + def __init__(self, missing_capture_groups: set[str]): + super().__init__( + f"Version pattern must define all required capture groups, " + f"missing capture groups " + f"{grammatical_series(*sorted(missing_capture_groups))}." + ) + + +class NumericVersionSegment(int): + """Represents a numeric segment of a version. + + Numeric version segments behave as integers and may be incremented by + addition, e.g. `segment += 1`. + + Args: + value (Optional[int | str]): The numeric value of the segment. + format (str): The format string used to render the segment. + + Raises: + ValueError: If the value is negative. 
+
+    Examples:
+        >>> segment = NumericVersionSegment(1, "{}")
+        >>> repr(segment)
+        '<NumericVersionSegment: 1>'
+        >>> segment.render()
+        '1'
+    """
+
+    def __new__(cls, value: Optional[int | str], format: str):
+        if value and int(value) < 0:
+            raise ValueError(f"{cls.__name__} must be positive")
+        if value is None:
+            value = -1
+        value = max(int(value), -1)
+        return super().__new__(cls, value)
+
+    def __init__(self, value: Optional[int | str], format: str):
+        if value is None:
+            value = -1
+        value = max(int(value), -1)
+        self.__NumericVersionSegment_value__: int = value
+        self.__NumericVersionSegment_format__: str = format
+
+    def __repr__(self) -> str:
+        return (
+            f"<{type(self).__qualname__}: {repr(self.__NumericVersionSegment_value__)}>"
+        )
+
+    def __lt__(self, value: int | AlphanumericVersionSegment) -> bool:
+        if isinstance(value, AlphanumericVersionSegment):
+            return True
+        return super().__lt__(value)
+
+    def render(self) -> str:
+        return (
+            self.__NumericVersionSegment_format__.format(
+                str(self.__NumericVersionSegment_value__)
+            )
+            if self.__NumericVersionSegment_value__ > -1
+            else ""
+        )
+
+
+class AlphanumericVersionSegment(str):
+    """Represents an alphanumeric segment of a version.
+
+    Alphanumeric segments behave as strings and may contain numbers, letters,
+    or dashes ("-").
+
+    Args:
+        value (Optional[str]): The alphanumeric value of the segment.
+        format (str): The format string used to render the segment.
+
+    Raises:
+        ValueError: If the value is an empty string or contains invalid
+            characters.
+
+    Examples:
+        >>> segment = AlphanumericVersionSegment("alpha", "-{}")
+        >>> repr(segment)
+        "<AlphanumericVersionSegment: 'alpha'>"
+        >>> segment.render()
+        '-alpha'
+    """
+
+    def __new__(cls, value: Optional[str], format: str, *, whitelist: str = ""):
+        if value == "":
+            raise ValueError(f"{cls.__name__} cannot be an empty string")
+        if value is None:
+            value = ""
+        if any(not v.isalnum() and v not in ("-" + whitelist) for v in value):
+            raise ValueError(
+                f"{cls.__name__} may only contain alphanumeric characters and hyphens, got {value}"
+            )
+        return super().__new__(cls, value)
+
+    def __init__(self, value: Optional[str], format: str, *, whitelist: str = ""):
+        if value is None:
+            value = ""
+        self.__AlphanumericVersionSegment_value__: str = value
+        self.__AlphanumericVersionSegment_format__: str = format
+        self.__AlphanumericVersionSegment_whitelist__: str = whitelist
+
+    def __repr__(self) -> str:
+        return (
+            f"<{type(self).__qualname__}: "
+            f"{repr(self.__AlphanumericVersionSegment_value__)}>"
+        )
+
+    def __lt__(self, value: str | NumericVersionSegment) -> bool:
+        if isinstance(value, NumericVersionSegment):
+            return False
+        return super().__lt__(value)
+
+    def render(self) -> str:
+        return (
+            self.__AlphanumericVersionSegment_format__.format(
+                str(self.__AlphanumericVersionSegment_value__)
+            )
+            if self.__AlphanumericVersionSegment_value__
+            else ""
+        )
+
+
+@functools.total_ordering
+class PreReleaseVersionSegment:
+    """Represents a pre-release segment of a version.
+
+    Args:
+        *values (Optional[str | int]): The values of the pre-release segment.
+        format (str): The format string used to render the segment.
+
+    Examples:
+        >>> segment = PreReleaseVersionSegment("alpha", 1, format="-{}")
+        >>> repr(segment)
+        "<PreReleaseVersionSegment: 'alpha.1'>"
+        >>> segment.render()
+        '-alpha.1'
+    """
+
+    def __init__(self, *values: Optional[str | int], format: str = "-{}"):
+        self.__PreReleaseVersionSegment_value__: list[
+            AlphanumericVersionSegment | NumericVersionSegment
+        ] = [
+            (
+                NumericVersionSegment(v, format="{}")
+                if isinstance(v, int)
+                else AlphanumericVersionSegment(v, format="{}")
+            )
+            for v in values
+        ]
+        self.__PreReleaseVersionSegment_format__: str = format
+
+    def __repr__(self) -> str:
+        return (
+            f"<{type(self).__qualname__}: "
+            f"{repr('.'.join(v.render() for v in self.__PreReleaseVersionSegment_value__))}>"
+        )
+
+    def __eq__(self, other: object):
+        if not isinstance(other, PreReleaseVersionSegment):
+            return super().__eq__(other)
+        else:
+            return (
+                self.__PreReleaseVersionSegment_value__
+                == other.__PreReleaseVersionSegment_value__
+            )
+
+    def __lt__(self, other: PreReleaseVersionSegment) -> bool:
+        if not self.__PreReleaseVersionSegment_value__:
+            return False
+        elif not isinstance(other, PreReleaseVersionSegment):
+            return super().__lt__(other)
+        elif len(self.__PreReleaseVersionSegment_value__) < len(
+            other.__PreReleaseVersionSegment_value__
+        ):
+            return True
+        else:
+            for this, that in zip(
+                self.__PreReleaseVersionSegment_value__,
+                other.__PreReleaseVersionSegment_value__,
+            ):
+                if this != that:
+                    if isinstance(this, NumericVersionSegment) and isinstance(
+                        that, AlphanumericVersionSegment
+                    ):
+                        return True
+                    elif isinstance(this, AlphanumericVersionSegment) and isinstance(
+                        that, NumericVersionSegment
+                    ):
+                        return False
+                    else:
+                        return this < that
+            return False
+
+    def render(self) -> str:
+        return (
+            self.__PreReleaseVersionSegment_format__.format(
+                ".".join(v.render() for v in self.__PreReleaseVersionSegment_value__)
+            )
+            if self.__PreReleaseVersionSegment_value__
+            else ""
+        )
+
+
+@functools.total_ordering
+class ReleaseCycle(metaclass=enum.EnumMeta):
+    """Represents a release cycle of a version.
+
+    Attributes:
+        Alpha (ReleaseCycle): Alpha release cycle.
+        Beta (ReleaseCycle): Beta release cycle.
+        ReleaseCandidate (ReleaseCycle): Release candidate cycle.
+        Production (ReleaseCycle): Production release cycle.
+    """
+
+    __ReleaseCycle_mapping__: dict[int | str, ReleaseCycle] = {}
+    __ReleaseCycle_int__: Optional[int] = None
+    __ReleaseCycle_str__: Optional[str] = None
+
+    Alpha = 0, "a"
+    Beta = 1, "b"
+    ReleaseCandidate = 2, "rc"
+    Production = 3, "."
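+    # Ordering follows the ordinal, so Alpha < Beta < ReleaseCandidate < Production.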
+ + def __new__(cls, ordinal: int, identifier: str): + self = object.__new__(cls) + self.__ReleaseCycle_int__ = ordinal + self.__ReleaseCycle_str__ = identifier + cls.__ReleaseCycle_mapping__.update({ordinal: self, identifier: self}) + return self + + def __lt__(self, other: ReleaseCycle) -> bool: + return int(self).__lt__(int(ReleaseCycle(other))) + + def __int__(self) -> int: + return self.__ReleaseCycle_int__ or 0 + + @classmethod + def _missing_(cls, key: int | str) -> Optional[ReleaseCycle]: + return cls.__ReleaseCycle_mapping__.get(key) + + def render(self) -> str: + return self.__ReleaseCycle_str__ or str(self) + + +VersionSegment = Optional[ + NumericVersionSegment | AlphanumericVersionSegment | ReleaseCycle +] + +segment = type("segment", (property,), {}) + + +class VersionMeta(type): + @property + def parse(cls: Self) -> VersionParser[Self]: + return VersionParser(cls) + + @property + def segments(cls: Self) -> dict[str, segment]: + return {k: v for k, v in cls.__dict__.items() if isinstance(v, segment)} + + +class Version(metaclass=VersionMeta): + """ + Base class for version objects. + + This class provides common functionality for version objects, including + parsing and rendering. + """ + + PATTERN: re.Pattern + + segments: dict[str, segment] + + def __init__(self): + self._dict: dict[str, VersionSegment] = { + k: getattr(self, k) + for k, v in type(self).__dict__.items() + if isinstance(v, segment) + } + + def __iter__(self): + return iter(self._dict) + + def __repr__(self) -> str: + return f"<{type(self).__qualname__}: {repr(str(self))}>" + + def __str__(self) -> str: + return "".join(v.render() for v in self.values() if v is not None) + + @overload + def __getitem__(self, item: slice) -> dict[str, VersionSegment]: ... + + @overload + def __getitem__(self, item: str) -> VersionSegment: ... + + def __getitem__(self, item): + if isinstance(item, slice): + segments = list(type(self).segments.values()) + if item.start: + start = segments.index(item.start) + else: + start = None + if item.stop: + stop = segments.index(item.stop) + if stop > len(segments): + stop = None + else: + stop = None + return {k: v for k, v in list(self.items())[slice(start, stop)]} + else: + return self._dict[item] + + def keys(self): + return self._dict.keys() + + def items(self): + return self._dict.items() + + def values(self): + return self._dict.values() + + +T = TypeVar("T", bound=Version) + + +class VersionParser(Generic[T]): + """ + Parses version strings into version objects. + + This class is used to parse version strings into instances of the specified + version class. It supports custom version patterns and provides a callable + interface for parsing. + + A custom parsing pattern may optionally be specified. This pattern must + specify all named capture groups required for the version type as defined + by its "segment" properties. + + Registered parsers may be accessed as attributes, e.g. `parser.git()`. 
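+
+    Examples:
+        ``PythonicVersion.parse("v1.2.0")`` returns a ``PythonicVersion``,
+        while ``PythonicVersion.parse.git()`` parses the version derived
+        from the current git repository.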
+    """
+
+    def __init__(self, version_class: Type[T]):
+        self.version_class = version_class
+
+    def __call__(self, version: str, *, pattern: re.Pattern | None = None) -> T:
+        if pattern is not None:
+            missing_capture_groups = set(
+                self.version_class.PATTERN.groupindex.keys()
+            ) - set(pattern.groupindex.keys())
+            if missing_capture_groups:
+                raise NonConformingVersionPattern(missing_capture_groups)
+        else:
+            pattern = self.version_class.PATTERN
+        match = pattern.match(version)
+        if not match:
+            raise NonConformingVersionString(version, pattern)
+        segments: dict[str, str] = {
+            k: v for k, v in match.groupdict().items() if v is not None
+        }
+        return self.version_class(**segments)
+
+    def __getattr__(self, attribute):
+        return lambda: self(__version_parsers__[attribute]())
+
+
+@functools.total_ordering
+class PythonicVersion(Version):
+    """
+    Class representing a Pythonic version as described by PEP 440.
+
+    PEP 440 is a standard for versioning Python projects. This class provides
+    methods to parse, compare, and render versions according to PEP 440.
+
+    Examples:
+        >>> version = PythonicVersion(major_release=1, minor_release=0, patch_release=0)
+        >>> repr(version)
+        "<PythonicVersion: '1.0.0'>"
+        >>> str(version)
+        '1.0.0'
+        >>> version.minor_release += 1
+        >>> str(version)
+        '1.1.0'
+    """
+
+    PATTERN: re.Pattern = re.compile(
+        r"v?"
+        r"((?P<epoch>\d+)(?:!))?"
+        r"(?P<major_release>\d+)?"
+        r"(?:(?:\.)(?P<minor_release>\d+))?"
+        r"((?P<release_cycle>\.|a|b|rc)"
+        r"(?P<patch_release>\d+))?"
+        r"((?:\.post)(?P<post_release>\d+))?"
+        r"((?:\.dev)(?P<dev_release>\d+))?"
+        r"((?:\+)(?P<local_identifier>[a-zA-Z0-9.]+))?"
+        r"$"
+    )
+
+    if TYPE_CHECKING:
+        parse: VersionParser[PythonicVersion]
+
+    def __init__(
+        self,
+        epoch: Optional[int | str] = None,
+        major_release: int | str = 0,
+        minor_release: Optional[int | str] = None,
+        release_cycle: Optional[int | str | ReleaseCycle] = None,
+        patch_release: Optional[int | str] = None,
+        post_release: Optional[int | str] = None,
+        dev_release: Optional[int | str] = None,
+        local_identifier: Optional[str] = None,
+    ):
+        assert major_release is not None, "Major release must be defined"
+        if patch_release is not None:
+            assert minor_release is not None, (
+                "Minor release must be defined if patch release is defined"
+            )
+        assert (release_cycle is None) == (patch_release is None), (
+            "Patch release and release cycle must be defined together"
+        )
+        self._epoch = epoch
+        self._major_release = major_release
+        self._minor_release = minor_release
+        self._release_cycle = release_cycle
+        self._patch_release = patch_release
+        self._post_release = post_release
+        self._dev_release = dev_release
+        self._local_identifier = local_identifier or None
+        super().__init__()
+
+    def __eq__(self, other: object):
+        if not isinstance(other, PythonicVersion):
+            return super().__eq__(other)
+        else:
+            return all(
+                (
+                    self.epoch == other.epoch,
+                    self.major_release == other.major_release,
+                    self.minor_release == other.minor_release,
+                    self.release_cycle == other.release_cycle,
+                    self.patch_release == other.patch_release,
+                    self.post_release == other.post_release,
+                    self.dev_release == other.dev_release,
+                    self.local_identifier == other.local_identifier,
+                )
+            )
+
+    def __lt__(self, other: PythonicVersion) -> bool:
+        if isinstance(other, PythonicVersion):
+            for segment in PythonicVersion.segments:
+                this, that = getattr(self, segment), getattr(other, segment)
+                if this is None:
+                    this = -1
+                if that is None:
+                    that = -1
+                if this != that:
+                    return this < that
+            return False
+        else:
+            return super().__lt__(other)
+
+    @segment
+    def epoch(self) -> Optional[NumericVersionSegment]:
+        return (
+            self._epoch
+            if self._epoch is None
+            else NumericVersionSegment(self._epoch, format="{}!")
+        )
+
+    @segment
+    def major_release(self) -> Optional[NumericVersionSegment]:
+        return NumericVersionSegment(self._major_release, format="{}")
+
+    @segment
+    def minor_release(self) -> Optional[NumericVersionSegment]:
+        return NumericVersionSegment(self._minor_release, format=".{}")
+
+    @segment
+    def release_cycle(self) -> Optional[ReleaseCycle]:
+        if self._release_cycle is None:
+            return self._release_cycle
+        else:
+            return ReleaseCycle(self._release_cycle)
+
+    @segment
+    def patch_release(self) -> Optional[NumericVersionSegment]:
+        return NumericVersionSegment(self._patch_release, format="{}")
+
+    @segment
+    def post_release(self) -> Optional[NumericVersionSegment]:
+        return (
+            self._post_release
+            if self._post_release is None
+            else NumericVersionSegment(self._post_release, format=".post{}")
+        )
+
+    @segment
+    def dev_release(self) -> Optional[NumericVersionSegment]:
+        return (
+            self._dev_release
+            if self._dev_release is None
+            else NumericVersionSegment(self._dev_release, format=".dev{}")
+        )
+
+    @segment
+    def local_identifier(self) -> Optional[AlphanumericVersionSegment]:
+        return (
+            self._local_identifier
+            if self._local_identifier is None
+            else AlphanumericVersionSegment(
+                self._local_identifier, format="+{}", whitelist="."
+            )
+        )
+
+    @property
+    def local(self) -> str:
+        return "".join(
+            v.render()
+            for v in self[type(self).local_identifier :].values()
+            if v is not None
+        )
+
+    @property
+    def public(self) -> str:
+        return "".join(
+            v.render()
+            for v in self[: type(self).local_identifier].values()
+            if v is not None
+        )
+
+
+@functools.total_ordering
+class SemanticVersion(Version):
+    """
+    Class representing a semantic version as described by SemVer 2.0.
+
+    This class provides methods to parse, compare, and render versions
+    according to the SemVer 2.0 specification.
+
+    Examples:
+        >>> version = SemanticVersion(major_release=1, minor_release=0, patch_release=0)
+        >>> str(version)
+        '1.0.0'
+        >>> version.minor_release += 1
+        >>> str(version)
+        '1.1.0'
+    """
+
+    PATTERN: re.Pattern = re.compile(
+        r"^v?"
+        r"(?P<major_release>0|[1-9]\d*)"
+        r"\.(?P<minor_release>0|[1-9]\d*)"
+        r"\.(?P<patch_release>0|[1-9]\d*)"
+        r"(?:-(?P<pre_release>(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?"
+        r"(?:\+(?P<build>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?"
+ r"$" + ) + + def __init__( + self, + major_release: int | str = 0, + minor_release: Optional[int | str] = None, + patch_release: Optional[int | str] = None, + pre_release: Optional[str | MutableSequence[int | str]] = None, + build: Optional[int | str] = None, + ): + assert major_release is not None, "Major release must be defined" + if patch_release is not None: + assert minor_release is not None, ( + "Minor release must be defined if patch release is defined" + ) + self._major_release = major_release + self._minor_release = minor_release + self._patch_release = patch_release + self._pre_release = ( + pre_release.split(".") + if isinstance(pre_release, str) + else [pre_release] + if isinstance(pre_release, int) + else pre_release + ) + self._build = build + super().__init__() + + def __eq__(self, other): + if not isinstance(other, SemanticVersion): + return super().__eq__(other) + else: + return all( + ( + self.major_release == other.major_release, + self.minor_release == other.minor_release, + self.patch_release == other.patch_release, + self.pre_release == other.pre_release, + self.build == other.build, + ) + ) + + def __lt__(self, other: SemanticVersion) -> bool: + if isinstance(other, SemanticVersion): + for segment in SemanticVersion.segments: + this, that = getattr(self, segment), getattr(other, segment) + if segment == "pre_release": + if this is None: + this = PreReleaseVersionSegment(format="{}") + if that is None: + that = PreReleaseVersionSegment(format="{}") + if this != that: + return this < that + else: + if this is None: + this = -1 + if that is None: + that = -1 + if this != that: + return this < that + return False + else: + return super().__lt__(other) + + @segment + def major_release(self) -> Optional[NumericVersionSegment]: + return NumericVersionSegment(self._major_release, format="{}") + + @segment + def minor_release(self) -> Optional[NumericVersionSegment]: + return NumericVersionSegment(self._minor_release, format=".{}") + + @segment + def patch_release(self) -> Optional[NumericVersionSegment]: + return NumericVersionSegment(self._patch_release, format=".{}") + + @segment + def pre_release( + self, + ) -> Optional[PreReleaseVersionSegment]: + if self._pre_release: + return PreReleaseVersionSegment(*self._pre_release, format="-{}") + + @segment + def build(self) -> Optional[AlphanumericVersionSegment]: + return ( + self._build + if self._build is None + else AlphanumericVersionSegment(str(self._build), format="+{}") + ) + + @property + def local(self) -> str: + return "" + + @property + def public(self) -> str: + return "".join(v.render() for v in self.values() if v is not None) diff --git a/build-hooks/metadata.py b/build-hooks/metadata.py new file mode 100644 index 0000000..56084f7 --- /dev/null +++ b/build-hooks/metadata.py @@ -0,0 +1,24 @@ +import os +import sys + +sys.path.insert(0, os.path.dirname(__file__)) + +import _git as _git +import _version +from hatchling.metadata.plugin.interface import MetadataHookInterface + + +def _get_subpackages(directory): + subpackages = [] + for entry in os.scandir(directory): + if entry.is_dir() and entry.name.startswith("wool"): + subpackages.append(entry.name) + return subpackages + + +class MetadataHook(MetadataHookInterface): + PLUGIN_NAME = "wool-metadata" + + def update(self, metadata): + version = _version.PythonicVersion.parse.git() + metadata["version"] = str(version) diff --git a/demo.ipynb b/demo.ipynb new file mode 100644 index 0000000..4ab1d96 --- /dev/null +++ b/demo.ipynb @@ -0,0 +1,3722 @@ +{ + "cells": [ + 
{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GIQL operators demo using DuckDB query engine\n", + "\n", + "This notebook demonstrates the 7 core GIQL operators:\n", + "- **Binary predicates**: INTERSECTS, WITHIN, CONTAINS\n", + "- **Aggregation operators**: MERGE, CLUSTER\n", + "- **UDF operators**: DISTANCE\n", + "- **Table-valued functions**: NEAREST\n", + "\n", + "For each operator, we:\n", + "1. Create a GIQL query\n", + "2. Transpile it to standard SQL using GIQL\n", + "3. Execute the SQL using DuckDB" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "import sqlparse\n", + "\n", + "from giql import GIQLEngine" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load BED files\n", + "\n", + "Load the two ENCODE ChIP-seq peak files as Polars DataFrames." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Features A: 60893 intervals\n", + "Features B: 46196 intervals\n", + "\n", + "Features A preview:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 10)
chromosomestart_posend_posnamescorestrandsignal_valuep_valueq_valuepeak
stri64i64stri64strf64f64f64i64
"chr17"2738103227381306"."1000"."461.10586-1.05.08726136
"chr7"2556593225566199"."1000"."457.60096-1.05.08726133
"chr7"143215117143215391"."1000"."443.34712-1.05.08726139
"chr2"164600030164600306"."1000"."442.64446-1.05.08726137
"chr15"5624604656246359"."1000"."438.56423-1.05.08726154
" + ], + "text/plain": [ + "shape: (5, 10)\n", + "┌────────────┬───────────┬───────────┬──────┬───┬──────────────┬─────────┬─────────┬──────┐\n", + "│ chromosome ┆ start_pos ┆ end_pos ┆ name ┆ … ┆ signal_value ┆ p_value ┆ q_value ┆ peak │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ i64 ┆ i64 ┆ str ┆ ┆ f64 ┆ f64 ┆ f64 ┆ i64 │\n", + "╞════════════╪═══════════╪═══════════╪══════╪═══╪══════════════╪═════════╪═════════╪══════╡\n", + "│ chr17 ┆ 27381032 ┆ 27381306 ┆ . ┆ … ┆ 461.10586 ┆ -1.0 ┆ 5.08726 ┆ 136 │\n", + "│ chr7 ┆ 25565932 ┆ 25566199 ┆ . ┆ … ┆ 457.60096 ┆ -1.0 ┆ 5.08726 ┆ 133 │\n", + "│ chr7 ┆ 143215117 ┆ 143215391 ┆ . ┆ … ┆ 443.34712 ┆ -1.0 ┆ 5.08726 ┆ 139 │\n", + "│ chr2 ┆ 164600030 ┆ 164600306 ┆ . ┆ … ┆ 442.64446 ┆ -1.0 ┆ 5.08726 ┆ 137 │\n", + "│ chr15 ┆ 56246046 ┆ 56246359 ┆ . ┆ … ┆ 438.56423 ┆ -1.0 ┆ 5.08726 ┆ 154 │\n", + "└────────────┴───────────┴───────────┴──────┴───┴──────────────┴─────────┴─────────┴──────┘" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load BED files with standard BED column names\n", + "bed_schema = {\n", + " \"chromosome\": pl.Utf8,\n", + " \"start_pos\": pl.Int64,\n", + " \"end_pos\": pl.Int64,\n", + " \"name\": pl.Utf8,\n", + " \"score\": pl.Int64,\n", + " \"strand\": pl.Utf8,\n", + " \"signal_value\": pl.Float64,\n", + " \"p_value\": pl.Float64,\n", + " \"q_value\": pl.Float64,\n", + " \"peak\": pl.Int64,\n", + "}\n", + "\n", + "# Load first BED file\n", + "features_a = pl.read_csv(\n", + " \".ENCFF199YFA.bed\", separator=\"\\t\", has_header=False, schema=bed_schema\n", + ")\n", + "\n", + "# Load second BED file\n", + "features_b = pl.read_csv(\n", + " \".ENCFF205OKL.bed\", separator=\"\\t\", has_header=False, schema=bed_schema\n", + ")\n", + "\n", + "print(f\"Features A: {len(features_a)} intervals\")\n", + "print(f\"Features B: {len(features_b)} intervals\")\n", + "print(\"\\nFeatures A preview:\")\n", + "features_a.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup GIQL engine\n", + "\n", + "Create a GIQL engine and register the table schemas." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GIQL engine configured successfully\n" + ] + } + ], + "source": [ + "# Create GIQL engine with DuckDB backend\n", + "engine = GIQLEngine(target_dialect=\"duckdb\", verbose=False)\n", + "\n", + "# Register DataFrames\n", + "engine.conn.register(\"features_a\", features_a.to_pandas())\n", + "engine.conn.register(\"features_b\", features_b.to_pandas())\n", + "\n", + "# Register schemas with genomic columns\n", + "schema_dict = {\n", + " \"chromosome\": \"VARCHAR\",\n", + " \"start_pos\": \"BIGINT\",\n", + " \"end_pos\": \"BIGINT\",\n", + " \"name\": \"VARCHAR\",\n", + " \"score\": \"BIGINT\",\n", + " \"strand\": \"VARCHAR\",\n", + " \"signal_value\": \"DOUBLE\",\n", + " \"p_value\": \"DOUBLE\",\n", + " \"q_value\": \"DOUBLE\",\n", + " \"peak\": \"BIGINT\",\n", + "}\n", + "\n", + "for table in [\"features_a\", \"features_b\"]:\n", + " engine.register_table_schema(table, schema_dict, genomic_column=\"position\")\n", + "\n", + "print(\"GIQL engine configured successfully\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 1. 
INTERSECTS operator\n", + "\n", + "Find features in A that **overlap** with features in B.\n", + "\n", + "**GIQL Query**: `SELECT a.* FROM features_a a, features_b b WHERE a.position INTERSECTS b.position`" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transpiled SQL:\n", + "SELECT DISTINCT a.*\n", + "FROM\n", + " features_a AS a,\n", + " features_b AS b\n", + "WHERE (a.\"chromosome\" = b.\"chromosome\"\n", + " AND a.\"start_pos\" < b.\"end_pos\"\n", + " AND a.\"end_pos\" > b.\"start_pos\")\n", + "\n", + "================================================================================\n", + "\n", + "Result: 42616 overlapping intervals from A\n" + ] + }, + { + "data": { + "text/html": [ + "
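The predicate GIQL generates here is the classic half-open interval overlap test (BED-style `[start, end)` coordinates). For intuition, a minimal Python restatement of the same logic (the function name is illustrative, not part of GIQL's API):

```python
# Python restatement of the transpiled INTERSECTS predicate above.
# Intervals are half-open [start, end), per the BED convention.
def intersects(a, b):
    a_chrom, a_start, a_end = a
    b_chrom, b_start, b_end = b
    return a_chrom == b_chrom and a_start < b_end and a_end > b_start

assert intersects(("chr1", 10, 20), ("chr1", 19, 30))      # 1 bp overlap
assert not intersects(("chr1", 10, 20), ("chr1", 20, 30))  # abutting intervals do not overlap
assert not intersects(("chr1", 10, 20), ("chr2", 10, 20))  # different chromosomes never overlap
```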
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromosomestart_posend_posnamescorestrandsignal_valuep_valueq_valuepeak
0chr221995370119953957.1000.390.93901-1.05.08726131
1chr137554927575549497.1000.314.02895-1.05.08726111
2chr205293793052938139.1000.242.96853-1.05.08726110
3chr167758431877584523.1000.237.00123-1.05.08726107
4chr168140361281403816.1000.244.89039-1.05.08726103
5chr99741176597412044.1000.328.74736-1.05.08726129
6chr9137236518137236806.1000.323.80804-1.05.08726138
7chr206353798963538200.1000.272.24062-1.05.0872698
8chr108637727586377511.1000.256.93284-1.05.08726125
9chr157763350077633787.1000.380.81693-1.05.08726140
\n", + "
" + ], + "text/plain": [ + " chromosome start_pos end_pos name score strand signal_value p_value \\\n", + "0 chr22 19953701 19953957 . 1000 . 390.93901 -1.0 \n", + "1 chr13 75549275 75549497 . 1000 . 314.02895 -1.0 \n", + "2 chr20 52937930 52938139 . 1000 . 242.96853 -1.0 \n", + "3 chr16 77584318 77584523 . 1000 . 237.00123 -1.0 \n", + "4 chr16 81403612 81403816 . 1000 . 244.89039 -1.0 \n", + "5 chr9 97411765 97412044 . 1000 . 328.74736 -1.0 \n", + "6 chr9 137236518 137236806 . 1000 . 323.80804 -1.0 \n", + "7 chr20 63537989 63538200 . 1000 . 272.24062 -1.0 \n", + "8 chr10 86377275 86377511 . 1000 . 256.93284 -1.0 \n", + "9 chr15 77633500 77633787 . 1000 . 380.81693 -1.0 \n", + "\n", + " q_value peak \n", + "0 5.08726 131 \n", + "1 5.08726 111 \n", + "2 5.08726 110 \n", + "3 5.08726 107 \n", + "4 5.08726 103 \n", + "5 5.08726 129 \n", + "6 5.08726 138 \n", + "7 5.08726 98 \n", + "8 5.08726 125 \n", + "9 5.08726 140 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define GIQL query\n", + "giql_query = \"\"\"\n", + " SELECT DISTINCT a.*\n", + " FROM features_a a, features_b b\n", + " WHERE a.position INTERSECTS b.position\n", + "\"\"\"\n", + "\n", + "# Transpile to SQL\n", + "sql = engine.transpile(giql_query)\n", + "print(\"Transpiled SQL:\")\n", + "print(\n", + " sqlparse.format(\n", + " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", + " )\n", + ")\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "# Execute with DuckDB via GIQL engine\n", + "cursor = engine.conn.execute(sql)\n", + "result = cursor.df() # Get result as pandas DataFrame\n", + "print(f\"Result: {len(result)} overlapping intervals from A\")\n", + "result.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### High overlap depth regions\n", + "\n", + "Find regions with high overlap depth (where many intervals overlap) using a self-join." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transpiled SQL:\n", + "WITH overlap_counts AS\n", + " (SELECT\n", + " a.chromosome,\n", + " a.start_pos,\n", + " a.end_pos,\n", + " a.signal_value,\n", + " COUNT(*) AS depth\n", + " FROM features_b AS a\n", + " INNER JOIN features_a AS b ON (a.\"chromosome\" = b.\"chromosome\"\n", + " AND a.\"start_pos\" < b.\"end_pos\"\n", + " AND a.\"end_pos\" > b.\"start_pos\")\n", + " GROUP BY\n", + " a.chromosome,\n", + " a.start_pos,\n", + " a.end_pos,\n", + " a.signal_value)\n", + "SELECT *\n", + "FROM overlap_counts\n", + "WHERE depth >= 2\n", + "ORDER BY depth DESC\n", + "\n", + "================================================================================\n", + "\n", + "Result: 109 intervals with overlap depth >= 5\n", + "Max overlap depth: 2\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromosomestart_posend_possignal_valuedepth
0chr221954156221954178666.904322
1chr17459716614597197118.473662
2chr1576059040760593503.488732
3chrX10727237410727268421.601522
4chr6132279569132280035151.440082
5chr11659323206593263011.782412
6chr1410599944110599960337.336852
7chr2170928770170929192167.700192
8chr7474750794747538917.008782
9chr21761407011761410115.542952
\n", + "
" + ], + "text/plain": [ + " chromosome start_pos end_pos signal_value depth\n", + "0 chr2 219541562 219541786 66.90432 2\n", + "1 chr17 45971661 45971971 18.47366 2\n", + "2 chr15 76059040 76059350 3.48873 2\n", + "3 chrX 107272374 107272684 21.60152 2\n", + "4 chr6 132279569 132280035 151.44008 2\n", + "5 chr11 65932320 65932630 11.78241 2\n", + "6 chr14 105999441 105999603 37.33685 2\n", + "7 chr2 170928770 170929192 167.70019 2\n", + "8 chr7 47475079 47475389 17.00878 2\n", + "9 chr2 176140701 176141011 5.54295 2" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define GIQL query to find high-depth overlaps\n", + "giql_query = \"\"\"\n", + " WITH overlap_counts AS (\n", + " SELECT \n", + " a.chromosome,\n", + " a.start_pos,\n", + " a.end_pos,\n", + " a.signal_value,\n", + " COUNT(*) as depth\n", + " FROM features_b a\n", + " INNER JOIN features_a b ON a.position INTERSECTS b.position\n", + " GROUP BY a.chromosome, a.start_pos, a.end_pos, a.signal_value\n", + " )\n", + " SELECT *\n", + " FROM overlap_counts\n", + " WHERE depth >= 2\n", + " ORDER BY depth DESC\n", + "\"\"\"\n", + "\n", + "# Transpile to SQL\n", + "sql = engine.transpile(giql_query)\n", + "print(\"Transpiled SQL:\")\n", + "print(\n", + " sqlparse.format(\n", + " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", + " )\n", + ")\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "# Execute with DuckDB via GIQL engine\n", + "cursor = engine.conn.execute(sql)\n", + "result = cursor.df() # Get result as pandas DataFrame\n", + "print(f\"Result: {len(result)} intervals with overlap depth >= 5\")\n", + "print(f\"Max overlap depth: {result['depth'].max() if len(result) > 0 else 0}\")\n", + "result.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 2. WITHIN operator\n", + "\n", + "Find features in A that are **completely contained within** features in B.\n", + "\n", + "**GIQL Query**: `SELECT a.* FROM features_a a, features_b b WHERE a.position WITHIN b.position`" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transpiled SQL:\n", + "SELECT DISTINCT a.*\n", + "FROM\n", + " features_a AS a,\n", + " features_b AS b\n", + "WHERE (a.\"chromosome\" = b.\"chromosome\"\n", + " AND a.\"start_pos\" >= b.\"start_pos\"\n", + " AND a.\"end_pos\" <= b.\"end_pos\")\n", + "\n", + "================================================================================\n", + "\n", + "Result: 30806 intervals from A contained within B\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromosomestart_posend_posnamescorestrandsignal_valuep_valueq_valuepeak
0chr205293793052938139.1000.242.96853-1.05.08726110
1chr168140361281403816.1000.244.89039-1.05.08726103
2chr2032501353250335.1000.274.74729-1.05.0872695
3chr99263477092634990.1000.313.09355-1.05.08726113
4chr93372216233722428.1000.391.35401-1.05.08726132
5chr82290906922909276.1000.245.13028-1.05.08726107
6chr4182925334182925586.1000.284.15090-1.05.08726135
7chr175132246951322723.1000.383.45857-1.05.08726127
8chr54362709643627315.1000.304.51285-1.05.08726115
9chr1124197242419926.1000.247.25473-1.05.08726104
\n", + "
" + ], + "text/plain": [ + " chromosome start_pos end_pos name score strand signal_value p_value \\\n", + "0 chr20 52937930 52938139 . 1000 . 242.96853 -1.0 \n", + "1 chr16 81403612 81403816 . 1000 . 244.89039 -1.0 \n", + "2 chr20 3250135 3250335 . 1000 . 274.74729 -1.0 \n", + "3 chr9 92634770 92634990 . 1000 . 313.09355 -1.0 \n", + "4 chr9 33722162 33722428 . 1000 . 391.35401 -1.0 \n", + "5 chr8 22909069 22909276 . 1000 . 245.13028 -1.0 \n", + "6 chr4 182925334 182925586 . 1000 . 284.15090 -1.0 \n", + "7 chr17 51322469 51322723 . 1000 . 383.45857 -1.0 \n", + "8 chr5 43627096 43627315 . 1000 . 304.51285 -1.0 \n", + "9 chr11 2419724 2419926 . 1000 . 247.25473 -1.0 \n", + "\n", + " q_value peak \n", + "0 5.08726 110 \n", + "1 5.08726 103 \n", + "2 5.08726 95 \n", + "3 5.08726 113 \n", + "4 5.08726 132 \n", + "5 5.08726 107 \n", + "6 5.08726 135 \n", + "7 5.08726 127 \n", + "8 5.08726 115 \n", + "9 5.08726 104 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define GIQL query\n", + "giql_query = \"\"\"\n", + " SELECT DISTINCT a.*\n", + " FROM features_a a, features_b b\n", + " WHERE a.position WITHIN b.position\n", + "\"\"\"\n", + "\n", + "# Transpile to SQL\n", + "sql = engine.transpile(giql_query)\n", + "print(\"Transpiled SQL:\")\n", + "print(\n", + " sqlparse.format(\n", + " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", + " )\n", + ")\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "# Execute with DuckDB via GIQL engine\n", + "cursor = engine.conn.execute(sql)\n", + "result = cursor.df() # Get result as pandas DataFrame\n", + "print(f\"Result: {len(result)} intervals from A contained within B\")\n", + "result.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 3. CONTAINS operator\n", + "\n", + "Find features in A that **completely contain** features in B.\n", + "\n", + "**GIQL Query**: `SELECT a.* FROM features_a a, features_b b WHERE a.position CONTAINS b.position`" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transpiled SQL:\n", + "SELECT DISTINCT a.*\n", + "FROM\n", + " features_a AS a,\n", + " features_b AS b\n", + "WHERE (a.\"chromosome\" = b.\"chromosome\"\n", + " AND a.\"start_pos\" <= b.\"start_pos\"\n", + " AND a.\"end_pos\" >= b.\"end_pos\")\n", + "\n", + "================================================================================\n", + "\n", + "Result: 4917 intervals from A that contain B\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromosomestart_posend_posnamescorestrandsignal_valuep_valueq_valuepeak
0chr134809509248095340.1000.376.88412-1.05.08726119
1chr201283834612838555.1000.275.52141-1.05.0872698
2chrX1153319711533459.1000.233.20172-1.05.08726102
3chr205922258059222781.1000.248.00196-1.05.08726108
4chr172918031729180565.1000.323.60178-1.05.08726125
5chr203905014339050364.1000.286.34142-1.05.08726110
6chr173576413035764337.1000.262.41330-1.05.08726105
7chrX99958269996089.1000.367.15975-1.05.08726126
8chrX1294682612947077.1000.371.63833-1.05.08726126
9chr172914892729149149.1000.233.84271-1.05.08726107
\n", + "
" + ], + "text/plain": [ + " chromosome start_pos end_pos name score strand signal_value p_value \\\n", + "0 chr13 48095092 48095340 . 1000 . 376.88412 -1.0 \n", + "1 chr20 12838346 12838555 . 1000 . 275.52141 -1.0 \n", + "2 chrX 11533197 11533459 . 1000 . 233.20172 -1.0 \n", + "3 chr20 59222580 59222781 . 1000 . 248.00196 -1.0 \n", + "4 chr17 29180317 29180565 . 1000 . 323.60178 -1.0 \n", + "5 chr20 39050143 39050364 . 1000 . 286.34142 -1.0 \n", + "6 chr17 35764130 35764337 . 1000 . 262.41330 -1.0 \n", + "7 chrX 9995826 9996089 . 1000 . 367.15975 -1.0 \n", + "8 chrX 12946826 12947077 . 1000 . 371.63833 -1.0 \n", + "9 chr17 29148927 29149149 . 1000 . 233.84271 -1.0 \n", + "\n", + " q_value peak \n", + "0 5.08726 119 \n", + "1 5.08726 98 \n", + "2 5.08726 102 \n", + "3 5.08726 108 \n", + "4 5.08726 125 \n", + "5 5.08726 110 \n", + "6 5.08726 105 \n", + "7 5.08726 126 \n", + "8 5.08726 126 \n", + "9 5.08726 107 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define GIQL query\n", + "giql_query = \"\"\"\n", + " SELECT DISTINCT a.*\n", + " FROM features_a a, features_b b\n", + " WHERE a.position CONTAINS b.position\n", + "\"\"\"\n", + "\n", + "# Transpile to SQL\n", + "sql = engine.transpile(giql_query)\n", + "print(\"Transpiled SQL:\")\n", + "print(\n", + " sqlparse.format(\n", + " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", + " )\n", + ")\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "# Execute with DuckDB via GIQL engine\n", + "cursor = engine.conn.execute(sql)\n", + "result = cursor.df() # Get result as pandas DataFrame\n", + "print(f\"Result: {len(result)} intervals from A that contain B\")\n", + "result.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 4. MERGE operator\n", + "\n", + "**Combine overlapping intervals** from features_a into merged regions.\n", + "\n", + "Similar to `bedtools merge`, this collapses overlapping genomic intervals.\n", + "\n", + "**GIQL Query**: `SELECT MERGE(position) FROM features_a`" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transpiled SQL:\n", + "SELECT\n", + " \"chromosome\",\n", + " MIN(\"start_pos\") AS start_pos,\n", + " MAX(\"end_pos\") AS end_pos\n", + "FROM\n", + " (SELECT\n", + " *,\n", + " SUM(is_new_cluster) OVER (PARTITION BY \"chromosome\"\n", + " ORDER BY \"start_pos\" NULLS LAST) AS __giql_cluster_id\n", + " FROM\n", + " (SELECT\n", + " *,\n", + " CASE\n", + " WHEN LAG(\"end_pos\") OVER (PARTITION BY \"chromosome\"\n", + " ORDER BY \"start_pos\" NULLS LAST) >= \"start_pos\" THEN 0\n", + " ELSE 1\n", + " END AS is_new_cluster\n", + " FROM features_b) AS lag_calc) AS clustered\n", + "GROUP BY\n", + " chromosome,\n", + " __giql_cluster_id\n", + "ORDER BY\n", + " \"chromosome\" NULLS LAST,\n", + " \"start_pos\" NULLS LAST\n", + "\n", + "================================================================================\n", + "\n", + "Result: 45630 merged intervals (from 60893 original)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromosomestart_posend_pos
0chr1186654186964
1chr1267979268128
2chr1850424850734
3chr1869711870021
4chr1912864913174
5chr1919635919945
6chr1931626931936
7chr1938195938352
8chr1951414951724
9chr1976080976390
\n", + "
" + ], + "text/plain": [ + " chromosome start_pos end_pos\n", + "0 chr1 186654 186964\n", + "1 chr1 267979 268128\n", + "2 chr1 850424 850734\n", + "3 chr1 869711 870021\n", + "4 chr1 912864 913174\n", + "5 chr1 919635 919945\n", + "6 chr1 931626 931936\n", + "7 chr1 938195 938352\n", + "8 chr1 951414 951724\n", + "9 chr1 976080 976390" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define GIQL query - basic merge\n", + "giql_query = \"\"\"\n", + " SELECT MERGE(position)\n", + " FROM features_b\n", + "\"\"\"\n", + "\n", + "# Transpile to SQL\n", + "sql = engine.transpile(giql_query)\n", + "print(\"Transpiled SQL:\")\n", + "print(\n", + " sqlparse.format(\n", + " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", + " )\n", + ")\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "# Execute with DuckDB via GIQL engine\n", + "cursor = engine.conn.execute(sql)\n", + "result = cursor.df() # Get result as pandas DataFrame\n", + "print(f\"Result: {len(result)} merged intervals (from {len(features_a)} original)\")\n", + "result.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### MERGE with aggregation CTE\n", + "\n", + "Compute statistics on merged intervals and filter for intervals that merged multiple features." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transpiled SQL:\n", + "WITH merged_intervals AS\n", + " (SELECT\n", + " \"chromosome\",\n", + " MIN(\"start_pos\") AS start_pos,\n", + " MAX(\"end_pos\") AS end_pos,\n", + " COUNT(*) AS interval_count,\n", + " AVG(signal_value) AS avg_signal\n", + " FROM\n", + " (SELECT\n", + " *,\n", + " SUM(is_new_cluster) OVER (PARTITION BY \"chromosome\"\n", + " ORDER BY \"start_pos\" NULLS LAST) AS __giql_cluster_id\n", + " FROM\n", + " (SELECT\n", + " *,\n", + " CASE\n", + " WHEN LAG(\"end_pos\") OVER (PARTITION BY \"chromosome\"\n", + " ORDER BY \"start_pos\" NULLS LAST) >= \"start_pos\" THEN 0\n", + " ELSE 1\n", + " END AS is_new_cluster\n", + " FROM features_b) AS lag_calc) AS clustered\n", + " GROUP BY\n", + " chromosome,\n", + " __giql_cluster_id\n", + " ORDER BY\n", + " \"chromosome\" NULLS LAST,\n", + " \"start_pos\" NULLS LAST)\n", + "SELECT *\n", + "FROM merged_intervals\n", + "WHERE interval_count > 1\n", + "\n", + "================================================================================\n", + "\n", + "Result: 559 merged intervals with more than 1 original interval\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromosomestart_posend_posinterval_countavg_signal
0chr113000731300592214.844510
1chr183749128375330228.608395
2chr197173079717819213.833490
3chr1147829211478343827.107920
4chr1183621411836267729.312525
5chr11944577319446283216.590025
6chr12395887423959385210.616965
7chr12415796824158482211.985625
8chr12739281727393359211.789830
9chr13010614330106664231.694800
\n", + "
" + ], + "text/plain": [ + " chromosome start_pos end_pos interval_count avg_signal\n", + "0 chr1 1300073 1300592 2 14.844510\n", + "1 chr1 8374912 8375330 2 28.608395\n", + "2 chr1 9717307 9717819 2 13.833490\n", + "3 chr1 14782921 14783438 2 7.107920\n", + "4 chr1 18362141 18362677 2 9.312525\n", + "5 chr1 19445773 19446283 2 16.590025\n", + "6 chr1 23958874 23959385 2 10.616965\n", + "7 chr1 24157968 24158482 2 11.985625\n", + "8 chr1 27392817 27393359 2 11.789830\n", + "9 chr1 30106143 30106664 2 31.694800" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define GIQL query with aggregations in a CTE\n", + "giql_query = \"\"\"\n", + " WITH merged_intervals AS (\n", + " SELECT\n", + " MERGE(position),\n", + " COUNT(*) as interval_count,\n", + " AVG(signal_value) as avg_signal\n", + " FROM features_b\n", + " )\n", + " SELECT *\n", + " FROM merged_intervals\n", + " WHERE interval_count > 1\n", + "\"\"\"\n", + "\n", + "# Transpile to SQL\n", + "sql = engine.transpile(giql_query)\n", + "print(\"Transpiled SQL:\")\n", + "print(\n", + " sqlparse.format(\n", + " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", + " )\n", + ")\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "# Execute with DuckDB via GIQL engine\n", + "cursor = engine.conn.execute(sql)\n", + "result = cursor.df() # Get result as pandas DataFrame\n", + "print(f\"Result: {len(result)} merged intervals with more than 1 original interval\")\n", + "result.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### MERGE with distance parameter\n", + "\n", + "Merge intervals within 1000bp of each other." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transpiled SQL:\n", + "SELECT\n", + " \"chromosome\",\n", + " MIN(\"start_pos\") AS start_pos,\n", + " MAX(\"end_pos\") AS end_pos\n", + "FROM\n", + " (SELECT\n", + " *,\n", + " SUM(is_new_cluster) OVER (PARTITION BY \"chromosome\"\n", + " ORDER BY \"start_pos\" NULLS LAST) AS __giql_cluster_id\n", + " FROM\n", + " (SELECT\n", + " *,\n", + " CASE\n", + " WHEN LAG(\"end_pos\") OVER (PARTITION BY \"chromosome\"\n", + " ORDER BY \"start_pos\" NULLS LAST) + 10000 >= \"start_pos\" THEN 0\n", + " ELSE 1\n", + " END AS is_new_cluster\n", + " FROM features_b) AS lag_calc) AS clustered\n", + "GROUP BY\n", + " chromosome,\n", + " __giql_cluster_id\n", + "ORDER BY\n", + " \"chromosome\" NULLS LAST,\n", + " \"start_pos\" NULLS LAST\n", + "\n", + "================================================================================\n", + "\n", + "Result: 34097 merged intervals (1000bp distance)\n", + "Compared to: 60893 original intervals\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromosomestart_posend_pos
0chr1186654186964
1chr1267979268128
2chr1850424850734
3chr1869711870021
4chr1912864919945
5chr1931626938352
6chr1951414951724
7chr1976080984477
8chr110139871015656
9chr110637871064097
\n", + "
" + ], + "text/plain": [ + " chromosome start_pos end_pos\n", + "0 chr1 186654 186964\n", + "1 chr1 267979 268128\n", + "2 chr1 850424 850734\n", + "3 chr1 869711 870021\n", + "4 chr1 912864 919945\n", + "5 chr1 931626 938352\n", + "6 chr1 951414 951724\n", + "7 chr1 976080 984477\n", + "8 chr1 1013987 1015656\n", + "9 chr1 1063787 1064097" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define GIQL query - merge with distance parameter\n", + "giql_query = \"\"\"\n", + " SELECT MERGE(position, 10000)\n", + " FROM features_b\n", + "\"\"\"\n", + "\n", + "# Transpile to SQL\n", + "sql = engine.transpile(giql_query)\n", + "print(\"Transpiled SQL:\")\n", + "print(\n", + " sqlparse.format(\n", + " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", + " )\n", + ")\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "# Execute with DuckDB via GIQL engine\n", + "cursor = engine.conn.execute(sql)\n", + "result = cursor.df() # Get result as pandas DataFrame\n", + "print(f\"Result: {len(result)} merged intervals (1000bp distance)\")\n", + "print(f\"Compared to: {len(features_a)} original intervals\")\n", + "result.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 5. CLUSTER operator\n", + "\n", + "**Assign cluster IDs** to overlapping intervals from features_a.\n", + "\n", + "Similar to `bedtools cluster`, this groups overlapping genomic intervals.\n", + "\n", + "**GIQL Query**: `SELECT *, CLUSTER(position) AS cluster_id FROM features_a`" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transpiled SQL:\n", + "SELECT\n", + " chromosome,\n", + " start_pos,\n", + " end_pos,\n", + " signal_value,\n", + " SUM(is_new_cluster) OVER (PARTITION BY \"chromosome\"\n", + " ORDER BY \"start_pos\" NULLS LAST) AS cluster_id\n", + "FROM\n", + " (SELECT\n", + " chromosome,\n", + " start_pos,\n", + " end_pos,\n", + " signal_value,\n", + " CASE\n", + " WHEN LAG(\"end_pos\") OVER (PARTITION BY \"chromosome\"\n", + " ORDER BY \"start_pos\" NULLS LAST) >= \"start_pos\" THEN 0\n", + " ELSE 1\n", + " END AS is_new_cluster\n", + " FROM features_a) AS lag_calc\n", + "ORDER BY\n", + " chromosome,\n", + " start_pos\n", + "\n", + "================================================================================\n", + "\n", + "Result: 60893 intervals with cluster assignments\n", + "Number of unique clusters: 5495\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromosomestart_posend_possignal_valuecluster_id
0chr118136818156427.547961.0
1chr118665018684619.930982.0
2chr126790926810588.008633.0
3chr158610658630235.020274.0
4chr172926172945723.294155.0
5chr177881277900856.976636.0
6chr185047385066927.082437.0
7chr185805685825215.558698.0
8chr1869860869991168.872949.0
9chr1904689904883167.5689710.0
\n", + "
" + ], + "text/plain": [ + " chromosome start_pos end_pos signal_value cluster_id\n", + "0 chr1 181368 181564 27.54796 1.0\n", + "1 chr1 186650 186846 19.93098 2.0\n", + "2 chr1 267909 268105 88.00863 3.0\n", + "3 chr1 586106 586302 35.02027 4.0\n", + "4 chr1 729261 729457 23.29415 5.0\n", + "5 chr1 778812 779008 56.97663 6.0\n", + "6 chr1 850473 850669 27.08243 7.0\n", + "7 chr1 858056 858252 15.55869 8.0\n", + "8 chr1 869860 869991 168.87294 9.0\n", + "9 chr1 904689 904883 167.56897 10.0" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define GIQL query - basic clustering\n", + "giql_query = \"\"\"\n", + " SELECT\n", + " chromosome,\n", + " start_pos,\n", + " end_pos,\n", + " signal_value,\n", + " CLUSTER(position) AS cluster_id\n", + " FROM features_a\n", + " ORDER BY chromosome, start_pos\n", + "\"\"\"\n", + "\n", + "# Transpile to SQL\n", + "sql = engine.transpile(giql_query)\n", + "print(\"Transpiled SQL:\")\n", + "print(\n", + " sqlparse.format(\n", + " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", + " )\n", + ")\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "# Execute with DuckDB via GIQL engine\n", + "cursor = engine.conn.execute(sql)\n", + "result = cursor.df() # Get result as pandas DataFrame\n", + "print(f\"Result: {len(result)} intervals with cluster assignments\")\n", + "print(f\"Number of unique clusters: {result['cluster_id'].nunique()}\")\n", + "result.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### CLUSTER with distance parameter\n", + "\n", + "Cluster intervals within 1000bp of each other." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transpiled SQL:\n", + "SELECT\n", + " chromosome,\n", + " start_pos,\n", + " end_pos,\n", + " signal_value,\n", + " SUM(is_new_cluster) OVER (PARTITION BY \"chromosome\"\n", + " ORDER BY \"start_pos\" NULLS LAST) AS cluster_id\n", + "FROM\n", + " (SELECT\n", + " chromosome,\n", + " start_pos,\n", + " end_pos,\n", + " signal_value,\n", + " CASE\n", + " WHEN LAG(\"end_pos\") OVER (PARTITION BY \"chromosome\"\n", + " ORDER BY \"start_pos\" NULLS LAST) + 10000 >= \"start_pos\" THEN 0\n", + " ELSE 1\n", + " END AS is_new_cluster\n", + " FROM features_a) AS lag_calc\n", + "ORDER BY\n", + " chromosome,\n", + " start_pos\n", + "\n", + "================================================================================\n", + "\n", + "Result: 60893 intervals with cluster assignments (1000bp distance)\n", + "Number of unique clusters: 3692\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromosomestart_posend_possignal_valuecluster_id
0chr118136818156427.547961.0
1chr118665018684619.930981.0
2chr126790926810588.008632.0
3chr158610658630235.020273.0
4chr172926172945723.294154.0
5chr177881277900856.976635.0
6chr185047385066927.082436.0
7chr185805685825215.558696.0
8chr1869860869991168.872947.0
9chr1904689904883167.568978.0
\n", + "
" + ], + "text/plain": [ + " chromosome start_pos end_pos signal_value cluster_id\n", + "0 chr1 181368 181564 27.54796 1.0\n", + "1 chr1 186650 186846 19.93098 1.0\n", + "2 chr1 267909 268105 88.00863 2.0\n", + "3 chr1 586106 586302 35.02027 3.0\n", + "4 chr1 729261 729457 23.29415 4.0\n", + "5 chr1 778812 779008 56.97663 5.0\n", + "6 chr1 850473 850669 27.08243 6.0\n", + "7 chr1 858056 858252 15.55869 6.0\n", + "8 chr1 869860 869991 168.87294 7.0\n", + "9 chr1 904689 904883 167.56897 8.0" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define GIQL query with distance parameter\n", + "giql_query = \"\"\"\n", + " SELECT\n", + " chromosome,\n", + " start_pos,\n", + " end_pos,\n", + " signal_value,\n", + " CLUSTER(position, 10000) AS cluster_id\n", + " FROM features_a\n", + " ORDER BY chromosome, start_pos\n", + "\"\"\"\n", + "\n", + "# Transpile to SQL\n", + "sql = engine.transpile(giql_query)\n", + "print(\"Transpiled SQL:\")\n", + "print(\n", + " sqlparse.format(\n", + " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", + " )\n", + ")\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "# Execute with DuckDB via GIQL engine\n", + "cursor = engine.conn.execute(sql)\n", + "result = cursor.df() # Get result as pandas DataFrame\n", + "print(f\"Result: {len(result)} intervals with cluster assignments (1000bp distance)\")\n", + "print(f\"Number of unique clusters: {result['cluster_id'].nunique()}\")\n", + "result.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 6. DISTANCE operator\n", + "\n", + "**Calculate genomic distances** between intervals from `features_a` and `features_b`.\n", + "\n", + "Similar to `bedtools closest -d`, this calculates the distance in base pairs between genomic intervals.\n", + "\n", + "**GIQL Query**: `SELECT a.*, b.*, DISTANCE(a.position, b.position) AS distance FROM features_a a, features_b b`" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transpiled SQL:\n", + "WITH distances AS\n", + " (SELECT\n", + " a.chromosome,\n", + " a.start_pos AS a_start,\n", + " a.end_pos AS a_end,\n", + " b.start_pos AS b_start,\n", + " b.end_pos AS b_end,\n", + " CASE\n", + " WHEN a.\"chromosome\" != b.\"chromosome\" THEN NULL\n", + " WHEN a.\"start_pos\" < b.\"end_pos\"\n", + " AND a.\"end_pos\" > b.\"start_pos\" THEN 0\n", + " WHEN a.\"end_pos\" <= b.\"start_pos\" THEN (b.\"start_pos\" - a.\"end_pos\")\n", + " ELSE (a.\"start_pos\" - b.\"end_pos\")\n", + " END AS signed_distance,\n", + " ROW_NUMBER() OVER (PARTITION BY a.chromosome, a.start_pos, a.end_pos\n", + " ORDER BY ABS(CASE\n", + " WHEN a.\"chromosome\" != b.\"chromosome\" THEN NULL\n", + " WHEN a.\"start_pos\" < b.\"end_pos\"\n", + " AND a.\"end_pos\" > b.\"start_pos\" THEN 0\n", + " WHEN a.\"end_pos\" <= b.\"start_pos\" THEN (b.\"start_pos\" - a.\"end_pos\")\n", + " ELSE (a.\"start_pos\" - b.\"end_pos\")\n", + " END)) AS rank\n", + " FROM features_a AS a\n", + " CROSS JOIN features_b AS b\n", + " WHERE a.chromosome = b.chromosome\n", + " AND a.chromosome = 'chr1'\n", + " AND a.start_pos < 500000)\n", + "SELECT\n", + " chromosome,\n", + " a_start,\n", + " a_end,\n", + " b_start,\n", + " b_end,\n", + " signed_distance,\n", + " CASE\n", + " WHEN signed_distance < 0 THEN 'upstream'\n", + " WHEN signed_distance > 0 THEN 'downstream'\n", + " ELSE 
'overlap'\n", + " END AS direction\n", + "FROM distances\n", + "WHERE rank = 1\n", + "ORDER BY\n", + " chromosome,\n", + " a_start\n", + "\n", + "================================================================================\n", + "\n", + "Result: 3 features with directional distances\n", + "Upstream features: 0\n", + "Downstream features: 1\n", + "Overlapping features: 2\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromosomea_starta_endb_startb_endsigned_distancedirection
0chr11813681815641866541869645090downstream
1chr11866501868461866541869640overlap
2chr12679092681052679792681280overlap
\n", + "
" + ], + "text/plain": [ + " chromosome a_start a_end b_start b_end signed_distance direction\n", + "0 chr1 181368 181564 186654 186964 5090 downstream\n", + "1 chr1 186650 186846 186654 186964 0 overlap\n", + "2 chr1 267909 268105 267979 268128 0 overlap" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define GIQL query with signed distance\n", + "# Signed distance (signed=true) returns negative values for upstream features and positive for downstream\n", + "# This is similar to bedtools closest -D ref\n", + "giql_query = \"\"\"\n", + " WITH distances AS (\n", + " SELECT\n", + " a.chromosome,\n", + " a.start_pos AS a_start,\n", + " a.end_pos AS a_end,\n", + " b.start_pos AS b_start,\n", + " b.end_pos AS b_end,\n", + " DISTANCE(a.position, b.position, signed=true) AS signed_distance,\n", + " ROW_NUMBER() OVER (\n", + " PARTITION BY a.chromosome, a.start_pos, a.end_pos\n", + " ORDER BY ABS(DISTANCE(a.position, b.position, signed=true))\n", + " ) AS rank\n", + " FROM features_a a\n", + " CROSS JOIN features_b b\n", + " WHERE a.chromosome = b.chromosome\n", + " AND a.chromosome = 'chr1'\n", + " AND a.start_pos < 500000\n", + " )\n", + " SELECT \n", + " chromosome,\n", + " a_start,\n", + " a_end,\n", + " b_start,\n", + " b_end,\n", + " signed_distance,\n", + " CASE \n", + " WHEN signed_distance < 0 THEN 'upstream'\n", + " WHEN signed_distance > 0 THEN 'downstream'\n", + " ELSE 'overlap'\n", + " END AS direction\n", + " FROM distances\n", + " WHERE rank = 1\n", + " ORDER BY chromosome, a_start\n", + "\"\"\"\n", + "\n", + "# Transpile to SQL\n", + "sql = engine.transpile(giql_query)\n", + "print(\"Transpiled SQL:\")\n", + "print(\n", + " sqlparse.format(\n", + " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", + " )\n", + ")\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "# Execute with DuckDB via GIQL engine\n", + "cursor = engine.conn.execute(sql)\n", + "result = cursor.df() # Get result as pandas DataFrame\n", + "print(f\"Result: {len(result)} features with directional distances\")\n", + "print(f\"Upstream features: {(result['signed_distance'] < 0).sum()}\")\n", + "print(f\"Downstream features: {(result['signed_distance'] > 0).sum()}\")\n", + "print(f\"Overlapping features: {(result['signed_distance'] == 0).sum()}\")\n", + "result.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Signed Distance (Directional)\n", + "\n", + "Calculate **directional distances** where negative values indicate upstream features and positive values indicate downstream features, similar to `bedtools closest -D ref`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transpiled SQL:\n", + "SELECT\n", + " a.chromosome,\n", + " a.start_pos AS a_start,\n", + " a.end_pos AS a_end,\n", + " b.start_pos AS b_start,\n", + " b.end_pos AS b_end,\n", + " CASE\n", + " WHEN a.\"chromosome\" != b.\"chromosome\" THEN NULL\n", + " WHEN a.\"start_pos\" < b.\"end_pos\"\n", + " AND a.\"end_pos\" > b.\"start_pos\" THEN 0\n", + " WHEN a.\"end_pos\" <= b.\"start_pos\" THEN (b.\"start_pos\" - a.\"end_pos\")\n", + " ELSE (a.\"start_pos\" - b.\"end_pos\")\n", + " END AS distance\n", + "FROM features_a AS a\n", + "CROSS JOIN features_b AS b\n", + "WHERE a.chromosome = b.chromosome\n", + " AND a.chromosome = 'chr1'\n", + " AND a.start_pos BETWEEN 180000 AND 190000\n", + " AND b.start_pos BETWEEN 180000 AND 200000\n", + "ORDER BY\n", + " a.start_pos,\n", + " b.start_pos\n", + "\n", + "================================================================================\n", + "\n", + "Result: 2 distance calculations\n", + "\n", + "Example distances (0 = overlap, positive = gap in base pairs):\n" + ] + }, + { + "data": { + "text/html": [ + "
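The `CASE` expression that `DISTANCE` expands to is compact enough to restate directly. A Python sketch mirroring the generated SQL (the function name is illustrative; the asserted values come from the result table above):

```python
def distance(a, b):
    """Python restatement of the CASE expression DISTANCE() transpiles to."""
    a_chrom, a_start, a_end = a
    b_chrom, b_start, b_end = b
    if a_chrom != b_chrom:
        return None                 # different chromosomes: distance is undefined (NULL)
    if a_start < b_end and a_end > b_start:
        return 0                    # overlapping intervals
    if a_end <= b_start:
        return b_start - a_end      # gap, with b downstream of a
    return a_start - b_end          # gap, with b upstream of a

# Values taken from the recorded output above:
assert distance(("chr1", 181368, 181564), ("chr1", 186654, 186964)) == 5090
assert distance(("chr1", 186650, 186846), ("chr1", 186654, 186964)) == 0
```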
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromosomea_starta_endb_startb_enddistance
0chr11813681815641866541869645090
1chr11866501868461866541869640
\n", + "
" + ], + "text/plain": [ + " chromosome a_start a_end b_start b_end distance\n", + "0 chr1 181368 181564 186654 186964 5090\n", + "1 chr1 186650 186846 186654 186964 0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define GIQL query - basic distance calculation\n", + "# DISTANCE() returns 0 for overlapping intervals, positive integers for gaps between intervals\n", + "# Filters to a small region on chr1 for demonstration\n", + "giql_query = \"\"\"\n", + " SELECT \n", + " a.chromosome,\n", + " a.start_pos AS a_start,\n", + " a.end_pos AS a_end,\n", + " b.start_pos AS b_start,\n", + " b.end_pos AS b_end,\n", + " DISTANCE(a.position, b.position) AS distance\n", + " FROM features_a a\n", + " CROSS JOIN features_b b\n", + " WHERE a.chromosome = b.chromosome \n", + " AND a.chromosome = 'chr1'\n", + " AND a.start_pos BETWEEN 180000 AND 190000\n", + " AND b.start_pos BETWEEN 180000 AND 200000\n", + " ORDER BY a.start_pos, b.start_pos\n", + "\"\"\"\n", + "\n", + "# Transpile to SQL\n", + "sql = engine.transpile(giql_query)\n", + "print(\"Transpiled SQL:\")\n", + "print(\n", + " sqlparse.format(\n", + " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", + " )\n", + ")\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "# Execute with DuckDB via GIQL engine\n", + "cursor = engine.conn.execute(sql)\n", + "result = cursor.df() # Get result as pandas DataFrame\n", + "print(f\"Result: {len(result)} distance calculations\")\n", + "print(\"\\nExample distances (0 = overlap, positive = gap in base pairs):\")\n", + "result.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 7. NEAREST operator\n", + "\n", + "**Find the k-nearest genomic features** using the NEAREST operator.\n", + "\n", + "Similar to `bedtools closest`, this finds the closest features to query intervals. 
The NEAREST operator eliminates the need to write complex window functions for k-nearest neighbor queries.\n", + "\n", + "**Key Features**:\n", + "- Find k-nearest features (k=1 for closest, k>1 for multiple neighbors)\n", + "- Directional queries with `signed=true` (upstream vs downstream)\n", + "- Distance-constrained queries with `max_distance`\n", + "- Strand-specific queries with `stranded=true`\n", + "- Implicit reference resolution in LATERAL joins (automatically uses outer table's position)\n", + "\n", + "**Database Support**:\n", + "- **DuckDB**: Full support (LATERAL joins)\n", + "- **SQLite**: Standalone mode only (no LATERAL support)\n", + "\n", + "**GIQL Query**: `SELECT * FROM peaks CROSS JOIN LATERAL NEAREST(genes, k=3)`" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transpiled SQL:\n", + "SELECT\n", + " a.chromosome,\n", + " a.start_pos AS a_start,\n", + " a.end_pos AS a_end,\n", + " a.signal_value AS a_signal,\n", + " b.start_pos AS b_start,\n", + " b.end_pos AS b_end,\n", + " b.signal_value AS b_signal,\n", + " distance\n", + "FROM\n", + " features_a AS a,\n", + "\n", + " (SELECT\n", + " features_b.*,\n", + " CASE\n", + " WHEN a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n", + " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n", + " AND a.\"end_pos\" > features_b.\"start_pos\" THEN 0\n", + " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n", + " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n", + " END AS distance\n", + " FROM features_b\n", + " WHERE a.\"chromosome\" = features_b.\"chromosome\"\n", + " ORDER BY ABS(CASE\n", + " WHEN a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n", + " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n", + " AND a.\"end_pos\" > features_b.\"start_pos\" THEN 0\n", + " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n", + " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n", + " END)\n", + " LIMIT 1) AS b\n", + "WHERE a.chromosome = 'chr1'\n", + " AND a.start_pos < 1000000\n", + "ORDER BY\n", + " a.chromosome,\n", + " a.start_pos\n", + "\n", + "================================================================================\n", + "\n", + "Result: 24 features from A with their closest feature in B\n", + "Average distance to closest feature: 21573.5 bp\n", + "Features overlapping with closest: 11\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromosomea_starta_enda_signalb_startb_endb_signaldistance
0chr118136818156427.5479618665418696414.619085090
1chr118665018684619.9309818665418696414.619080
2chr126790926810588.0086326797926812849.393660
3chr158610658630235.0202785042485073415.07029264122
4chr172926172945723.2941585042485073415.07029120967
5chr177881277900856.9766385042485073415.0702971416
6chr185047385066927.0824385042485073415.070290
7chr185805685825215.5586985042485073415.070297322
8chr1869860869991168.8729486971187002112.421560
9chr1904689904883167.5689791286491317431.078657981
\n", + "
" + ], + "text/plain": [ + " chromosome a_start a_end a_signal b_start b_end b_signal distance\n", + "0 chr1 181368 181564 27.54796 186654 186964 14.61908 5090\n", + "1 chr1 186650 186846 19.93098 186654 186964 14.61908 0\n", + "2 chr1 267909 268105 88.00863 267979 268128 49.39366 0\n", + "3 chr1 586106 586302 35.02027 850424 850734 15.07029 264122\n", + "4 chr1 729261 729457 23.29415 850424 850734 15.07029 120967\n", + "5 chr1 778812 779008 56.97663 850424 850734 15.07029 71416\n", + "6 chr1 850473 850669 27.08243 850424 850734 15.07029 0\n", + "7 chr1 858056 858252 15.55869 850424 850734 15.07029 7322\n", + "8 chr1 869860 869991 168.87294 869711 870021 12.42156 0\n", + "9 chr1 904689 904883 167.56897 912864 913174 31.07865 7981" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define GIQL query - find closest feature in B for each feature in A\n", + "# This replicates bedtools closest -d functionality using NEAREST(target, k=1)\n", + "# The reference position is automatically inferred from the outer table (features_a)\n", + "giql_query = \"\"\"\n", + " SELECT \n", + " a.chromosome,\n", + " a.start_pos AS a_start,\n", + " a.end_pos AS a_end,\n", + " a.signal_value AS a_signal,\n", + " b.start_pos AS b_start,\n", + " b.end_pos AS b_end,\n", + " b.signal_value AS b_signal,\n", + " distance\n", + " FROM features_a a, NEAREST(features_b, k=1) b\n", + " WHERE a.chromosome = 'chr1'\n", + " AND a.start_pos < 1000000\n", + " ORDER BY a.chromosome, a.start_pos\n", + "\"\"\"\n", + "\n", + "# Transpile to SQL\n", + "sql = engine.transpile(giql_query)\n", + "print(\"Transpiled SQL:\")\n", + "print(\n", + " sqlparse.format(\n", + " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", + " )\n", + ")\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "# Execute with DuckDB via GIQL engine\n", + "cursor = engine.conn.execute(sql)\n", + "result = cursor.df() # Get result as pandas DataFrame\n", + "print(f\"Result: {len(result)} features from A with their closest feature in B\")\n", + "print(f\"Average distance to closest feature: {result['distance'].mean():.1f} bp\")\n", + "print(f\"Features overlapping with closest: {(result['distance'] == 0).sum()}\")\n", + "result.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Finding k-Nearest Features\n", + "\n", + "Find the **3 closest features** in B for each feature in A using `k=3`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transpiled SQL:\n", + "SELECT\n", + " a.chromosome,\n", + " a.start_pos AS a_start,\n", + " a.end_pos AS a_end,\n", + " a.signal_value AS a_signal,\n", + " b.start_pos AS b_start,\n", + " b.end_pos AS b_end,\n", + " b.signal_value AS b_signal,\n", + " distance\n", + "FROM\n", + " features_a AS a,\n", + "\n", + " (SELECT\n", + " features_b.*,\n", + " CASE\n", + " WHEN a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n", + " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n", + " AND a.\"end_pos\" > features_b.\"start_pos\" THEN 0\n", + " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n", + " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n", + " END AS distance\n", + " FROM features_b\n", + " WHERE a.\"chromosome\" = features_b.\"chromosome\"\n", + " ORDER BY ABS(CASE\n", + " WHEN a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n", + " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n", + " AND a.\"end_pos\" > features_b.\"start_pos\" THEN 0\n", + " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n", + " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n", + " END)\n", + " LIMIT 3) AS b\n", + "WHERE a.chromosome = 'chr1'\n", + " AND a.start_pos < 500000\n", + "ORDER BY\n", + " a.chromosome,\n", + " a.start_pos,\n", + " distance\n", + "\n", + "================================================================================\n", + "\n", + "Result: 9 total rows (up to 3 neighbors per feature in A)\n", + "Number of features from A: 3\n", + "Average neighbors per feature: 3.0\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromosomea_starta_enda_signalb_startb_endb_signaldistance
0chr118136818156427.5479618665418696414.619085090
1chr118136818156427.5479626797926812849.3936686415
2chr118136818156427.5479685042485073415.07029668860
3chr118665018684619.9309818665418696414.619080
4chr118665018684619.9309826797926812849.3936681133
5chr118665018684619.9309885042485073415.07029663578
6chr126790926810588.0086326797926812849.393660
7chr126790926810588.0086318665418696414.6190880945
8chr126790926810588.0086385042485073415.07029582319
\n", + "
" + ], + "text/plain": [ + " chromosome a_start a_end a_signal b_start b_end b_signal distance\n", + "0 chr1 181368 181564 27.54796 186654 186964 14.61908 5090\n", + "1 chr1 181368 181564 27.54796 267979 268128 49.39366 86415\n", + "2 chr1 181368 181564 27.54796 850424 850734 15.07029 668860\n", + "3 chr1 186650 186846 19.93098 186654 186964 14.61908 0\n", + "4 chr1 186650 186846 19.93098 267979 268128 49.39366 81133\n", + "5 chr1 186650 186846 19.93098 850424 850734 15.07029 663578\n", + "6 chr1 267909 268105 88.00863 267979 268128 49.39366 0\n", + "7 chr1 267909 268105 88.00863 186654 186964 14.61908 80945\n", + "8 chr1 267909 268105 88.00863 850424 850734 15.07029 582319" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define GIQL query - find 3 nearest features in B for each feature in A\n", + "giql_query = \"\"\"\n", + " SELECT \n", + " a.chromosome,\n", + " a.start_pos AS a_start,\n", + " a.end_pos AS a_end,\n", + " a.signal_value AS a_signal,\n", + " b.start_pos AS b_start,\n", + " b.end_pos AS b_end,\n", + " b.signal_value AS b_signal,\n", + " distance\n", + " FROM features_a a, NEAREST(features_b, k=3) b\n", + " WHERE a.chromosome = 'chr1'\n", + " AND a.start_pos < 500000\n", + " ORDER BY a.chromosome, a.start_pos, distance\n", + "\"\"\"\n", + "\n", + "# Transpile to SQL\n", + "sql = engine.transpile(giql_query)\n", + "print(\"Transpiled SQL:\")\n", + "print(\n", + " sqlparse.format(\n", + " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", + " )\n", + ")\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "# Execute with DuckDB via GIQL engine\n", + "cursor = engine.conn.execute(sql)\n", + "result = cursor.df() # Get result as pandas DataFrame\n", + "\n", + "# Group by feature A to show k neighbors per feature\n", + "grouped = result.groupby([\"a_start\", \"a_end\"]).size()\n", + "print(f\"Result: {len(result)} total rows (up to 3 neighbors per feature in A)\")\n", + "print(f\"Number of features from A: {len(grouped)}\")\n", + "print(f\"Average neighbors per feature: {grouped.mean():.1f}\")\n", + "result.head(15) # Show first 5 features with their 3 neighbors each" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Directional Queries with signed=true\n", + "\n", + "Find the **3 nearest upstream or downstream features** using `signed=true`.\n", + "\n", + "Negative distances indicate upstream features (B is before A), positive distances indicate downstream features (B is after A)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transpiled SQL:\n", + "SELECT\n", + " a.chromosome,\n", + " a.start_pos AS a_start,\n", + " a.end_pos AS a_end,\n", + " b.start_pos AS b_start,\n", + " b.end_pos AS b_end,\n", + " distance,\n", + " CASE\n", + " WHEN distance < 0 THEN 'upstream'\n", + " WHEN distance > 0 THEN 'downstream'\n", + " ELSE 'overlap'\n", + " END AS direction\n", + "FROM\n", + " features_a AS a,\n", + "\n", + " (SELECT\n", + " features_b.*,\n", + " CASE\n", + " WHEN a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n", + " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n", + " AND a.\"end_pos\" > features_b.\"start_pos\" THEN 0\n", + " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n", + " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n", + " END AS distance\n", + " FROM features_b\n", + " WHERE a.\"chromosome\" = features_b.\"chromosome\"\n", + " ORDER BY ABS(CASE\n", + " WHEN a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n", + " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n", + " AND a.\"end_pos\" > features_b.\"start_pos\" THEN 0\n", + " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n", + " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n", + " END)\n", + " LIMIT 3) AS b\n", + "WHERE a.chromosome = 'chr1'\n", + " AND a.start_pos < 500000\n", + "ORDER BY\n", + " a.chromosome,\n", + " a.start_pos,\n", + " ABS(distance)\n", + "\n", + "================================================================================\n", + "\n", + "Result: 9 total nearest features\n", + "Upstream features (distance < 0): 0\n", + "Downstream features (distance > 0): 7\n", + "Overlapping features (distance = 0): 2\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromosomea_starta_endb_startb_enddistancedirection
0chr11813681815641866541869645090downstream
1chr118136818156426797926812886415downstream
2chr1181368181564850424850734668860downstream
3chr11866501868461866541869640overlap
4chr118665018684626797926812881133downstream
5chr1186650186846850424850734663578downstream
6chr12679092681052679792681280overlap
7chr126790926810518665418696480945downstream
8chr1267909268105850424850734582319downstream
\n", + "
" + ], + "text/plain": [ + " chromosome a_start a_end b_start b_end distance direction\n", + "0 chr1 181368 181564 186654 186964 5090 downstream\n", + "1 chr1 181368 181564 267979 268128 86415 downstream\n", + "2 chr1 181368 181564 850424 850734 668860 downstream\n", + "3 chr1 186650 186846 186654 186964 0 overlap\n", + "4 chr1 186650 186846 267979 268128 81133 downstream\n", + "5 chr1 186650 186846 850424 850734 663578 downstream\n", + "6 chr1 267909 268105 267979 268128 0 overlap\n", + "7 chr1 267909 268105 186654 186964 80945 downstream\n", + "8 chr1 267909 268105 850424 850734 582319 downstream" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define GIQL query - find 3 nearest features with directional distances\n", + "# signed=true returns negative for upstream (B before A) and positive for downstream (B after A)\n", + "giql_query = \"\"\"\n", + " SELECT \n", + " a.chromosome,\n", + " a.start_pos AS a_start,\n", + " a.end_pos AS a_end,\n", + " b.start_pos AS b_start,\n", + " b.end_pos AS b_end,\n", + " distance,\n", + " CASE\n", + " WHEN distance < 0 THEN 'upstream'\n", + " WHEN distance > 0 THEN 'downstream'\n", + " ELSE 'overlap'\n", + " END AS direction\n", + " FROM features_a a, NEAREST(features_b, k=3, signed=true) b\n", + " WHERE a.chromosome = 'chr1'\n", + " AND a.start_pos < 500000\n", + " ORDER BY a.chromosome, a.start_pos, ABS(distance)\n", + "\"\"\"\n", + "\n", + "# Transpile to SQL\n", + "sql = engine.transpile(giql_query)\n", + "print(\"Transpiled SQL:\")\n", + "print(\n", + " sqlparse.format(\n", + " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", + " )\n", + ")\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "# Execute with DuckDB via GIQL engine\n", + "cursor = engine.conn.execute(sql)\n", + "result = cursor.df() # Get result as pandas DataFrame\n", + "\n", + "# Analyze directionality\n", + "upstream = (result[\"distance\"] < 0).sum()\n", + "downstream = (result[\"distance\"] > 0).sum()\n", + "overlap = (result[\"distance\"] == 0).sum()\n", + "\n", + "print(f\"Result: {len(result)} total nearest features\")\n", + "print(f\"Upstream features (distance < 0): {upstream}\")\n", + "print(f\"Downstream features (distance > 0): {downstream}\")\n", + "print(f\"Overlapping features (distance = 0): {overlap}\")\n", + "result.head(15)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Distance-Constrained Queries with max_distance\n", + "\n", + "Find up to **5 nearest features within 50kb** using `max_distance=50000`.\n", + "\n", + "This is useful for finding nearby regulatory elements or associated genes within a biologically relevant distance." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Transpiled SQL:\n",
+ "SELECT\n",
+ " a.chromosome,\n",
+ " a.start_pos AS a_start,\n",
+ " a.end_pos AS a_end,\n",
+ " b.start_pos AS b_start,\n",
+ " b.end_pos AS b_end,\n",
+ " distance\n",
+ "FROM\n",
+ " features_a AS a,\n",
+ "\n",
+ " (SELECT\n",
+ " features_b.*,\n",
+ " CASE\n",
+ " WHEN a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n",
+ " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n",
+ " AND a.\"end_pos\" > features_b.\"start_pos\" THEN 0\n",
+ " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n",
+ " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n",
+ " END AS distance\n",
+ " FROM features_b\n",
+ " WHERE a.\"chromosome\" = features_b.\"chromosome\"\n",
+ " AND (ABS(CASE\n",
+ " WHEN a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n",
+ " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n",
+ " AND a.\"end_pos\" > features_b.\"start_pos\" THEN 0\n",
+ " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n",
+ " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n",
+ " END)) <= 1000000\n",
+ " ORDER BY ABS(CASE\n",
+ " WHEN a.\"chromosome\" != features_b.\"chromosome\" THEN NULL\n",
+ " WHEN a.\"start_pos\" < features_b.\"end_pos\"\n",
+ " AND a.\"end_pos\" > features_b.\"start_pos\" THEN 0\n",
+ " WHEN a.\"end_pos\" <= features_b.\"start_pos\" THEN (features_b.\"start_pos\" - a.\"end_pos\")\n",
+ " ELSE (a.\"start_pos\" - features_b.\"end_pos\")\n",
+ " END)\n",
+ " LIMIT 5) AS b\n",
+ "WHERE a.chromosome = 'chr1'\n",
+ " AND a.start_pos < 500000\n",
+ "ORDER BY\n",
+ " a.chromosome,\n",
+ " a.start_pos,\n",
+ " distance\n",
+ "\n",
+ "================================================================================\n",
+ "\n",
+ "Result: 15 total rows (up to 5 neighbors within 1 Mb per feature in A)\n",
+ "Number of features from A: 3\n",
+ "Average neighbors per feature: 5.0\n",
+ "Max distance found: 731300 bp (should be <= 1000000)\n",
+ "Features with at least one neighbor: 3\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromosomea_starta_endb_startb_enddistance
0chr11813681815641866541869645090
1chr118136818156426797926812886415
2chr1181368181564850424850734668860
3chr1181368181564869711870021688147
4chr1181368181564912864913174731300
5chr11866501868461866541869640
6chr118665018684626797926812881133
7chr1186650186846850424850734663578
8chr1186650186846869711870021682865
9chr1186650186846912864913174726018
10chr12679092681052679792681280
11chr126790926810518665418696480945
12chr1267909268105850424850734582319
13chr1267909268105869711870021601606
14chr1267909268105912864913174644759
\n", + "
" + ], + "text/plain": [ + " chromosome a_start a_end b_start b_end distance\n", + "0 chr1 181368 181564 186654 186964 5090\n", + "1 chr1 181368 181564 267979 268128 86415\n", + "2 chr1 181368 181564 850424 850734 668860\n", + "3 chr1 181368 181564 869711 870021 688147\n", + "4 chr1 181368 181564 912864 913174 731300\n", + "5 chr1 186650 186846 186654 186964 0\n", + "6 chr1 186650 186846 267979 268128 81133\n", + "7 chr1 186650 186846 850424 850734 663578\n", + "8 chr1 186650 186846 869711 870021 682865\n", + "9 chr1 186650 186846 912864 913174 726018\n", + "10 chr1 267909 268105 267979 268128 0\n", + "11 chr1 267909 268105 186654 186964 80945\n", + "12 chr1 267909 268105 850424 850734 582319\n", + "13 chr1 267909 268105 869711 870021 601606\n", + "14 chr1 267909 268105 912864 913174 644759" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define GIQL query - find up to 5 nearest features within 50kb\n", + "# max_distance=1000000 filters out features farther than 50kb\n", + "giql_query = \"\"\"\n", + " SELECT \n", + " a.chromosome,\n", + " a.start_pos AS a_start,\n", + " a.end_pos AS a_end,\n", + " b.start_pos AS b_start,\n", + " b.end_pos AS b_end,\n", + " distance\n", + " FROM features_a a, NEAREST(features_b, k=5, max_distance=1000000) b\n", + " WHERE a.chromosome = 'chr1'\n", + " AND a.start_pos < 500000\n", + " ORDER BY a.chromosome, a.start_pos, distance\n", + "\"\"\"\n", + "\n", + "# Transpile to SQL\n", + "sql = engine.transpile(giql_query)\n", + "print(\"Transpiled SQL:\")\n", + "print(\n", + " sqlparse.format(\n", + " sql, reindent=True, keyword_case=\"upper\", indent_columns=True, indent_width=2\n", + " )\n", + ")\n", + "print(\"\\n\" + \"=\" * 80 + \"\\n\")\n", + "\n", + "# Execute with DuckDB via GIQL engine\n", + "cursor = engine.conn.execute(sql)\n", + "result = cursor.df() # Get result as pandas DataFrame\n", + "\n", + "# Analyze distance constraints\n", + "grouped = result.groupby([\"a_start\", \"a_end\"]).size()\n", + "print(\n", + " f\"Result: {len(result)} total rows (up to 5 neighbors within 50kb per feature in A)\"\n", + ")\n", + "print(f\"Number of features from A: {len(grouped)}\")\n", + "print(f\"Average neighbors per feature: {grouped.mean():.1f}\")\n", + "print(f\"Max distance found: {result['distance'].max()} bp (should be <= 50000)\")\n", + "print(f\"Features with at least one neighbor: {len(grouped)}\")\n", + "result.head(15)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Summary\n", + "\n", + "This demo showcased all 7 GIQL operators:\n", + "\n", + "1. **INTERSECTS**: Binary predicate for overlapping intervals\n", + "2. **WITHIN**: Binary predicate for containment (A within B)\n", + "3. **CONTAINS**: Binary predicate for containment (A contains B)\n", + "4. **MERGE**: Aggregation operator to combine overlapping intervals\n", + "5. **CLUSTER**: Aggregation operator to assign cluster IDs to overlapping intervals\n", + "6. **DISTANCE**: UDF operator to calculate genomic distances between intervals\n", + "7. **NEAREST**: Table-valued function for finding k-nearest genomic features\n", + "\n", + "Each operator was:\n", + "- Written in GIQL syntax\n", + "- Transpiled to standard SQL\n", + "- Executed using DuckDB\n", + "\n", + "This demonstrates how GIQL provides a high-level, intuitive syntax for genomic interval operations while maintaining compatibility with standard SQL engines through transpilation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Clean up\n", + "# engine.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "giql", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..2777a07 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,5 @@ +# Sphinx build outputs +_build/ + +# Generated API documentation +api/generated/ diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/api/index.rst b/docs/api/index.rst new file mode 100644 index 0000000..a17dc9e --- /dev/null +++ b/docs/api/index.rst @@ -0,0 +1,12 @@ +API Reference +============= + +This section documents the GIQL Python API. + +.. toctree:: + :maxdepth: 2 + +.. automodule:: giql + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..1d38676 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,73 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +import os +import sys + +sys.path.insert(0, os.path.abspath("../src")) + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "GIQL" +copyright = "2024, GIQL Contributors" +author = "GIQL Contributors" +release = "0.1.0" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", + "sphinx.ext.intersphinx", + "sphinx.ext.autosummary", +] + +# Napoleon settings +napoleon_google_docstring = False +napoleon_numpy_docstring = False +napoleon_include_init_with_doc = True +napoleon_include_private_with_doc = False +napoleon_include_special_with_doc = True +napoleon_use_admonition_for_examples = False +napoleon_use_admonition_for_notes = False +napoleon_use_admonition_for_references = False +napoleon_use_ivar = False +napoleon_use_param = True +napoleon_use_rtype = True +napoleon_preprocess_types = False +napoleon_type_aliases = None +napoleon_attr_annotations = True + +# Autodoc settings +autodoc_default_options = { + "members": True, + "member-order": "bysource", + "special-members": "__init__", + "undoc-members": True, + "exclude-members": "__weakref__", +} +autodoc_typehints = "description" +autodoc_typehints_description_target = "documented" + +# Autosummary settings +autosummary_generate = False + +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# Intersphinx mapping +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "pandas": ("https://pandas.pydata.org/docs/", None), +} + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "sphinx_rtd_theme" +# html_static_path = ['_static'] # Uncomment when you have custom static files diff --git a/docs/guides/index.rst b/docs/guides/index.rst new file mode 100644 index 0000000..c3265be --- /dev/null +++ b/docs/guides/index.rst @@ -0,0 +1,32 @@ +Guides +====== + +Task-oriented guides for working with GIQL. These guides cover common workflows +and best practices for using GIQL effectively. + +.. toctree:: + :maxdepth: 2 + + schema-mapping + multi-backend + performance + transpilation + +Guide Overview +-------------- + +:doc:`schema-mapping` + Learn how to configure GIQL to work with your genomic data, including + registering table schemas and mapping logical genomic columns. + +:doc:`multi-backend` + Understand GIQL's multi-database support and how to work with different + backends like DuckDB, SQLite, and PostgreSQL. + +:doc:`performance` + Optimize your GIQL queries for better performance with indexing strategies, + query patterns, and backend-specific tips. + +:doc:`transpilation` + Understand how GIQL translates queries to SQL, debug query generation, + and integrate transpiled SQL with external tools. 
diff --git a/docs/guides/multi-backend.rst b/docs/guides/multi-backend.rst new file mode 100644 index 0000000..ecc3799 --- /dev/null +++ b/docs/guides/multi-backend.rst @@ -0,0 +1,367 @@ +Multi-Backend Guide +=================== + +GIQL supports multiple database backends, allowing you to run the same genomic +queries against different database systems. This guide covers backend selection, +configuration, and backend-specific considerations. + +.. contents:: + :local: + :depth: 2 + +Supported Backends +------------------ + +GIQL currently supports the following database backends: + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Backend + - Status + - Best For + * - DuckDB + - Full Support + - Analytics, large datasets, in-memory processing + * - SQLite + - Full Support + - Lightweight, embedded, portable databases + * - PostgreSQL + - Planned + - Production deployments, shared databases + +Selecting a Backend +------------------- + +DuckDB (Recommended) +~~~~~~~~~~~~~~~~~~~~ + +DuckDB is the recommended backend for most use cases. It provides excellent +performance for analytical queries and handles large genomic datasets efficiently. + +.. code-block:: python + + from giql import GIQLEngine + + # In-memory DuckDB (default) + with GIQLEngine(target_dialect="duckdb") as engine: + engine.load_csv("features", "features.bed") + # ... register schemas and query + + # Persistent DuckDB database + with GIQLEngine(target_dialect="duckdb", db_path="my_data.duckdb") as engine: + # Data persists between sessions + pass + +**Advantages:** + +- Fast analytical query performance +- Efficient columnar storage +- Good support for large datasets +- Rich SQL feature set +- In-memory and persistent options + +**Best for:** + +- Interactive analysis +- Large BED/VCF files +- Complex aggregations +- One-time analysis pipelines + +SQLite +~~~~~~ + +SQLite is a lightweight, embedded database suitable for smaller datasets or +when portability is important. + +.. code-block:: python + + # In-memory SQLite + with GIQLEngine(target_dialect="sqlite") as engine: + pass + + # Persistent SQLite database + with GIQLEngine(target_dialect="sqlite", db_path="my_data.db") as engine: + pass + +**Advantages:** + +- Zero configuration +- Single-file database +- Widely compatible +- Small memory footprint + +**Best for:** + +- Small to medium datasets +- Portable analysis +- Embedded applications +- Simple workflows + +Backend Configuration +--------------------- + +In-Memory vs Persistent +~~~~~~~~~~~~~~~~~~~~~~~ + +Both DuckDB and SQLite support in-memory and persistent modes: + +.. code-block:: python + + # In-memory (data lost when engine closes) + with GIQLEngine(target_dialect="duckdb") as engine: + engine.load_csv("features", "features.bed") + # Data exists only during this session + + # Persistent (data saved to disk) + with GIQLEngine(target_dialect="duckdb", db_path="analysis.duckdb") as engine: + engine.load_csv("features", "features.bed") + # Data persists after engine closes + + # Reopen persistent database + with GIQLEngine(target_dialect="duckdb", db_path="analysis.duckdb") as engine: + # Previous data is available + cursor = engine.execute("SELECT * FROM features LIMIT 5") + +Connection Options +~~~~~~~~~~~~~~~~~~ + +Pass additional connection options to the underlying database: + +.. 
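code-block:: python
+
+    # Sketch: backend-specific settings can also be applied after
+    # construction through the raw connection. `PRAGMA threads` is a
+    # DuckDB-specific setting, used here purely as an illustration.
+    with GIQLEngine(target_dialect="duckdb") as engine:
+        engine.conn.execute("PRAGMA threads=4")
+
+Options can likewise be supplied when the engine is constructed:
+
+.. 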
code-block:: python + + # DuckDB with custom settings + with GIQLEngine( + target_dialect="duckdb", + db_path="analysis.duckdb", + read_only=False, + ) as engine: + pass + +Writing Portable Queries +------------------------ + +Query Compatibility +~~~~~~~~~~~~~~~~~~~ + +GIQL queries are portable across backends. The same query works on any +supported database: + +.. code-block:: python + + query = """ + SELECT a.*, b.name AS gene + FROM variants a + JOIN genes b ON a.interval INTERSECTS b.interval + WHERE a.quality >= 30 + """ + + # Works on DuckDB + with GIQLEngine(target_dialect="duckdb") as engine: + # ... setup ... + cursor = engine.execute(query) + + # Same query works on SQLite + with GIQLEngine(target_dialect="sqlite") as engine: + # ... setup ... + cursor = engine.execute(query) + +SQL Dialect Differences +~~~~~~~~~~~~~~~~~~~~~~~ + +While GIQL queries are portable, the generated SQL differs between backends. +Use ``transpile()`` to see the backend-specific SQL: + +.. code-block:: python + + query = "SELECT * FROM features WHERE interval INTERSECTS 'chr1:1000-2000'" + + # DuckDB SQL + with GIQLEngine(target_dialect="duckdb") as engine: + engine.register_table_schema("features", {...}, genomic_column="interval") + print(engine.transpile(query)) + + # SQLite SQL (may differ slightly) + with GIQLEngine(target_dialect="sqlite") as engine: + engine.register_table_schema("features", {...}, genomic_column="interval") + print(engine.transpile(query)) + +Backend-Specific Features +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some SQL features may only be available on certain backends: + +.. list-table:: + :header-rows: 1 + :widths: 40 20 20 20 + + * - Feature + - DuckDB + - SQLite + - Notes + * - Window functions + - Yes + - Yes + - Full support + * - CTEs (WITH clause) + - Yes + - Yes + - Full support + * - LATERAL joins + - Yes + - Limited + - Used by NEAREST + * - STRING_AGG + - Yes + - GROUP_CONCAT + - Different function names + +Migrating Between Backends +-------------------------- + +Exporting Data +~~~~~~~~~~~~~~ + +Export data from one backend for import into another: + +.. code-block:: python + + # Export from DuckDB + with GIQLEngine(target_dialect="duckdb", db_path="source.duckdb") as engine: + cursor = engine.execute("SELECT * FROM features") + import pandas as pd + df = pd.DataFrame(cursor.fetchall(), + columns=[desc[0] for desc in cursor.description]) + df.to_csv("features_export.csv", index=False) + + # Import to SQLite + with GIQLEngine(target_dialect="sqlite", db_path="target.db") as engine: + engine.load_csv("features", "features_export.csv") + engine.register_table_schema("features", {...}, genomic_column="interval") + +Schema Compatibility +~~~~~~~~~~~~~~~~~~~~ + +Ensure schema definitions work across backends: + +.. code-block:: python + + # Use portable type names + schema = { + "chromosome": "VARCHAR", # Works on all backends + "start_pos": "BIGINT", # Maps to appropriate integer type + "end_pos": "BIGINT", + "name": "VARCHAR", + "score": "FLOAT", # Maps to appropriate float type + } + + # Same schema works on both backends + for dialect in ["duckdb", "sqlite"]: + with GIQLEngine(target_dialect=dialect) as engine: + engine.load_csv("features", "features.csv") + engine.register_table_schema("features", schema, genomic_column="interval") + +Performance Comparison +---------------------- + +Backend Performance Characteristics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
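code-block:: python
+
+    # A rough timing harness (sketch): run the same GIQL query against both
+    # backends and compare wall-clock time. Setup is elided; any table
+    # registered as shown earlier in this guide will do.
+    import time
+
+    query = "SELECT COUNT(*) FROM features WHERE interval INTERSECTS 'chr1:0-1000000'"
+
+    for dialect in ["duckdb", "sqlite"]:
+        with GIQLEngine(target_dialect=dialect) as engine:
+            ...  # load data and register the schema for `features`
+            start = time.time()
+            engine.execute(query).fetchall()
+            print(f"{dialect}: {time.time() - start:.3f}s")
+
+Broad characteristics of each backend:
+
+.. 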
list-table:: + :header-rows: 1 + :widths: 30 35 35 + + * - Operation + - DuckDB + - SQLite + * - Large table scans + - Excellent (columnar) + - Good + * - Complex joins + - Excellent + - Good + * - Aggregations + - Excellent + - Good + * - Small queries + - Good + - Excellent + * - Memory usage + - Higher + - Lower + * - Startup time + - Faster + - Fast + +Choosing the Right Backend +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Choose DuckDB when:** + +- Working with large datasets (millions of features) +- Running complex analytical queries +- Performing heavy aggregations +- Memory is not constrained + +**Choose SQLite when:** + +- Working with smaller datasets +- Need maximum portability +- Memory is constrained +- Simple query patterns + +Using External Connections +-------------------------- + +Connecting to Existing Databases +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Connect to databases created outside of GIQL: + +.. code-block:: python + + # Connect to existing DuckDB database + with GIQLEngine(target_dialect="duckdb", db_path="existing.duckdb") as engine: + # Register schemas for existing tables + engine.register_table_schema( + "my_existing_table", + { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "name": "VARCHAR", + }, + genomic_column="interval", + ) + + # Query existing data with GIQL operators + cursor = engine.execute(""" + SELECT * FROM my_existing_table + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + +Using Transpiled SQL Externally +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Generate SQL for use with external database connections: + +.. code-block:: python + + import duckdb + + # Get transpiled SQL from GIQL + with GIQLEngine(target_dialect="duckdb") as engine: + engine.register_table_schema("features", {...}, genomic_column="interval") + sql = engine.transpile(""" + SELECT * FROM features + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + + # Execute with external connection + conn = duckdb.connect("my_database.duckdb") + result = conn.execute(sql).fetchall() + conn.close() + +This is useful when integrating GIQL with existing database workflows or +when you need more control over the database connection. diff --git a/docs/guides/performance.rst b/docs/guides/performance.rst new file mode 100644 index 0000000..c0c4e51 --- /dev/null +++ b/docs/guides/performance.rst @@ -0,0 +1,414 @@ +Performance Guide +================= + +This guide covers strategies for optimizing GIQL query performance, including +indexing, query patterns, and backend-specific optimizations. + +.. contents:: + :local: + :depth: 2 + +Understanding Query Performance +------------------------------- + +How GIQL Queries Execute +~~~~~~~~~~~~~~~~~~~~~~~~ + +When you execute a GIQL query: + +1. GIQL parses the query and identifies genomic operators +2. Operators are expanded into standard SQL predicates +3. The SQL is sent to the database backend +4. The database executes the query using its optimizer + +Performance depends on both the generated SQL and how the database executes it. + +Common Performance Bottlenecks +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- **Full table scans**: No indexes to speed up filtering +- **Cartesian products**: Large cross joins without early filtering +- **Missing chromosome filters**: Comparing features across all chromosomes +- **Inefficient join order**: Small tables should drive joins + +Indexing Strategies +------------------- + +Creating Indexes +~~~~~~~~~~~~~~~~ + +Create indexes on genomic columns for faster queries: + +.. 
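code-block:: python
+
+    # Sketch: a small helper that applies the composite position index GIQL
+    # queries benefit from. The helper and index names are illustrative,
+    # not part of the GIQL API.
+    def create_position_index(engine, table: str) -> None:
+        engine.conn.execute(
+            f"CREATE INDEX idx_{table}_position "
+            f"ON {table} (chromosome, start_pos, end_pos)"
+        )
+
+    create_position_index(engine, "features")
+
+The equivalent statements issued directly on each backend:
+
+.. 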
code-block:: python + + # DuckDB + engine.conn.execute(""" + CREATE INDEX idx_features_position + ON features (chromosome, start_pos, end_pos) + """) + + # SQLite + engine.conn.execute(""" + CREATE INDEX idx_features_position + ON features (chromosome, start_pos, end_pos) + """) + +Recommended Index Patterns +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**For single-table queries (filtering):** + +.. code-block:: sql + + CREATE INDEX idx_table_position ON table_name (chromosome, start_pos, end_pos) + +**For join queries:** + +.. code-block:: sql + + -- Index both tables involved in joins + CREATE INDEX idx_variants_position ON variants (chromosome, start_pos, end_pos) + CREATE INDEX idx_genes_position ON genes (chromosome, start_pos, end_pos) + +**For strand-specific queries:** + +.. code-block:: sql + + CREATE INDEX idx_features_strand ON features (chromosome, strand, start_pos, end_pos) + +When to Create Indexes +~~~~~~~~~~~~~~~~~~~~~~ + +Create indexes when: + +- Tables have more than ~10,000 rows +- You're running repeated queries on the same tables +- Join queries are slow +- Filtering by genomic position is common + +Skip indexes when: + +- Tables are small +- You're doing one-time analysis +- Full table scans are acceptable + +Query Optimization Patterns +--------------------------- + +Pre-filter by Chromosome +~~~~~~~~~~~~~~~~~~~~~~~~ + +Always include chromosome filtering when joining tables: + +.. code-block:: python + + # Good: Explicit chromosome filter + cursor = engine.execute(""" + SELECT a.*, b.name + FROM features_a a + JOIN features_b b ON a.interval INTERSECTS b.interval + WHERE a.chromosome = 'chr1' + """) + + # Also good: Cross-chromosome join with implicit filtering + # GIQL handles this, but explicit is clearer + cursor = engine.execute(""" + SELECT a.*, b.name + FROM features_a a + JOIN features_b b ON a.interval INTERSECTS b.interval + AND a.chromosome = b.chromosome + """) + +Use Selective Filters Early +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Apply selective filters before joins: + +.. code-block:: python + + # Good: Filter before joining + cursor = engine.execute(""" + WITH filtered_variants AS ( + SELECT * FROM variants + WHERE quality >= 30 AND filter = 'PASS' + ) + SELECT f.*, g.name + FROM filtered_variants f + JOIN genes g ON f.interval INTERSECTS g.interval + """) + + # Less efficient: Filter after joining + cursor = engine.execute(""" + SELECT v.*, g.name + FROM variants v + JOIN genes g ON v.interval INTERSECTS g.interval + WHERE v.quality >= 30 AND v.filter = 'PASS' + """) + +Limit Result Sets +~~~~~~~~~~~~~~~~~ + +Use LIMIT for exploratory queries: + +.. code-block:: python + + # Good: Limit results during exploration + cursor = engine.execute(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000000-2000000' + LIMIT 100 + """) + +Use DISTINCT Wisely +~~~~~~~~~~~~~~~~~~~ + +DISTINCT can be expensive. Only use when necessary: + +.. code-block:: python + + # Only use DISTINCT when you actually need unique rows + cursor = engine.execute(""" + SELECT DISTINCT a.* + FROM features_a a + JOIN features_b b ON a.interval INTERSECTS b.interval + """) + + # If you just need to check existence, use EXISTS instead + cursor = engine.execute(""" + SELECT a.* + FROM features_a a + WHERE EXISTS ( + SELECT 1 FROM features_b b + WHERE a.interval INTERSECTS b.interval + ) + """) + +NEAREST Query Optimization +-------------------------- + +Optimizing K-NN Queries +~~~~~~~~~~~~~~~~~~~~~~~ + +The NEAREST operator can be expensive for large datasets. Optimize with: + +**1. 
Use max_distance to limit search space:** + +.. code-block:: python + + # Good: Constrained search + cursor = engine.execute(""" + SELECT peaks.name, nearest.name, nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=100000 -- Only search within 100kb + ) AS nearest + """) + +**2. Request only the k you need:** + +.. code-block:: python + + # Good: Request exactly what you need + NEAREST(genes, reference=peaks.interval, k=3) + + # Wasteful: Request more than needed + NEAREST(genes, reference=peaks.interval, k=100) + +**3. Index the target table:** + +.. code-block:: sql + + CREATE INDEX idx_genes_position ON genes (chromosome, start_pos, end_pos) + +Merge and Cluster Optimization +------------------------------ + +Efficient Clustering +~~~~~~~~~~~~~~~~~~~~ + +For large datasets, consider pre-sorting: + +.. code-block:: python + + # Pre-sort data for clustering + cursor = engine.execute(""" + WITH sorted AS ( + SELECT * FROM features + ORDER BY chromosome, start_pos + ) + SELECT *, CLUSTER(interval) AS cluster_id + FROM sorted + """) + +Efficient Merging +~~~~~~~~~~~~~~~~~ + +Filter before merging to reduce data volume: + +.. code-block:: python + + # Good: Filter first, then merge + cursor = engine.execute(""" + WITH filtered AS ( + SELECT * FROM features + WHERE score >= 10 + ) + SELECT MERGE(interval), COUNT(*) AS count + FROM filtered + """) + +Analyzing Query Performance +--------------------------- + +Using EXPLAIN +~~~~~~~~~~~~~ + +Analyze query execution plans: + +.. code-block:: python + + # Get the transpiled SQL + sql = engine.transpile(""" + SELECT a.*, b.name + FROM variants a + JOIN genes b ON a.interval INTERSECTS b.interval + """) + + # Analyze the execution plan + cursor = engine.execute(f"EXPLAIN {sql}") + for row in cursor: + print(row) + + # DuckDB also supports EXPLAIN ANALYZE for actual timing + cursor = engine.execute(f"EXPLAIN ANALYZE {sql}") + +Timing Queries +~~~~~~~~~~~~~~ + +Measure query execution time: + +.. code-block:: python + + import time + + start = time.time() + cursor = engine.execute(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000000-2000000' + """) + results = cursor.fetchall() + elapsed = time.time() - start + + print(f"Query returned {len(results)} rows in {elapsed:.2f} seconds") + +Backend-Specific Tips +--------------------- + +DuckDB Optimizations +~~~~~~~~~~~~~~~~~~~~ + +**Use columnar strengths:** + +DuckDB is columnar, so queries that select few columns are faster: + +.. code-block:: python + + # Faster: Select only needed columns + cursor = engine.execute(""" + SELECT chromosome, start_pos, end_pos, name + FROM features + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + + # Slower: Select all columns + cursor = engine.execute(""" + SELECT * + FROM features + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + +**Parallel execution:** + +DuckDB automatically parallelizes queries. For very large datasets, +ensure you're not limiting parallelism. + +SQLite Optimizations +~~~~~~~~~~~~~~~~~~~~ + +**Use covering indexes:** + +.. code-block:: sql + + -- Include commonly selected columns in the index + CREATE INDEX idx_features_covering + ON features (chromosome, start_pos, end_pos, name, score) + +**Analyze tables:** + +.. code-block:: python + + # Help SQLite's query planner + engine.conn.execute("ANALYZE features") + +Memory Management +----------------- + +Streaming Results +~~~~~~~~~~~~~~~~~ + +For large result sets, iterate instead of fetching all: + +.. 
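code-block:: python
+
+    # Sketch: fetch in fixed-size batches, assuming the cursor returned by
+    # execute() supports the DB-API style fetchmany(); the batch size is
+    # arbitrary.
+    cursor = engine.execute("SELECT * FROM large_table")
+    while True:
+        batch = cursor.fetchmany(10_000)
+        if not batch:
+            break
+        for row in batch:
+            process(row)
+
+Row-by-row iteration is simpler still:
+
+.. 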
code-block:: python + + # Good: Stream results + cursor = engine.execute("SELECT * FROM large_table") + for row in cursor: + process(row) + + # Memory-intensive: Fetch all at once + cursor = engine.execute("SELECT * FROM large_table") + all_rows = cursor.fetchall() # Loads everything into memory + +Batch Processing +~~~~~~~~~~~~~~~~ + +Process large datasets in batches: + +.. code-block:: python + + chromosomes = ['chr1', 'chr2', 'chr3', ...] # All chromosomes + + for chrom in chromosomes: + cursor = engine.execute(f""" + SELECT * FROM features + WHERE chromosome = '{chrom}' + AND interval INTERSECTS '{chrom}:1-1000000' + """) + process_chromosome(cursor) + +Performance Checklist +--------------------- + +Before running large queries, check: + +.. code-block:: text + + □ Indexes created on genomic columns + □ Chromosome filtering included in joins + □ Selective filters applied early + □ LIMIT used for exploration + □ Only necessary columns selected + □ NEAREST queries use max_distance + □ Results streamed instead of fetched all at once + +Quick Wins +~~~~~~~~~~ + +1. **Add indexes** - Usually the biggest performance improvement +2. **Filter by chromosome** - Reduces join complexity significantly +3. **Use max_distance with NEAREST** - Limits search space +4. **Stream results** - Reduces memory pressure +5. **Use DuckDB** - Generally faster for analytical queries diff --git a/docs/guides/schema-mapping.rst b/docs/guides/schema-mapping.rst new file mode 100644 index 0000000..f515695 --- /dev/null +++ b/docs/guides/schema-mapping.rst @@ -0,0 +1,445 @@ +Schema Mapping Guide +==================== + +This guide explains how to configure GIQL to work with your genomic data by +registering table schemas and mapping logical genomic columns. + +.. contents:: + :local: + :depth: 2 + +Understanding Schema Mapping +---------------------------- + +GIQL needs to know how your genomic data is structured in order to translate +genomic operators into SQL. This is done through schema registration, which +maps a logical "genomic column" (used in your queries) to the physical columns +in your database tables. + +The Core Concept +~~~~~~~~~~~~~~~~ + +In GIQL queries, you use a logical genomic column name like ``interval``: + +.. code-block:: sql + + SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000' + +Behind the scenes, GIQL expands this to actual column comparisons: + +.. code-block:: sql + + SELECT * FROM variants + WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000 + +Schema registration tells GIQL which physical columns (``chromosome``, +``start_pos``, ``end_pos``) correspond to the logical ``interval`` column. + +Registering Table Schemas +------------------------- + +Basic Registration +~~~~~~~~~~~~~~~~~~ + +Register a table schema using ``register_table_schema()``: + +.. 
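code-block:: python
+
+    # The general shape of the call; the argument values here are
+    # placeholders, not a working configuration.
+    engine.register_table_schema(
+        "table_name",                  # table to describe
+        {"column_name": "SQL_TYPE"},   # physical columns and their types
+        genomic_column="interval",     # logical column used in GIQL queries
+    )
+
+A complete example:
+
+.. 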
code-block:: python + + from giql import GIQLEngine + + with GIQLEngine(target_dialect="duckdb") as engine: + # Load data + engine.load_csv("variants", "variants.csv") + + # Register schema + engine.register_table_schema( + "variants", # Table name + { + "id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "name": "VARCHAR", + "quality": "FLOAT", + }, + genomic_column="interval", # Logical column name for queries + ) + + # Now you can use 'interval' in queries + cursor = engine.execute(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + +Required Columns +~~~~~~~~~~~~~~~~ + +For schema registration, your table must have columns that map to: + +- **chromosome**: The chromosome/contig identifier (e.g., 'chr1', 'chrX') +- **start_pos**: The start position of the genomic interval (0-based, inclusive) +- **end_pos**: The end position of the genomic interval (0-based, exclusive) + +GIQL looks for these column names by default. If your columns have different +names, see :ref:`custom-column-names`. + +Optional Strand Column +~~~~~~~~~~~~~~~~~~~~~~ + +If your data includes strand information, include it in the schema: + +.. code-block:: python + + engine.register_table_schema( + "features", + { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "strand": "VARCHAR", # '+', '-', or '.' + "name": "VARCHAR", + }, + genomic_column="interval", + ) + +The strand column enables strand-specific operations in operators like +CLUSTER and NEAREST. + +.. _custom-column-names: + +Custom Column Names +~~~~~~~~~~~~~~~~~~~ + +If your table uses different column names for genomic coordinates, specify +the mapping explicitly: + +.. code-block:: python + + engine.register_table_schema( + "my_table", + { + "chrom": "VARCHAR", # Your chromosome column + "chromStart": "BIGINT", # Your start column (UCSC-style) + "chromEnd": "BIGINT", # Your end column + "name": "VARCHAR", + }, + genomic_column="interval", + chromosome_column="chrom", # Map to your column name + start_column="chromStart", # Map to your column name + end_column="chromEnd", # Map to your column name + ) + +Multiple Tables +--------------- + +Register Multiple Tables +~~~~~~~~~~~~~~~~~~~~~~~~ + +Register all tables that will participate in genomic queries: + +.. code-block:: python + + with GIQLEngine(target_dialect="duckdb") as engine: + # Load data files + engine.load_csv("variants", "variants.bed") + engine.load_csv("genes", "genes.bed") + engine.load_csv("regulatory", "regulatory.bed") + + # Define common schema + bed_schema = { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "name": "VARCHAR", + "score": "FLOAT", + "strand": "VARCHAR", + } + + # Register each table + for table in ["variants", "genes", "regulatory"]: + engine.register_table_schema( + table, + bed_schema, + genomic_column="interval", + ) + + # Now you can join tables using genomic operators + cursor = engine.execute(""" + SELECT v.*, g.name AS gene_name + FROM variants v + JOIN genes g ON v.interval INTERSECTS g.interval + """) + +Different Schemas Per Table +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tables can have different schemas and even different genomic column names: + +.. 
code-block:: python + + # Variants table with VCF-style columns + engine.register_table_schema( + "variants", + { + "CHROM": "VARCHAR", + "POS": "BIGINT", + "END": "BIGINT", + "ID": "VARCHAR", + "QUAL": "FLOAT", + }, + genomic_column="var_interval", + chromosome_column="CHROM", + start_column="POS", + end_column="END", + ) + + # Genes table with BED-style columns + engine.register_table_schema( + "genes", + { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "gene_name": "VARCHAR", + "strand": "VARCHAR", + }, + genomic_column="gene_interval", + ) + + # Query using different genomic column names + cursor = engine.execute(""" + SELECT v.ID, g.gene_name + FROM variants v + JOIN genes g ON v.var_interval INTERSECTS g.gene_interval + """) + +Coordinate Systems +------------------ + +Understanding BED Coordinates +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +GIQL uses the BED coordinate convention: + +- **0-based start**: The first base of a chromosome is position 0 +- **Half-open intervals**: Start is inclusive, end is exclusive +- **Interval [start, end)**: Contains positions from start to end-1 + +Example: An interval ``chr1:100-200`` covers bases 100 through 199 (100 bases total). + +Converting from 1-Based Coordinates +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If your data uses 1-based coordinates (like VCF or GFF), convert when loading: + +.. code-block:: python + + import pandas as pd + + # Load 1-based data + df = pd.read_csv("variants.vcf", sep="\t") + + # Convert to 0-based + df['start_pos'] = df['POS'] - 1 # Convert 1-based to 0-based + df['end_pos'] = df['POS'] # For SNPs, end = start + 1 + + # Load into engine + engine.conn.execute("CREATE TABLE variants AS SELECT * FROM df") + + # Register schema + engine.register_table_schema( + "variants", + { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + # ... other columns + }, + genomic_column="interval", + ) + +Working with Point Features +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For point features (like SNPs), create an interval of length 1: + +.. code-block:: python + + # For a SNP at position 1000 (1-based) + # 0-based interval: [999, 1000) + start_pos = 999 + end_pos = 1000 + +Data Types +---------- + +Recommended Column Types +~~~~~~~~~~~~~~~~~~~~~~~~ + +For optimal performance, use appropriate data types: + +.. list-table:: + :header-rows: 1 + :widths: 25 25 50 + + * - Column + - Recommended Type + - Notes + * - chromosome + - VARCHAR + - String type for chromosome names + * - start_pos + - BIGINT + - 64-bit integer for large genomes + * - end_pos + - BIGINT + - 64-bit integer for large genomes + * - strand + - VARCHAR(1) or CHAR(1) + - Single character: '+', '-', '.' + * - score + - FLOAT or DOUBLE + - Numeric scores + * - name + - VARCHAR + - Feature identifiers + +Type Compatibility +~~~~~~~~~~~~~~~~~~ + +GIQL schemas use SQL type names. Common mappings: + +.. list-table:: + :header-rows: 1 + :widths: 30 35 35 + + * - GIQL Schema Type + - DuckDB Type + - SQLite Type + * - INTEGER + - INTEGER + - INTEGER + * - BIGINT + - BIGINT + - INTEGER + * - VARCHAR + - VARCHAR + - TEXT + * - FLOAT + - FLOAT + - REAL + * - DOUBLE + - DOUBLE + - REAL + +Loading Data +------------ + +From CSV Files +~~~~~~~~~~~~~~ + +Load CSV files directly: + +.. code-block:: python + + engine.load_csv("features", "features.csv") + + # With custom options + engine.load_csv( + "features", + "features.tsv", + delimiter="\t", + header=True, + ) + +From Pandas DataFrames +~~~~~~~~~~~~~~~~~~~~~~ + +Load data from pandas: + +.. 
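code-block:: python
+
+    # Sketch (DuckDB backend): explicitly register the DataFrame as a view
+    # first. `register()` is part of DuckDB's Python API, so this assumes
+    # `engine.conn` is a DuckDB connection.
+    import pandas as pd
+
+    df = pd.read_csv("features.bed", sep="\t", header=None,
+                     names=["chromosome", "start_pos", "end_pos", "name"])
+    engine.conn.register("df_view", df)
+    engine.conn.execute("CREATE TABLE features AS SELECT * FROM df_view")
+
+DuckDB can also resolve a local DataFrame by its variable name:
+
+.. 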
code-block:: python + + import pandas as pd + + df = pd.read_csv("features.bed", sep="\t", header=None, + names=["chromosome", "start_pos", "end_pos", "name"]) + + # Register the DataFrame as a table + engine.conn.execute("CREATE TABLE features AS SELECT * FROM df") + + # Then register the schema + engine.register_table_schema( + "features", + { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "name": "VARCHAR", + }, + genomic_column="interval", + ) + +From Existing Database Tables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If tables already exist in your database, just register their schemas: + +.. code-block:: python + + # Connect to existing database + with GIQLEngine(target_dialect="duckdb", db_path="my_database.duckdb") as engine: + # Register schemas for existing tables + engine.register_table_schema( + "existing_table", + { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "name": "VARCHAR", + }, + genomic_column="interval", + ) + + # Query existing data + cursor = engine.execute(""" + SELECT * FROM existing_table + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + +Troubleshooting +--------------- + +Common Issues +~~~~~~~~~~~~~ + +**"Unknown column" errors:** + +- Ensure the table schema is registered before querying +- Check that the genomic column name in your query matches the registered name +- Verify column names in the schema match actual table columns + +**Incorrect results:** + +- Verify your coordinate system (0-based vs 1-based) +- Check that start_pos < end_pos for all intervals +- Ensure chromosome names match between tables (e.g., 'chr1' vs '1') + +**Performance issues:** + +- See the :doc:`performance` guide for optimization tips +- Consider adding indexes on genomic columns + +Verifying Schema Registration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Check that schemas are registered correctly: + +.. code-block:: python + + # After registration, test with a simple query + sql = engine.transpile(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + print(sql) + # Should show expanded SQL with chromosome, start_pos, end_pos comparisons diff --git a/docs/guides/transpilation.rst b/docs/guides/transpilation.rst new file mode 100644 index 0000000..bd4c24a --- /dev/null +++ b/docs/guides/transpilation.rst @@ -0,0 +1,417 @@ +Transpilation Guide +=================== + +GIQL works by transpiling genomic queries into standard SQL. This guide explains +how transpilation works, how to debug query generation, and how to use transpiled +SQL with external tools. + +.. contents:: + :local: + :depth: 2 + +How Transpilation Works +----------------------- + +The Transpilation Process +~~~~~~~~~~~~~~~~~~~~~~~~~ + +When you write a GIQL query: + +.. code-block:: sql + + SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000' + +GIQL performs these steps: + +1. **Parse**: Parse the SQL to identify GIQL-specific operators +2. **Expand**: Replace genomic operators with standard SQL predicates +3. **Generate**: Produce SQL for the target database dialect + +The result is standard SQL: + +.. code-block:: sql + + SELECT * FROM variants + WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000 + +Operator Expansion +~~~~~~~~~~~~~~~~~~ + +Each GIQL operator expands to specific SQL patterns: + +**INTERSECTS** expands to range overlap checks: + +.. 
code-block:: sql + + -- GIQL + a.interval INTERSECTS b.interval + + -- SQL (same chromosome, overlapping ranges) + a.chromosome = b.chromosome + AND a.start_pos < b.end_pos + AND a.end_pos > b.start_pos + +**CONTAINS** expands to containment checks: + +.. code-block:: sql + + -- GIQL + a.interval CONTAINS b.interval + + -- SQL + a.chromosome = b.chromosome + AND a.start_pos <= b.start_pos + AND a.end_pos >= b.end_pos + +**DISTANCE** expands to gap calculations: + +.. code-block:: sql + + -- GIQL + DISTANCE(a.interval, b.interval) + + -- SQL (simplified) + CASE + WHEN a.chromosome != b.chromosome THEN NULL + WHEN a.end_pos <= b.start_pos THEN b.start_pos - a.end_pos + WHEN b.end_pos <= a.start_pos THEN a.start_pos - b.end_pos + ELSE 0 + END + +Using the Transpile Method +-------------------------- + +Basic Transpilation +~~~~~~~~~~~~~~~~~~~ + +Use ``transpile()`` to see generated SQL without executing: + +.. code-block:: python + + from giql import GIQLEngine + + with GIQLEngine(target_dialect="duckdb") as engine: + engine.register_table_schema( + "variants", + { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + }, + genomic_column="interval", + ) + + sql = engine.transpile(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + + print(sql) + # Output: SELECT * FROM variants + # WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000 + +Transpiling Complex Queries +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Transpilation works with all GIQL features: + +.. code-block:: python + + # Join query + sql = engine.transpile(""" + SELECT v.*, g.name AS gene_name + FROM variants v + JOIN genes g ON v.interval INTERSECTS g.interval + WHERE v.quality >= 30 + """) + print(sql) + + # NEAREST query + sql = engine.transpile(""" + SELECT peaks.name, nearest.name, nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=5) AS nearest + """) + print(sql) + + # Aggregation query + sql = engine.transpile(""" + SELECT MERGE(interval), COUNT(*) AS count + FROM features + """) + print(sql) + +Debugging with Transpilation +---------------------------- + +Understanding Query Expansion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use transpilation to understand what GIQL does: + +.. code-block:: python + + # See how ANY quantifier expands + sql = engine.transpile(""" + SELECT * FROM variants + WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000') + """) + print(sql) + # Shows the OR conditions for each range + + # See how join conditions expand + sql = engine.transpile(""" + SELECT a.*, b.name + FROM features_a a + JOIN features_b b ON a.interval INTERSECTS b.interval + """) + print(sql) + # Shows the full range comparison predicates + +Verbose Mode +~~~~~~~~~~~~ + +Enable verbose mode for detailed transpilation information: + +.. code-block:: python + + with GIQLEngine(target_dialect="duckdb", verbose=True) as engine: + engine.register_table_schema("variants", {...}, genomic_column="interval") + + # Transpilation will print detailed information + sql = engine.transpile(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + + # Execution also shows transpilation details + cursor = engine.execute(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + +Troubleshooting Transpilation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Query not expanding correctly:** + +.. 
code-block:: python + + # Check that schema is registered + sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'") + if "interval INTERSECTS" in sql: + print("Schema not registered for 'variants' table") + +**Wrong column names in output:** + +.. code-block:: python + + # Verify column mapping + engine.register_table_schema( + "variants", + {...}, + genomic_column="interval", + chromosome_column="chrom", # Check these match your table + start_column="start", + end_column="end", + ) + +Comparing Dialects +------------------ + +Same Query, Different SQL +~~~~~~~~~~~~~~~~~~~~~~~~~ + +See how the same query translates for different backends: + +.. code-block:: python + + query = """ + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """ + + schema = { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + } + + # DuckDB + with GIQLEngine(target_dialect="duckdb") as engine: + engine.register_table_schema("variants", schema, genomic_column="interval") + print("DuckDB SQL:") + print(engine.transpile(query)) + print() + + # SQLite + with GIQLEngine(target_dialect="sqlite") as engine: + engine.register_table_schema("variants", schema, genomic_column="interval") + print("SQLite SQL:") + print(engine.transpile(query)) + +Dialect-Specific Differences +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some queries may generate different SQL for different dialects: + +- String functions may use different names +- Type casting syntax may vary +- Window function support may differ + +GIQL handles these differences automatically, but understanding them helps +when debugging or integrating with external tools. + +Using Transpiled SQL Externally +------------------------------- + +With External Database Connections +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use transpiled SQL with your own database connections: + +.. code-block:: python + + import duckdb + + # Generate SQL using GIQL + with GIQLEngine(target_dialect="duckdb") as giql_engine: + giql_engine.register_table_schema("variants", {...}, genomic_column="interval") + sql = giql_engine.transpile(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + + # Execute with external connection + conn = duckdb.connect("my_database.duckdb") + result = conn.execute(sql).fetchall() + conn.close() + +With ORMs and Query Builders +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Integrate transpiled SQL with SQLAlchemy or other ORMs: + +.. code-block:: python + + from sqlalchemy import create_engine, text + + # Generate SQL + with GIQLEngine(target_dialect="duckdb") as giql_engine: + giql_engine.register_table_schema("variants", {...}, genomic_column="interval") + sql = giql_engine.transpile(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + + # Execute with SQLAlchemy + sa_engine = create_engine("duckdb:///my_database.duckdb") + with sa_engine.connect() as conn: + result = conn.execute(text(sql)) + for row in result: + print(row) + +Building SQL Pipelines +~~~~~~~~~~~~~~~~~~~~~~ + +Use transpilation in data pipelines: + +.. 
code-block:: python + + def build_intersection_query(table_a, table_b, region): + """Generate SQL for intersection query.""" + with GIQLEngine(target_dialect="duckdb") as engine: + engine.register_table_schema(table_a, {...}, genomic_column="interval") + engine.register_table_schema(table_b, {...}, genomic_column="interval") + + return engine.transpile(f""" + SELECT a.*, b.name + FROM {table_a} a + JOIN {table_b} b ON a.interval INTERSECTS b.interval + WHERE a.interval INTERSECTS '{region}' + """) + + # Use in pipeline + sql = build_intersection_query("variants", "genes", "chr1:1000000-2000000") + # Execute sql with your preferred method + +Saving Queries +~~~~~~~~~~~~~~ + +Save transpiled SQL for documentation or reuse: + +.. code-block:: python + + # Generate and save SQL + with GIQLEngine(target_dialect="duckdb") as engine: + engine.register_table_schema("variants", {...}, genomic_column="interval") + + sql = engine.transpile(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + + with open("query.sql", "w") as f: + f.write(sql) + + # Later, execute saved SQL + with open("query.sql") as f: + sql = f.read() + + conn = duckdb.connect("database.duckdb") + result = conn.execute(sql).fetchall() + +Advanced Transpilation +---------------------- + +Parameterized Queries +~~~~~~~~~~~~~~~~~~~~~ + +Build queries with parameters: + +.. code-block:: python + + def query_region(engine, chrom, start, end): + """Query a parameterized region.""" + region = f"{chrom}:{start}-{end}" + return engine.execute(f""" + SELECT * FROM variants + WHERE interval INTERSECTS '{region}' + """) + + # Use with different regions + cursor = query_region(engine, "chr1", 1000000, 2000000) + cursor = query_region(engine, "chr2", 5000000, 6000000) + +Dynamic Query Building +~~~~~~~~~~~~~~~~~~~~~~ + +Build queries programmatically: + +.. code-block:: python + + def build_multi_table_query(tables, target_region): + """Build a query that unions results from multiple tables.""" + union_parts = [] + for table in tables: + union_parts.append(f""" + SELECT *, '{table}' AS source FROM {table} + WHERE interval INTERSECTS '{target_region}' + """) + + query = " UNION ALL ".join(union_parts) + return engine.transpile(query) + +Inspecting the AST +~~~~~~~~~~~~~~~~~~ + +For advanced debugging, you can inspect the parsed query: + +.. code-block:: python + + # GIQL uses sqlglot internally + # The transpiled SQL shows the final result + sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'") + + # For deep debugging, examine the generated SQL structure + print(sql) diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..c00a0be --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,147 @@ +GIQL - Genomic Interval Query Language +====================================== + +**GIQL** is a SQL dialect for genomic range queries with multi-database support. + +GIQL extends SQL with spatial operators for genomic interval queries. It transpiles +to standard SQL that works across multiple database backends including DuckDB and SQLite. + +.. toctree:: + :maxdepth: 2 + :caption: Getting Started + + quickstart + +.. toctree:: + :maxdepth: 2 + :caption: Operator Reference + + operators/index + +.. toctree:: + :maxdepth: 2 + :caption: Guides + + guides/index + +.. toctree:: + :maxdepth: 2 + :caption: Recipes + + recipes/index + +.. 
toctree:: + :maxdepth: 2 + :caption: Reference + + reference/operator-matrix + reference/syntax-reference + reference/changelog + api/index + +Quick Start +----------- + +Install GIQL: + +.. code-block:: bash + + pip install giql + +Basic usage: + +.. code-block:: python + + from giql import GIQLEngine + + # Create engine with DuckDB backend + with GIQLEngine(target_dialect="duckdb") as engine: + # Load genomic data + engine.load_csv("variants", "variants.csv") + engine.register_table_schema( + "variants", + { + "id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + }, + genomic_column="interval", + ) + + # Query with genomic operators (returns cursor for streaming) + cursor = engine.execute(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + + # Process results + for row in cursor: + print(row) + + # Or just transpile to SQL without executing + sql = engine.transpile(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + print(sql) # See the generated SQL + +Features +-------- + +* **SQL-based**: Familiar SQL syntax with genomic extensions +* **Multi-backend**: Works with DuckDB, SQLite, and more +* **Spatial operators**: INTERSECTS, CONTAINS, WITHIN, DISTANCE, NEAREST +* **Aggregation operators**: CLUSTER, MERGE for combining intervals +* **Set quantifiers**: ANY, ALL for multi-range queries +* **Column-to-column joins**: Join tables on genomic position +* **Transpilation**: Convert GIQL to standard SQL for debugging or external use + +Operators at a Glance +--------------------- + +**Spatial Relationships:** + +.. code-block:: sql + + -- Find overlapping features + WHERE interval INTERSECTS 'chr1:1000-2000' + + -- Find containing/contained features + WHERE gene.interval CONTAINS variant.interval + +**Distance and Proximity:** + +.. code-block:: sql + + -- Calculate distance between intervals + SELECT DISTANCE(a.interval, b.interval) AS dist + + -- Find k-nearest neighbors + FROM peaks CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=5) + +**Aggregation:** + +.. code-block:: sql + + -- Cluster overlapping intervals + SELECT *, CLUSTER(interval) AS cluster_id FROM features + + -- Merge overlapping intervals + SELECT MERGE(interval) FROM features + +**Set Quantifiers:** + +.. code-block:: sql + + -- Match any of multiple regions + WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000') + +See :doc:`operators/index` for complete operator documentation. + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..954237b --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/operators/aggregation-operators.rst b/docs/operators/aggregation-operators.rst new file mode 100644 index 0000000..50d10da --- /dev/null +++ b/docs/operators/aggregation-operators.rst @@ -0,0 +1,402 @@ +Aggregation Operators +===================== + +Aggregation operators combine and cluster genomic intervals. These operators are +essential for reducing complex interval data into summarized regions, such as +merging overlapping peaks or identifying clusters of related features. + +.. contents:: + :local: + :depth: 2 + +.. _cluster-operator: + +CLUSTER +------- + +Assign cluster IDs to overlapping or nearby genomic intervals. + +Description +~~~~~~~~~~~ + +The ``CLUSTER`` operator assigns a unique cluster identifier to groups of intervals +that overlap or are within a specified distance of each other. Intervals in the same +cluster share a common cluster ID, while non-overlapping intervals receive different +IDs. + +This is useful for: + +- Grouping overlapping features +- Identifying regions of high feature density +- Preparing data for downstream merge operations + +Syntax +~~~~~~ + +.. code-block:: sql + + -- Basic clustering (overlapping intervals) + CLUSTER(interval) AS cluster_id + + -- Clustering with distance parameter + CLUSTER(interval, distance) AS cluster_id + + -- Strand-specific clustering + CLUSTER(interval, stranded=true) AS cluster_id + + -- Combined parameters + CLUSTER(interval, distance, stranded=true) AS cluster_id + +Parameters +~~~~~~~~~~ + +**interval** + A genomic column registered with the engine. + +**distance** *(optional)* + Maximum gap between intervals to consider them part of the same cluster. + Default: ``0`` (only overlapping intervals are clustered). + +**stranded** *(optional)* + When ``true``, only cluster intervals on the same strand. Default: ``false``. + +Return Value +~~~~~~~~~~~~ + +Integer cluster ID. Intervals in the same cluster have the same ID. +IDs are assigned per-chromosome (and per-strand if ``stranded=true``). + +Examples +~~~~~~~~ + +**Basic Clustering:** + +Assign cluster IDs to overlapping intervals: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + *, + CLUSTER(interval) AS cluster_id + FROM features + ORDER BY chromosome, start_pos + """) + +**Distance-Based Clustering:** + +Cluster intervals within 1000bp of each other: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + *, + CLUSTER(interval, 1000) AS cluster_id + FROM features + ORDER BY chromosome, start_pos + """) + +**Strand-Specific Clustering:** + +Cluster intervals separately by strand: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + *, + CLUSTER(interval, stranded=true) AS cluster_id + FROM features + ORDER BY chromosome, strand, start_pos + """) + +**Analyze Cluster Statistics:** + +Count features per cluster: + +.. 
code-block:: python + + cursor = engine.execute(""" + WITH clustered AS ( + SELECT + *, + CLUSTER(interval) AS cluster_id + FROM features + ) + SELECT + chromosome, + cluster_id, + COUNT(*) AS feature_count, + MIN(start_pos) AS cluster_start, + MAX(end_pos) AS cluster_end + FROM clustered + GROUP BY chromosome, cluster_id + ORDER BY chromosome, cluster_start + """) + +**Filter by Cluster Size:** + +Find regions with multiple overlapping features: + +.. code-block:: python + + cursor = engine.execute(""" + WITH clustered AS ( + SELECT + *, + CLUSTER(interval) AS cluster_id + FROM features + ), + cluster_sizes AS ( + SELECT cluster_id, COUNT(*) AS size + FROM clustered + GROUP BY cluster_id + ) + SELECT c.* + FROM clustered c + INNER JOIN cluster_sizes s ON c.cluster_id = s.cluster_id + WHERE s.size >= 3 + """) + +Backend Compatibility +~~~~~~~~~~~~~~~~~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Backend + - Support + - Notes + * - DuckDB + - Full + - Efficient window function implementation + * - SQLite + - Full + - + * - PostgreSQL + - Planned + - + +Performance Notes +~~~~~~~~~~~~~~~~~ + +- Data should be sorted by chromosome and position for efficient clustering +- For large datasets, consider partitioning by chromosome +- Cluster IDs are computed using window functions, which scale well + +Related Operators +~~~~~~~~~~~~~~~~~ + +- :ref:`MERGE ` - Combine clustered intervals into single regions +- :ref:`INTERSECTS ` - Test for overlap between specific pairs + +---- + +.. _merge-operator: + +MERGE +----- + +Combine overlapping genomic intervals into unified regions. + +Description +~~~~~~~~~~~ + +The ``MERGE`` operator combines overlapping (or nearby) intervals into single, +non-overlapping regions. This is useful for: + +- Creating consensus regions from overlapping features +- Reducing redundant annotations +- Calculating total coverage + +The operator works as an aggregate function, returning one row per merged region +with the unified coordinates. + +Syntax +~~~~~~ + +.. code-block:: sql + + -- Basic merge + SELECT MERGE(interval) FROM features + + -- Merge with distance parameter + SELECT MERGE(interval, distance) FROM features + + -- Strand-specific merge + SELECT MERGE(interval, stranded=true) FROM features + + -- Merge with additional aggregations + SELECT + MERGE(interval), + COUNT(*) AS feature_count, + AVG(score) AS avg_score + FROM features + +Parameters +~~~~~~~~~~ + +**interval** + A genomic column registered with the engine. + +**distance** *(optional)* + Maximum gap between intervals to merge. Default: ``0`` (only overlapping + intervals are merged). + +**stranded** *(optional)* + When ``true``, merge intervals separately by strand. Default: ``false``. + +Return Value +~~~~~~~~~~~~ + +Returns merged interval coordinates: + +- ``chromosome`` - Chromosome of the merged region +- ``start_pos`` - Start position of the merged region +- ``end_pos`` - End position of the merged region +- ``strand`` - Strand (if ``stranded=true``) + +Examples +~~~~~~~~ + +**Basic Merge:** + +Merge all overlapping intervals: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT MERGE(interval) + FROM features + """) + + # Returns: chromosome, start_pos, end_pos for each merged region + +**Distance-Based Merge:** + +Merge intervals within 1000bp of each other: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT MERGE(interval, 1000) + FROM features + """) + +**Strand-Specific Merge:** + +Merge intervals separately by strand: + +.. 
code-block:: python + + cursor = engine.execute(""" + SELECT MERGE(interval, stranded=true) + FROM features + """) + +**Merge with Feature Count:** + +Count how many features were merged into each region: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + MERGE(interval), + COUNT(*) AS feature_count + FROM features + """) + +**Merge with Aggregations:** + +Calculate statistics for merged regions: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + MERGE(interval), + COUNT(*) AS feature_count, + AVG(score) AS avg_score, + MAX(score) AS max_score + FROM features + """) + +**Collect Merged Feature Names:** + +List the names of features that were merged: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + MERGE(interval), + STRING_AGG(name, ',') AS feature_names + FROM features + """) + +**Merge by Chromosome:** + +Process each chromosome separately (explicit grouping): + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + chromosome, + MERGE(interval), + COUNT(*) AS feature_count + FROM features + GROUP BY chromosome + ORDER BY chromosome + """) + +**Calculate Total Coverage:** + +Calculate the total base pairs covered after merging: + +.. code-block:: python + + cursor = engine.execute(""" + WITH merged AS ( + SELECT MERGE(interval) AS merged_pos + FROM features + ) + SELECT SUM(end_pos - start_pos) AS total_coverage + FROM merged + """) + +Backend Compatibility +~~~~~~~~~~~~~~~~~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Backend + - Support + - Notes + * - DuckDB + - Full + - + * - SQLite + - Full + - + * - PostgreSQL + - Planned + - + +Performance Notes +~~~~~~~~~~~~~~~~~ + +- MERGE is an aggregate operation that processes all matching rows +- For very large datasets, consider filtering by chromosome first +- The operation sorts data internally, so pre-sorting is not required + +Related Operators +~~~~~~~~~~~~~~~~~ + +- :ref:`CLUSTER ` - Assign cluster IDs without merging +- :ref:`INTERSECTS ` - Test for overlap between specific pairs diff --git a/docs/operators/distance-operators.rst b/docs/operators/distance-operators.rst new file mode 100644 index 0000000..7ceccf3 --- /dev/null +++ b/docs/operators/distance-operators.rst @@ -0,0 +1,393 @@ +Distance and Proximity Operators +================================ + +Distance and proximity operators calculate genomic distances and find nearest features. +These operators are essential for proximity analysis, such as finding genes near +regulatory elements or variants near transcription start sites. + +.. contents:: + :local: + :depth: 2 + +.. _distance-operator: + +DISTANCE +-------- + +Calculate the genomic distance between two intervals. + +Description +~~~~~~~~~~~ + +The ``DISTANCE`` operator returns the number of base pairs separating two genomic +intervals. It follows standard genomic distance conventions: + +- **Overlapping intervals**: Returns ``0`` +- **Non-overlapping intervals**: Returns the gap in base pairs (positive integer) +- **Different chromosomes**: Returns ``NULL`` + +Syntax +~~~~~~ + +.. code-block:: sql + + DISTANCE(interval_a, interval_b) + +Parameters +~~~~~~~~~~ + +**interval_a** + A genomic column registered with the engine. + +**interval_b** + Another genomic column to measure distance to. 
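+
+ The distance convention matches the SQL expansion shown in the
+ transpilation guide. As a plain-Python sketch of the same rule (for
+ intuition only, not GIQL's implementation; intervals are half-open
+ ``[start, end)``):
+
+ .. code-block:: python
+
+     def genomic_distance(chrom_a, start_a, end_a, chrom_b, start_b, end_b):
+         """Illustrative sketch of the DISTANCE convention."""
+         if chrom_a != chrom_b:
+             return None  # different chromosomes -> NULL
+         if end_a <= start_b:
+             return start_b - end_a  # a lies entirely before b: gap size
+         if end_b <= start_a:
+             return start_a - end_b  # b lies entirely before a: gap size
+         return 0  # intervals overlap
+
+     assert genomic_distance("chr1", 100, 200, "chr1", 250, 300) == 50
+     assert genomic_distance("chr1", 100, 200, "chr1", 150, 300) == 0
+     assert genomic_distance("chr1", 100, 200, "chr2", 100, 200) is None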
+ +Return Value +~~~~~~~~~~~~ + +- ``0`` for overlapping intervals +- Positive integer (gap in base pairs) for non-overlapping same-chromosome intervals +- ``NULL`` for intervals on different chromosomes + +Examples +~~~~~~~~ + +**Calculate Distances Between Features:** + +Calculate distance between peaks and genes: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + p.name AS peak, + g.name AS gene, + DISTANCE(p.interval, g.interval) AS distance + FROM peaks p + CROSS JOIN genes g + WHERE p.chromosome = g.chromosome + ORDER BY p.name, distance + """) + +**Filter by Distance:** + +Find features within 10kb of each other: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.name, b.name, DISTANCE(a.interval, b.interval) AS dist + FROM features_a a + CROSS JOIN features_b b + WHERE a.chromosome = b.chromosome + AND DISTANCE(a.interval, b.interval) <= 10000 + """) + +**Identify Overlapping vs. Proximal:** + +Distinguish between overlapping and nearby features: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + p.name, + g.name, + CASE + WHEN DISTANCE(p.interval, g.interval) = 0 THEN 'overlapping' + WHEN DISTANCE(p.interval, g.interval) <= 1000 THEN 'proximal' + ELSE 'distant' + END AS relationship + FROM peaks p + CROSS JOIN genes g + WHERE p.chromosome = g.chromosome + """) + +Backend Compatibility +~~~~~~~~~~~~~~~~~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Backend + - Support + - Notes + * - DuckDB + - Full + - + * - SQLite + - Full + - + * - PostgreSQL + - Planned + - + +Performance Notes +~~~~~~~~~~~~~~~~~ + +- Always include ``WHERE a.chromosome = b.chromosome`` to avoid unnecessary + cross-chromosome comparisons +- For large datasets, consider pre-filtering by region before calculating distances +- Create indexes on chromosome and position columns for better performance + +Related Operators +~~~~~~~~~~~~~~~~~ + +- :ref:`NEAREST ` - Find k-nearest features (uses distance internally) +- :ref:`INTERSECTS ` - Alternative for checking overlap (returns boolean) + +---- + +.. _nearest-operator: + +NEAREST +------- + +Find the k-nearest genomic features to a reference point or interval. + +Description +~~~~~~~~~~~ + +The ``NEAREST`` operator performs k-nearest neighbor (k-NN) queries on genomic data. +It finds the closest features from a target table relative to a reference position, +supporting various filtering options including strand awareness and distance constraints. + +This operator uses ``CROSS JOIN LATERAL`` syntax to efficiently find nearest neighbors +for each row in the driving table. + +Syntax +~~~~~~ + +.. code-block:: sql + + -- Find k nearest features for each row + SELECT * + FROM source_table + CROSS JOIN LATERAL NEAREST( + target_table, + reference=source_table.interval, + k=5 + ) AS nearest + + -- With additional parameters + NEAREST( + target_table, + reference=interval, + k=5, + max_distance=100000, + stranded=true, + signed=true + ) + + -- Standalone query with literal reference + SELECT * FROM NEAREST(genes, reference='chr1:1000000-1001000', k=5) + +Parameters +~~~~~~~~~~ + +**target_table** + The table to search for nearest features. + +**reference** + The reference position to measure distances from. Can be a column reference + (e.g., ``peaks.interval``) or a literal range (e.g., ``'chr1:1000-2000'``). + +**k** + The number of nearest neighbors to return. Default: ``1``. + +**max_distance** *(optional)* + Maximum distance threshold. Only features within this distance are returned. 
+ +**stranded** *(optional)* + When ``true``, only consider features on the same strand. Default: ``false``. + +**signed** *(optional)* + When ``true``, return signed distances (negative = upstream, positive = downstream). + Default: ``false``. + +Return Value +~~~~~~~~~~~~ + +Returns rows from the target table with an additional ``distance`` column indicating +the distance to the reference position. Results are ordered by distance (closest first). + +Examples +~~~~~~~~ + +**Find K Nearest Genes:** + +Find the 3 nearest genes for each peak: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest + ORDER BY peaks.name, nearest.distance + """) + +**Standalone Query:** + +Find 5 nearest genes to a specific genomic location: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT gene_name, distance + FROM NEAREST(genes, reference='chr1:1000000-1001000', k=5) + ORDER BY distance + """) + +**Distance-Constrained Search:** + +Find nearest features within 100kb: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + peaks.name, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=100000 + ) AS nearest + ORDER BY peaks.name, nearest.distance + """) + +**Strand-Specific Nearest Neighbors:** + +Find nearest same-strand features: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + peaks.name, + nearest.name AS gene, + nearest.strand, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=3, + stranded=true + ) AS nearest + ORDER BY peaks.name, nearest.distance + """) + +**Directional (Upstream/Downstream) Queries:** + +Find upstream features using signed distances: + +.. code-block:: python + + # Upstream features have negative distances + cursor = engine.execute(""" + SELECT + peaks.name, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=10, + signed=true + ) AS nearest + WHERE nearest.distance < 0 + ORDER BY peaks.name, nearest.distance DESC + """) + + # Downstream features have positive distances + cursor = engine.execute(""" + SELECT + peaks.name, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=10, + signed=true + ) AS nearest + WHERE nearest.distance > 0 + ORDER BY peaks.name, nearest.distance + """) + +**Combined Parameters:** + +Find nearby same-strand features within distance constraints: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + peaks.name, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=50000, + stranded=true, + signed=true + ) AS nearest + WHERE nearest.distance BETWEEN -10000 AND 10000 + ORDER BY peaks.name, ABS(nearest.distance) + """) + +Backend Compatibility +~~~~~~~~~~~~~~~~~~~~~ + +.. 
list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Backend + - Support + - Notes + * - DuckDB + - Full + - Efficient lateral join support + * - SQLite + - Partial + - Works but slower for large k values + * - PostgreSQL + - Planned + - + +Performance Notes +~~~~~~~~~~~~~~~~~ + +- **Chromosome pre-filtering**: NEAREST automatically filters by chromosome for efficiency +- **Use max_distance**: Specifying a maximum distance reduces the search space significantly +- **Limit k**: Only request as many neighbors as you actually need +- **Create indexes**: Add indexes on ``(chromosome, start_pos, end_pos)`` for better performance + +.. code-block:: python + + # Create indexes for better NEAREST performance + engine.conn.execute(""" + CREATE INDEX idx_genes_position + ON genes (chromosome, start_pos, end_pos) + """) + +Related Operators +~~~~~~~~~~~~~~~~~ + +- :ref:`DISTANCE ` - Calculate distance between specific pairs +- :ref:`INTERSECTS ` - Find overlapping features (distance = 0) diff --git a/docs/operators/index.rst b/docs/operators/index.rst new file mode 100644 index 0000000..ce24f17 --- /dev/null +++ b/docs/operators/index.rst @@ -0,0 +1,114 @@ +GIQL Operators +============== + +GIQL extends SQL with operators specifically designed for genomic interval queries. +These operators enable powerful spatial reasoning over genomic coordinates without +requiring complex SQL expressions. + +Operators are organized by functionality: + +.. contents:: + :local: + :depth: 1 + +Spatial Relationship Operators +------------------------------ + +Test positional relationships between genomic ranges. + +.. list-table:: + :header-rows: 1 + :widths: 20 50 30 + + * - Operator + - Description + - Example + * - :ref:`INTERSECTS ` + - Returns true when ranges overlap by at least one base pair + - ``interval INTERSECTS 'chr1:1000-2000'`` + * - :ref:`CONTAINS ` + - Returns true when one range fully contains another + - ``interval CONTAINS 'chr1:1500'`` + * - :ref:`WITHIN ` + - Returns true when one range is fully within another + - ``interval WITHIN 'chr1:1000-5000'`` + +See :doc:`spatial-operators` for detailed documentation. + +Distance and Proximity Operators +-------------------------------- + +Calculate distances and find nearest features. + +.. list-table:: + :header-rows: 1 + :widths: 20 50 30 + + * - Operator + - Description + - Example + * - :ref:`DISTANCE ` + - Calculate genomic distance between two intervals + - ``DISTANCE(a.interval, b.interval)`` + * - :ref:`NEAREST ` + - Find k-nearest genomic features + - ``NEAREST(genes, reference=peaks.interval, k=5)`` + +See :doc:`distance-operators` for detailed documentation. + +Aggregation Operators +--------------------- + +Combine and cluster genomic intervals. + +.. list-table:: + :header-rows: 1 + :widths: 20 50 30 + + * - Operator + - Description + - Example + * - :ref:`CLUSTER ` + - Assign cluster IDs to overlapping intervals + - ``CLUSTER(interval) AS cluster_id`` + * - :ref:`MERGE ` + - Combine overlapping intervals into unified regions + - ``SELECT MERGE(interval) FROM features`` + +See :doc:`aggregation-operators` for detailed documentation. + +Set Quantifiers +--------------- + +Apply operators to multiple ranges simultaneously. + +.. 
list-table:: + :header-rows: 1 + :widths: 20 50 30 + + * - Quantifier + - Description + - Example + * - :ref:`ANY ` + - Match if condition holds for any of the specified ranges + - ``interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000')`` + * - :ref:`ALL ` + - Match if condition holds for all of the specified ranges + - ``interval CONTAINS ALL('chr1:1500', 'chr1:1600')`` + +See :doc:`quantifiers` for detailed documentation. + +Operator Compatibility +---------------------- + +All operators work across supported database backends (DuckDB, SQLite, with PostgreSQL planned). +Each operator page includes a compatibility table showing backend support status. + +.. toctree:: + :maxdepth: 2 + :hidden: + + spatial-operators + distance-operators + aggregation-operators + quantifiers diff --git a/docs/operators/quantifiers.rst b/docs/operators/quantifiers.rst new file mode 100644 index 0000000..cffb71d --- /dev/null +++ b/docs/operators/quantifiers.rst @@ -0,0 +1,332 @@ +Set Quantifiers +=============== + +Set quantifiers extend spatial operators to work with multiple ranges simultaneously. +They allow you to test whether a genomic position matches any or all of a set of +specified ranges in a single query. + +.. contents:: + :local: + :depth: 2 + +.. _any-quantifier: + +ANY +--- + +Match if the condition holds for any of the specified ranges. + +Description +~~~~~~~~~~~ + +The ``ANY`` quantifier tests whether a genomic position satisfies a spatial +relationship with at least one range from a provided set. It acts as a logical +OR across multiple range comparisons. + +This is useful for: + +- Filtering features that overlap any of several regions of interest +- Checking membership in a set of genomic windows +- Multi-region queries without complex OR clauses + +Syntax +~~~~~~ + +.. code-block:: sql + + -- With INTERSECTS + interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000', 'chr2:1000-3000') + + -- With CONTAINS + interval CONTAINS ANY('chr1:1500', 'chr1:2500') + + -- With WITHIN + interval WITHIN ANY('chr1:0-10000', 'chr2:0-10000') + +Parameters +~~~~~~~~~~ + +**interval** + A genomic column registered with the engine. + +**ranges** + A comma-separated list of genomic range literals. + +Return Value +~~~~~~~~~~~~ + +Boolean: ``true`` if the spatial condition holds for at least one of the specified +ranges, ``false`` otherwise. + +Examples +~~~~~~~~ + +**Match Multiple Regions:** + +Find variants in any of several regions of interest: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM variants + WHERE interval INTERSECTS ANY( + 'chr1:1000-2000', + 'chr1:5000-6000', + 'chr2:1000-3000' + ) + """) + +**Check Against Gene Promoters:** + +Find features overlapping any of a set of promoter regions: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM peaks + WHERE interval INTERSECTS ANY( + 'chr1:11869-12869', -- Gene A promoter + 'chr1:29554-30554', -- Gene B promoter + 'chr1:69091-70091' -- Gene C promoter + ) + """) + +**Combine with Other Filters:** + +Filter by multiple regions and additional criteria: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM variants + WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000') + AND quality >= 30 + AND filter = 'PASS' + """) + +**Multi-Chromosome Query:** + +Query across different chromosomes efficiently: + +.. 
code-block:: python + + cursor = engine.execute(""" + SELECT * FROM features + WHERE interval INTERSECTS ANY( + 'chr1:100000-200000', + 'chr2:100000-200000', + 'chr3:100000-200000', + 'chrX:100000-200000' + ) + """) + +Backend Compatibility +~~~~~~~~~~~~~~~~~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Backend + - Support + - Notes + * - DuckDB + - Full + - + * - SQLite + - Full + - + * - PostgreSQL + - Planned + - + +Performance Notes +~~~~~~~~~~~~~~~~~ + +- ``ANY`` expands to multiple OR conditions in the generated SQL +- For very large sets of ranges, consider using a separate table and JOIN instead +- The optimizer may benefit from indexes on chromosome and position columns + +Related +~~~~~~~ + +- :ref:`ALL ` - Match all ranges (logical AND) +- :ref:`INTERSECTS ` - Base spatial operator + +---- + +.. _all-quantifier: + +ALL +--- + +Match if the condition holds for all of the specified ranges. + +Description +~~~~~~~~~~~ + +The ``ALL`` quantifier tests whether a genomic position satisfies a spatial +relationship with every range in a provided set. It acts as a logical AND +across multiple range comparisons. + +This is useful for: + +- Finding features that span multiple specific positions +- Ensuring complete coverage of a set of points +- Strict multi-point containment queries + +Syntax +~~~~~~ + +.. code-block:: sql + + -- With CONTAINS + interval CONTAINS ALL('chr1:1500', 'chr1:1600', 'chr1:1700') + + -- With INTERSECTS (less common, but valid) + interval INTERSECTS ALL('chr1:1000-1100', 'chr1:1050-1150') + +Parameters +~~~~~~~~~~ + +**interval** + A genomic column registered with the engine. + +**ranges** + A comma-separated list of genomic range literals. + +Return Value +~~~~~~~~~~~~ + +Boolean: ``true`` if the spatial condition holds for all of the specified +ranges, ``false`` otherwise. + +Examples +~~~~~~~~ + +**Find Features Containing Multiple Points:** + +Find genes that contain all specified SNP positions: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM genes + WHERE interval CONTAINS ALL( + 'chr1:1500', + 'chr1:1600', + 'chr1:1700' + ) + """) + +**Ensure Complete Coverage:** + +Find intervals that span a set of required positions: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM features + WHERE interval CONTAINS ALL( + 'chr1:10000', + 'chr1:15000', + 'chr1:20000' + ) + """) + +**Find Overlapping Regions:** + +Find features that overlap with all specified windows (useful for finding +features in the intersection of multiple regions): + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM features + WHERE interval INTERSECTS ALL( + 'chr1:1000-2000', + 'chr1:1500-2500' + ) + """) + + # This finds features that overlap BOTH ranges + # (i.e., features in the intersection: chr1:1500-2000) + +Backend Compatibility +~~~~~~~~~~~~~~~~~~~~~ + +.. 
list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Backend + - Support + - Notes + * - DuckDB + - Full + - + * - SQLite + - Full + - + * - PostgreSQL + - Planned + - + +Performance Notes +~~~~~~~~~~~~~~~~~ + +- ``ALL`` expands to multiple AND conditions in the generated SQL +- Queries with ``ALL`` may be more restrictive, potentially reducing result sets +- Consider whether ``ANY`` might be more appropriate for your use case + +Related +~~~~~~~ + +- :ref:`ANY ` - Match any range (logical OR) +- :ref:`CONTAINS ` - Base containment operator + +Choosing Between ANY and ALL +---------------------------- + +Use **ANY** when you want to find features that match at least one of several criteria: + +.. code-block:: python + + # Find variants in gene A OR gene B OR gene C + WHERE interval INTERSECTS ANY('gene_a_region', 'gene_b_region', 'gene_c_region') + +Use **ALL** when you want to find features that satisfy all criteria simultaneously: + +.. code-block:: python + + # Find features that contain ALL of these positions + WHERE interval CONTAINS ALL('pos1', 'pos2', 'pos3') + +Common Patterns +--------------- + +**Exclusion with ANY:** + +Find features that don't overlap any blacklisted region: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM peaks + WHERE NOT interval INTERSECTS ANY( + 'chr1:1000000-2000000', -- Centromere + 'chr1:5000000-5500000' -- Known artifact region + ) + """) + +**Combining ANY and ALL:** + +Complex queries can combine both quantifiers: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM features + WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') + AND interval CONTAINS ALL('chr1:1100', 'chr1:1200') + """) diff --git a/docs/operators/spatial-operators.rst b/docs/operators/spatial-operators.rst new file mode 100644 index 0000000..6b48001 --- /dev/null +++ b/docs/operators/spatial-operators.rst @@ -0,0 +1,361 @@ +Spatial Relationship Operators +============================== + +Spatial relationship operators test positional relationships between genomic ranges. +These are the core operators for determining whether genomic intervals overlap, +contain, or are contained within other intervals. + +.. contents:: + :local: + :depth: 2 + +.. _intersects-operator: + +INTERSECTS +---------- + +Returns true when two genomic ranges overlap by at least one base pair. + +Description +~~~~~~~~~~~ + +The ``INTERSECTS`` operator is the most commonly used spatial operator. It tests +whether two genomic intervals share any overlapping bases. Two intervals intersect +if they are on the same chromosome and their coordinate ranges overlap. + +Mathematically, intervals ``[start_a, end_a)`` and ``[start_b, end_b)`` intersect when: + +- They are on the same chromosome, AND +- ``start_a < end_b`` AND ``start_b < end_a`` + +Syntax +~~~~~~ + +.. code-block:: sql + + -- Compare against a literal range + interval INTERSECTS 'chr1:1000-2000' + + -- Compare against another genomic column (joins) + a.interval INTERSECTS b.interval + + -- With set quantifiers + interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000') + +Parameters +~~~~~~~~~~ + +**interval** + A genomic column registered with the engine via ``register_table_schema()``. + +**literal_range** + A string literal specifying a genomic range in the format ``'chromosome:start-end'``. + +**other_interval** + Another genomic column from the same or different table (for joins). + +Return Value +~~~~~~~~~~~~ + +Boolean: ``true`` if the ranges overlap, ``false`` otherwise. 
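+
+ Because intervals are half-open, adjacent intervals do not intersect:
+ ``chr1:1000-2000`` and ``chr1:2000-3000`` share no bases. A plain-Python
+ sketch of the predicate above (for intuition only, not GIQL's
+ implementation):
+
+ .. code-block:: python
+
+     def intersects(chrom_a, start_a, end_a, chrom_b, start_b, end_b):
+         """True when two half-open intervals share at least one base."""
+         return chrom_a == chrom_b and start_a < end_b and start_b < end_a
+
+     assert intersects("chr1", 1000, 2000, "chr1", 1999, 2500)      # 1 bp overlap
+     assert not intersects("chr1", 1000, 2000, "chr1", 2000, 3000)  # adjacent
+     assert not intersects("chr1", 1000, 2000, "chr2", 1000, 2000)  # different chrom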
+ +Examples +~~~~~~~~ + +**Basic Usage:** + +Find all variants that overlap a specific genomic region: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + +**Column-to-Column Joins:** + +Find variants that overlap with any gene: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT v.*, g.name AS gene_name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + """) + +**With WHERE Clause:** + +Find overlapping features with additional filtering: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT v.*, g.name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE v.quality >= 30 + AND g.biotype = 'protein_coding' + """) + +**Left Outer Join:** + +Find all variants, with gene information where available: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT v.*, g.name AS gene_name + FROM variants v + LEFT JOIN genes g ON v.interval INTERSECTS g.interval + """) + +Backend Compatibility +~~~~~~~~~~~~~~~~~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Backend + - Support + - Notes + * - DuckDB + - Full + - + * - SQLite + - Full + - + * - PostgreSQL + - Planned + - Targeted for future release + +Performance Notes +~~~~~~~~~~~~~~~~~ + +- Create indexes on ``(chromosome, start_pos, end_pos)`` for better join performance +- When joining large tables, consider filtering by chromosome first +- The generated SQL uses efficient range comparison predicates + +Related Operators +~~~~~~~~~~~~~~~~~ + +- :ref:`CONTAINS ` - Check if one range fully contains another +- :ref:`WITHIN ` - Check if one range is fully within another +- :ref:`DISTANCE ` - Calculate distance between non-overlapping ranges + +---- + +.. _contains-operator: + +CONTAINS +-------- + +Returns true when one genomic range fully contains another. + +Description +~~~~~~~~~~~ + +The ``CONTAINS`` operator tests whether one genomic interval completely encompasses +another. The containing interval must cover the entire span of the contained interval. + +Mathematically, interval ``[start_a, end_a)`` contains ``[start_b, end_b)`` when: + +- They are on the same chromosome, AND +- ``start_a <= start_b`` AND ``end_a >= end_b`` + +Syntax +~~~~~~ + +.. code-block:: sql + + -- Check if interval contains a point + interval CONTAINS 'chr1:1500' + + -- Check if interval contains a range + interval CONTAINS 'chr1:1200-1800' + + -- Column-to-column comparison + gene.interval CONTAINS variant.interval + +Parameters +~~~~~~~~~~ + +**interval** + A genomic column registered with the engine. + +**literal_range** + A string literal specifying a genomic point or range. + +**other_interval** + Another genomic column for comparisons. + +Return Value +~~~~~~~~~~~~ + +Boolean: ``true`` if the first range fully contains the second, ``false`` otherwise. + +Examples +~~~~~~~~ + +**Point Containment:** + +Find genes that contain a specific position: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM genes + WHERE interval CONTAINS 'chr1:1500' + """) + +**Range Containment:** + +Find large features that fully contain smaller features: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT g.name AS gene_name, e.name AS exon_name + FROM genes g + INNER JOIN exons e ON g.interval CONTAINS e.interval + """) + +**Filtering Fully Contained Variants:** + +Find variants that are completely within gene boundaries: + +.. 
code-block:: python + + cursor = engine.execute(""" + SELECT v.* + FROM variants v + INNER JOIN genes g ON g.interval CONTAINS v.interval + """) + +Backend Compatibility +~~~~~~~~~~~~~~~~~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Backend + - Support + - Notes + * - DuckDB + - Full + - + * - SQLite + - Full + - + * - PostgreSQL + - Planned + - + +Related Operators +~~~~~~~~~~~~~~~~~ + +- :ref:`WITHIN ` - Inverse of CONTAINS +- :ref:`INTERSECTS ` - Partial overlap (less strict) + +---- + +.. _within-operator: + +WITHIN +------ + +Returns true when one genomic range is fully contained within another. + +Description +~~~~~~~~~~~ + +The ``WITHIN`` operator is the inverse of ``CONTAINS``. It tests whether a genomic +interval falls completely inside another interval. + +Mathematically, interval ``[start_a, end_a)`` is within ``[start_b, end_b)`` when: + +- They are on the same chromosome, AND +- ``start_a >= start_b`` AND ``end_a <= end_b`` + +Syntax +~~~~~~ + +.. code-block:: sql + + -- Check if interval is within a range + interval WITHIN 'chr1:1000-5000' + + -- Column-to-column comparison + variant.interval WITHIN gene.interval + +Parameters +~~~~~~~~~~ + +**interval** + A genomic column registered with the engine. + +**literal_range** + A string literal specifying the containing range. + +**other_interval** + Another genomic column for comparisons. + +Return Value +~~~~~~~~~~~~ + +Boolean: ``true`` if the first range is fully within the second, ``false`` otherwise. + +Examples +~~~~~~~~ + +**Filter to Region:** + +Find all features within a specific genomic window: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM features + WHERE interval WITHIN 'chr1:1000000-2000000' + """) + +**Find Nested Features:** + +Find exons that are completely within their parent gene: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT e.*, g.name AS gene_name + FROM exons e + INNER JOIN genes g ON e.interval WITHIN g.interval + """) + +Backend Compatibility +~~~~~~~~~~~~~~~~~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Backend + - Support + - Notes + * - DuckDB + - Full + - + * - SQLite + - Full + - + * - PostgreSQL + - Planned + - + +Related Operators +~~~~~~~~~~~~~~~~~ + +- :ref:`CONTAINS ` - Inverse of WITHIN +- :ref:`INTERSECTS ` - Partial overlap (less strict) diff --git a/docs/quickstart.rst b/docs/quickstart.rst new file mode 100644 index 0000000..9560c34 --- /dev/null +++ b/docs/quickstart.rst @@ -0,0 +1,228 @@ +Quick Start +=========== + +Installation +------------ + +Install GIQL using pip: + +.. code-block:: bash + + pip install giql + +Or with optional dependencies: + +.. code-block:: bash + + pip install giql[duckdb] # For DuckDB support + +Basic Usage +----------- + +Expected Schema +~~~~~~~~~~~~~~~ + +GIQL works with genomic data stored in tables with separate columns for chromosome, +start position, and end position. The typical schema includes: + +* **chromosome**: Chromosome identifier (e.g., 'chr1', 'chr2', 'chrX') +* **start_pos**: Start position of the genomic interval (0-based, inclusive) +* **end_pos**: End position of the genomic interval (0-based, exclusive, half-open) +* **strand** (optional): Strand orientation ('+', '-', or '.') + +You must register the table schema with GIQL, mapping the logical genomic column +(used in queries) to the physical columns in your table: + +.. 
code-block:: python + + engine.register_table_schema( + "table_name", + { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "strand": "VARCHAR", # Optional + # ... other columns ... + }, + genomic_column="interval", # Logical name used in queries + ) + +After registration, you can use ``interval`` in your GIQL queries, and the engine +will automatically map it to the ``chromosome``, ``start_pos``, and ``end_pos`` +columns. + +Query with DuckDB +~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from giql import GIQLEngine + + with GIQLEngine(target_dialect="duckdb") as engine: + # Load CSV file into database + engine.load_csv("variants", "variants.csv") + + # Register schema mapping + engine.register_table_schema( + "variants", + { + "id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + }, + genomic_column="interval", + ) + + # Query using the logical 'interval' column (returns cursor for streaming) + cursor = engine.execute(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + + # Process results lazily + for row in cursor: + print(row) + + # Or materialize to pandas DataFrame + import pandas as pd + cursor = engine.execute("SELECT ...") + df = pd.DataFrame(cursor.fetchall(), columns=[desc[0] for desc in cursor.description]) + +Query with SQLite +~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from giql import GIQLEngine + + with GIQLEngine(target_dialect="sqlite", db_path="data.db") as engine: + cursor = engine.execute(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + + # Iterate results + for row in cursor: + print(row) + +Spatial Operators +----------------- + +INTERSECTS +~~~~~~~~~~ + +Check if genomic ranges overlap: + +.. code-block:: sql + + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + +CONTAINS +~~~~~~~~ + +Check if a range contains a point or another range: + +.. code-block:: sql + + SELECT * FROM variants + WHERE interval CONTAINS 'chr1:1500' + +WITHIN +~~~~~~ + +Check if a range is within another range: + +.. code-block:: sql + + SELECT * FROM variants + WHERE interval WITHIN 'chr1:1000-5000' + +Set Quantifiers +--------------- + +ANY +~~~ + +Match any of the specified ranges: + +.. code-block:: sql + + SELECT * FROM variants + WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') + +ALL +~~~ + +Match all of the specified ranges: + +.. code-block:: sql + + SELECT * FROM variants + WHERE interval CONTAINS ALL('chr1:1500', 'chr1:1600') + +Column-to-Column Joins +---------------------- + +Join tables on genomic position: + +.. code-block:: sql + + SELECT v.*, g.name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + +Transpiling to SQL +------------------ + +The ``transpile()`` method converts GIQL queries to standard SQL without executing them. +This is useful for debugging, understanding the generated SQL, or integrating with external tools: + +.. 
code-block:: python + + from giql import GIQLEngine + + with GIQLEngine(target_dialect="duckdb") as engine: + # Register table schema + engine.register_table_schema( + "variants", + { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + }, + genomic_column="interval", + ) + + # Transpile GIQL to SQL + sql = engine.transpile(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + + print(sql) + # Output: SELECT * FROM variants WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000 + +Different target dialects generate different SQL: + +.. code-block:: python + + # DuckDB dialect + with GIQLEngine(target_dialect="duckdb") as engine: + sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'") + # Generates DuckDB-optimized SQL + + # SQLite dialect + with GIQLEngine(target_dialect="sqlite") as engine: + sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'") + # Generates SQLite-compatible SQL + +The transpiled SQL can be executed directly on your database or used with other tools. +Use ``verbose=True`` when creating the engine to see detailed transpilation information: + +.. code-block:: python + + with GIQLEngine(target_dialect="duckdb", verbose=True) as engine: + sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'") + # Prints detailed information about the transpilation process diff --git a/docs/recipes/advanced-queries.rst b/docs/recipes/advanced-queries.rst new file mode 100644 index 0000000..2aaf944 --- /dev/null +++ b/docs/recipes/advanced-queries.rst @@ -0,0 +1,449 @@ +Advanced Queries +================ + +This section covers advanced query patterns including multi-range matching, +complex filtering, aggregate statistics, and multi-table workflows. + +.. contents:: + :local: + :depth: 2 + +Multi-Range Matching +-------------------- + +Match Any of Multiple Regions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Find features overlapping any of several regions of interest: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM variants + WHERE interval INTERSECTS ANY( + 'chr1:1000000-2000000', + 'chr1:5000000-6000000', + 'chr2:1000000-3000000' + ) + """) + +**Use case:** Query multiple regions of interest in a single statement. + +Match All of Multiple Points +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Find features containing all specified positions: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM genes + WHERE interval CONTAINS ALL( + 'chr1:1500', + 'chr1:1600', + 'chr1:1700' + ) + """) + +**Use case:** Find genes spanning a set of SNP positions. + +Exclude Multiple Regions +~~~~~~~~~~~~~~~~~~~~~~~~ + +Find features that don't overlap any blacklisted region: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM peaks + WHERE NOT interval INTERSECTS ANY( + 'chr1:120000000-125000000', -- Centromere region + 'chr1:140000000-142000000', -- Known artifact + 'chrM:1-16569' -- Mitochondrial + ) + """) + +**Use case:** Filter out features in problematic genomic regions. + +Combine ANY and ALL +~~~~~~~~~~~~~~~~~~~ + +Complex multi-range logic: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM features + WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') + AND interval CONTAINS ALL('chr1:1100', 'chr1:1200') + """) + +**Use case:** Find features matching complex spatial criteria. 
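+
+ Because ``ANY`` and ``ALL`` take literal ranges, long region lists can be
+ assembled in Python before the query is executed. A minimal sketch (the
+ helper name below is illustrative, not part of the GIQL API):
+
+ .. code-block:: python
+
+     def intersects_any_clause(column, regions):
+         """Render an INTERSECTS ANY(...) predicate from (chrom, start, end) tuples."""
+         literals = ", ".join(f"'{c}:{s}-{e}'" for c, s, e in regions)
+         return f"{column} INTERSECTS ANY({literals})"
+
+     regions = [("chr1", 1000000, 2000000), ("chr1", 5000000, 6000000)]
+     cursor = engine.execute(f"""
+         SELECT * FROM variants
+         WHERE {intersects_any_clause('interval', regions)}
+     """)
+
+ For very large region sets, a join against a dedicated regions table is
+ usually preferable to a long ``ANY`` list, as noted in the quantifier
+ documentation.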
+ +Complex Filtering +----------------- + +Multi-Attribute Filtering +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Combine spatial and attribute filters: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT v.*, g.name AS gene_name, g.biotype + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE v.quality >= 30 + AND v.filter = 'PASS' + AND v.allele_frequency > 0.01 + AND g.biotype = 'protein_coding' + ORDER BY v.chromosome, v.start_pos + """) + +**Use case:** Extract high-quality variants in protein-coding genes. + +Target Gene Lists +~~~~~~~~~~~~~~~~~ + +Filter to specific genes of interest: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT v.*, g.name AS gene_name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE g.name IN ( + 'BRCA1', 'BRCA2', 'TP53', 'EGFR', 'KRAS', + 'BRAF', 'PIK3CA', 'PTEN', 'APC', 'ATM' + ) + ORDER BY g.name, v.start_pos + """) + +**Use case:** Extract variants in clinically actionable genes. + +Conditional Logic +~~~~~~~~~~~~~~~~~ + +Apply different criteria based on feature type: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT v.*, g.name, g.biotype, + CASE + WHEN g.biotype = 'protein_coding' THEN 'coding' + WHEN g.biotype LIKE '%RNA%' THEN 'noncoding_RNA' + ELSE 'other' + END AS gene_category + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE CASE + WHEN g.biotype = 'protein_coding' THEN v.quality >= 30 + ELSE v.quality >= 20 + END + """) + +**Use case:** Apply different quality thresholds based on genomic context. + +Aggregate Statistics +-------------------- + +Per-Chromosome Statistics +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Calculate summary statistics by chromosome: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + a.chromosome, + COUNT(DISTINCT a.name) AS total_features, + COUNT(b.name) AS total_overlaps, + COUNT(DISTINCT CASE WHEN b.name IS NOT NULL THEN a.name END) AS features_with_overlap + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chromosome + ORDER BY a.chromosome + """) + +**Use case:** Compare feature distribution across chromosomes. + +Overlap Statistics +~~~~~~~~~~~~~~~~~~ + +Calculate overlap metrics: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + a.chromosome, + COUNT(*) AS overlap_count, + AVG(LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS avg_overlap_bp, + SUM(LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS total_overlap_bp + FROM features_a a + INNER JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chromosome + ORDER BY a.chromosome + """) + +**Use case:** Quantify overlap patterns across the genome. + +Feature Size Distribution +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Analyze feature sizes by category: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + biotype, + COUNT(*) AS count, + AVG(end_pos - start_pos) AS avg_length, + MIN(end_pos - start_pos) AS min_length, + MAX(end_pos - start_pos) AS max_length + FROM genes + GROUP BY biotype + ORDER BY count DESC + """) + +**Use case:** Compare size distributions across feature types. + +Multi-Table Workflows +--------------------- + +Three-Way Intersection +~~~~~~~~~~~~~~~~~~~~~~ + +Find features overlapping in all three tables: + +.. 
code-block:: python + + cursor = engine.execute(""" + SELECT DISTINCT a.* + FROM features_a a + INNER JOIN features_b b ON a.interval INTERSECTS b.interval + INNER JOIN features_c c ON a.interval INTERSECTS c.interval + """) + +**Use case:** Find consensus regions across multiple datasets. + +Hierarchical Annotations +~~~~~~~~~~~~~~~~~~~~~~~~ + +Join multiple annotation levels: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + v.name AS variant, + e.name AS exon, + t.name AS transcript, + g.name AS gene + FROM variants v + INNER JOIN exons e ON v.interval INTERSECTS e.interval + INNER JOIN transcripts t ON e.interval WITHIN t.interval + INNER JOIN genes g ON t.interval WITHIN g.interval + """) + +**Use case:** Build hierarchical annotations for variants. + +Union with Deduplication +~~~~~~~~~~~~~~~~~~~~~~~~ + +Combine features from multiple sources: + +.. code-block:: python + + cursor = engine.execute(""" + WITH all_peaks AS ( + SELECT *, 'chip_seq' AS source FROM chip_peaks + UNION ALL + SELECT *, 'atac_seq' AS source FROM atac_peaks + UNION ALL + SELECT *, 'dnase_seq' AS source FROM dnase_peaks + ) + SELECT + chromosome, + start_pos, + end_pos, + STRING_AGG(DISTINCT source, ',') AS sources, + COUNT(DISTINCT source) AS source_count + FROM all_peaks + GROUP BY chromosome, start_pos, end_pos + HAVING COUNT(DISTINCT source) >= 2 + """) + +**Use case:** Find regulatory regions supported by multiple assays. + +Subqueries and CTEs +------------------- + +Filtered Subquery +~~~~~~~~~~~~~~~~~ + +Use subqueries to pre-filter data: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT v.* + FROM variants v + WHERE v.interval INTERSECTS ANY( + SELECT interval FROM genes WHERE biotype = 'protein_coding' + ) + """) + +**Use case:** Intersect with dynamically filtered reference data. + +.. note:: + + Subquery support depends on the target database backend. + +Chained CTEs +~~~~~~~~~~~~ + +Build complex analyses with Common Table Expressions: + +.. code-block:: python + + cursor = engine.execute(""" + WITH + -- Step 1: Find high-quality variants + hq_variants AS ( + SELECT * FROM variants + WHERE quality >= 30 AND filter = 'PASS' + ), + -- Step 2: Annotate with genes + annotated AS ( + SELECT v.*, g.name AS gene_name, g.biotype + FROM hq_variants v + LEFT JOIN genes g ON v.interval INTERSECTS g.interval + ), + -- Step 3: Summarize by gene + gene_summary AS ( + SELECT + gene_name, + biotype, + COUNT(*) AS variant_count + FROM annotated + WHERE gene_name IS NOT NULL + GROUP BY gene_name, biotype + ) + SELECT * FROM gene_summary + ORDER BY variant_count DESC + LIMIT 20 + """) + +**Use case:** Build multi-step analysis pipelines in a single query. + +Window Functions +---------------- + +Rank Overlaps +~~~~~~~~~~~~~ + +Rank features by their overlap characteristics: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + a.name, + a.chromosome, + a.start_pos, + overlap_count, + RANK() OVER (ORDER BY overlap_count DESC) AS rank + FROM ( + SELECT a.*, COUNT(b.name) AS overlap_count + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand + ) a + """) + +**Use case:** Identify features with the most overlaps. + +Running Totals +~~~~~~~~~~~~~~ + +Calculate cumulative coverage: + +.. 
code-block:: python + + cursor = engine.execute(""" + SELECT + chromosome, + start_pos, + end_pos, + end_pos - start_pos AS length, + SUM(end_pos - start_pos) OVER ( + PARTITION BY chromosome + ORDER BY start_pos + ) AS cumulative_bp + FROM features + ORDER BY chromosome, start_pos + """) + +**Use case:** Track cumulative coverage along each chromosome. + +Debugging and Optimization +-------------------------- + +View Generated SQL +~~~~~~~~~~~~~~~~~~ + +Use transpile() to see the SQL GIQL generates: + +.. code-block:: python + + sql = engine.transpile(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + print(sql) + # See the actual SQL that will be executed + +**Use case:** Debug queries or understand GIQL's translation. + +Verbose Mode +~~~~~~~~~~~~ + +Enable detailed logging: + +.. code-block:: python + + with GIQLEngine(target_dialect="duckdb", verbose=True) as engine: + # All queries will print transpilation details + cursor = engine.execute(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + +**Use case:** Diagnose query translation issues. + +Explain Query Plan +~~~~~~~~~~~~~~~~~~ + +Analyze query execution: + +.. code-block:: python + + # First transpile to get the SQL + sql = engine.transpile(""" + SELECT v.*, g.name + FROM variants v + JOIN genes g ON v.interval INTERSECTS g.interval + """) + + # Then use database-native EXPLAIN + cursor = engine.execute(f"EXPLAIN {sql}") + for row in cursor: + print(row) + +**Use case:** Optimize slow queries by examining execution plans. diff --git a/docs/recipes/bedtools-migration.rst b/docs/recipes/bedtools-migration.rst new file mode 100644 index 0000000..74c27bd --- /dev/null +++ b/docs/recipes/bedtools-migration.rst @@ -0,0 +1,695 @@ +Bedtools Migration Guide +======================== + +This guide maps bedtools commands to their GIQL equivalents. If you're familiar +with bedtools and want to replicate specific operations in GIQL, use this +reference to find the corresponding query patterns. + +.. contents:: + :local: + :depth: 2 + +Quick Reference Table +--------------------- + +.. list-table:: + :header-rows: 1 + :widths: 35 45 20 + + * - Bedtools Command + - GIQL Equivalent + - Recipe + * - ``intersect -a A -b B`` + - ``SELECT DISTINCT a.* FROM a, b WHERE a.pos INTERSECTS b.pos`` + - :ref:`intersect-basic` + * - ``intersect -a A -b B -wa`` + - ``SELECT a.* FROM a, b WHERE a.pos INTERSECTS b.pos`` + - :ref:`intersect-wa` + * - ``intersect -a A -b B -wb`` + - ``SELECT b.* FROM a, b WHERE a.pos INTERSECTS b.pos`` + - :ref:`intersect-wb` + * - ``intersect -a A -b B -wa -wb`` + - ``SELECT a.*, b.* FROM a, b WHERE a.pos INTERSECTS b.pos`` + - :ref:`intersect-wawb` + * - ``intersect -a A -b B -v`` + - ``SELECT a.* FROM a LEFT JOIN b ... WHERE b.chr IS NULL`` + - :ref:`intersect-v` + * - ``intersect -a A -b B -u`` + - ``SELECT DISTINCT a.* FROM a JOIN b ...`` + - :ref:`intersect-u` + * - ``intersect -a A -b B -c`` + - ``SELECT a.*, COUNT(b.name) ... 
GROUP BY ...`` + - :ref:`intersect-c` + * - ``intersect -a A -b B -wo`` + - ``SELECT a.*, b.*, (overlap calculation) ...`` + - :ref:`intersect-wo` + * - ``intersect -a A -b B -loj`` + - ``SELECT a.*, b.* FROM a LEFT JOIN b ...`` + - :ref:`intersect-loj` + * - ``closest -a A -b B -k N`` + - ``CROSS JOIN LATERAL NEAREST(b, reference=a.pos, k=N)`` + - :ref:`closest-k` + * - ``closest -a A -b B -d`` + - ``SELECT ..., DISTANCE(a.pos, b.pos) ...`` + - :ref:`closest-d` + * - ``cluster -i A`` + - ``SELECT *, CLUSTER(interval) AS cluster_id FROM a`` + - :ref:`cluster-basic` + * - ``cluster -i A -d N`` + - ``SELECT *, CLUSTER(interval, N) AS cluster_id FROM a`` + - :ref:`cluster-d` + * - ``merge -i A`` + - ``SELECT MERGE(interval) FROM a`` + - :ref:`merge-basic` + * - ``merge -i A -d N`` + - ``SELECT MERGE(interval, N) FROM a`` + - :ref:`merge-d` + * - ``merge -i A -c 1 -o count`` + - ``SELECT MERGE(interval), COUNT(*) FROM a`` + - :ref:`merge-count` + +bedtools intersect +------------------ + +.. _intersect-basic: + +Default: Report overlaps between A and B +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools intersect -a file_a.bed -b file_b.bed + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT DISTINCT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + """) + +.. _intersect-wa: + +``-wa``: Write original A entry for each overlap +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools intersect -a file_a.bed -b file_b.bed -wa + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + """) + +.. _intersect-wb: + +``-wb``: Write original B entry for each overlap +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools intersect -a file_a.bed -b file_b.bed -wb + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT b.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + """) + +.. _intersect-wawb: + +``-wa -wb``: Write both A and B entries +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools intersect -a file_a.bed -b file_b.bed -wa -wb + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.*, b.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + """) + +.. _intersect-v: + +``-v``: Report A entries with NO overlap in B +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools intersect -a file_a.bed -b file_b.bed -v + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.* + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + WHERE b.chromosome IS NULL + """) + +.. _intersect-u: + +``-u``: Report A entries with ANY overlap (unique) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools intersect -a file_a.bed -b file_b.bed -u + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT DISTINCT a.* + FROM features_a a + INNER JOIN features_b b ON a.interval INTERSECTS b.interval + """) + +.. _intersect-c: + +``-c``: Count B overlaps for each A feature +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. 
code-block:: bash + + bedtools intersect -a file_a.bed -b file_b.bed -c + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.*, COUNT(b.name) AS overlap_count + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand + """) + +.. _intersect-wo: + +``-wo``: Write overlap amount in base pairs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools intersect -a file_a.bed -b file_b.bed -wo + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + a.*, + b.*, + (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + """) + +.. _intersect-wao: + +``-wao``: Write overlap amount for ALL A features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools intersect -a file_a.bed -b file_b.bed -wao + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + a.*, + b.*, + CASE + WHEN b.chromosome IS NULL THEN 0 + ELSE LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos) + END AS overlap_bp + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + """) + +.. _intersect-loj: + +``-loj``: Left outer join +~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools intersect -a file_a.bed -b file_b.bed -loj + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.*, b.* + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + """) + +``-s``: Same strand overlaps only +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools intersect -a file_a.bed -b file_b.bed -s + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand = b.strand + """) + +``-S``: Opposite strand overlaps only +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools intersect -a file_a.bed -b file_b.bed -S + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand != b.strand + AND a.strand IN ('+', '-') + AND b.strand IN ('+', '-') + """) + +``-f``: Minimum overlap fraction of A +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools intersect -a file_a.bed -b file_b.bed -f 0.5 + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND ( + LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos) + ) >= 0.5 * (a.end_pos - a.start_pos) + """) + +``-r``: Reciprocal overlap +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools intersect -a file_a.bed -b file_b.bed -f 0.5 -r + +**GIQL:** + +.. 
code-block:: python + + cursor = engine.execute(""" + WITH overlap_calcs AS ( + SELECT + a.*, + (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp, + (a.end_pos - a.start_pos) AS a_length, + (b.end_pos - b.start_pos) AS b_length + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + ) + SELECT chromosome, start_pos, end_pos, name, score, strand + FROM overlap_calcs + WHERE overlap_bp >= 0.5 * a_length + AND overlap_bp >= 0.5 * b_length + """) + +bedtools closest +---------------- + +.. _closest-k: + +``-k``: Find k nearest features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools closest -a peaks.bed -b genes.bed -k 3 + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest + ORDER BY peaks.name, nearest.distance + """) + +.. _closest-d: + +``-d``: Report distance +~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools closest -a peaks.bed -b genes.bed -d + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + a.name AS peak, + b.name AS gene, + DISTANCE(a.interval, b.interval) AS distance + FROM peaks a + CROSS JOIN genes b + WHERE a.chromosome = b.chromosome + ORDER BY a.name, distance + """) + +Or using NEAREST for just the closest: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest + """) + +``-s``: Same strand only +~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools closest -a peaks.bed -b genes.bed -s -k 3 + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + peaks.name, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=3, + stranded=true + ) AS nearest + ORDER BY peaks.name, nearest.distance + """) + +bedtools cluster +---------------- + +.. _cluster-basic: + +Basic clustering +~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools cluster -i features.bed + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + *, + CLUSTER(interval) AS cluster_id + FROM features + ORDER BY chromosome, start_pos + """) + +.. _cluster-d: + +``-d``: Cluster with distance parameter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools cluster -i features.bed -d 1000 + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + *, + CLUSTER(interval, 1000) AS cluster_id + FROM features + ORDER BY chromosome, start_pos + """) + +``-s``: Strand-specific clustering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools cluster -i features.bed -s + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + *, + CLUSTER(interval, stranded=true) AS cluster_id + FROM features + ORDER BY chromosome, strand, start_pos + """) + +bedtools merge +-------------- + +.. _merge-basic: + +Basic merge +~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools merge -i features.bed + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT MERGE(interval) + FROM features + """) + +.. 
_merge-d: + +``-d``: Merge with distance parameter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools merge -i features.bed -d 1000 + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT MERGE(interval, 1000) + FROM features + """) + +``-s``: Strand-specific merge +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools merge -i features.bed -s + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT MERGE(interval, stranded=true) + FROM features + """) + +.. _merge-count: + +``-c -o count``: Count merged features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools merge -i features.bed -c 1 -o count + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + MERGE(interval), + COUNT(*) AS feature_count + FROM features + """) + +``-c -o mean``: Average score +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools merge -i features.bed -c 5 -o mean + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + MERGE(interval), + AVG(score) AS avg_score + FROM features + """) + +``-c -o collapse``: Collect names +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Bedtools:** + +.. code-block:: bash + + bedtools merge -i features.bed -c 4 -o collapse + +**GIQL:** + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + MERGE(interval), + STRING_AGG(name, ',') AS feature_names + FROM features + """) + +Key Differences from Bedtools +----------------------------- + +1. **SQL-based syntax**: GIQL uses SQL syntax, which may be more familiar to + users with database experience and allows integration with other SQL features. + +2. **Explicit joins**: Instead of implicit A/B file relationships, GIQL uses + explicit JOIN syntax, making the relationship between tables clearer. + +3. **Flexible output**: SQL's SELECT clause gives you full control over which + columns to return and how to format them. + +4. **Built-in aggregation**: SQL's GROUP BY and aggregate functions (COUNT, AVG, + SUM, etc.) are available directly, without needing separate post-processing. + +5. **Database integration**: GIQL queries run against database tables, enabling + integration with other data and persistence of results. + +6. **Multi-backend support**: The same GIQL query can run on DuckDB, SQLite, + or other supported backends without modification. diff --git a/docs/recipes/clustering-queries.rst b/docs/recipes/clustering-queries.rst new file mode 100644 index 0000000..6ff1487 --- /dev/null +++ b/docs/recipes/clustering-queries.rst @@ -0,0 +1,450 @@ +Clustering and Merging Queries +============================== + +This section covers patterns for clustering overlapping intervals and merging +them into unified regions using GIQL's aggregation operators. + +.. contents:: + :local: + :depth: 2 + +Basic Clustering +---------------- + +Assign Cluster IDs +~~~~~~~~~~~~~~~~~~ + +Assign unique cluster IDs to groups of overlapping intervals: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + *, + CLUSTER(interval) AS cluster_id + FROM features + ORDER BY chromosome, start_pos + """) + +**Use case:** Group overlapping peaks or annotations for downstream analysis. + +View Cluster Assignments +~~~~~~~~~~~~~~~~~~~~~~~~ + +See which features belong to which cluster: + +.. 
code-block:: python + + cursor = engine.execute(""" + SELECT + cluster_id, + chromosome, + name, + start_pos, + end_pos + FROM ( + SELECT *, CLUSTER(interval) AS cluster_id + FROM features + ) + ORDER BY cluster_id, start_pos + """) + +**Use case:** Inspect clustering results to understand feature groupings. + +Distance-Based Clustering +------------------------- + +Cluster with Gap Tolerance +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Cluster intervals that are within a specified distance of each other: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + *, + CLUSTER(interval, 1000) AS cluster_id + FROM features + ORDER BY chromosome, start_pos + """) + +**Use case:** Group nearby features even if they don't directly overlap +(e.g., cluster peaks within 1kb of each other). + +Variable Distance Thresholds +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Experiment with different clustering distances: + +.. code-block:: python + + # Tight clustering (overlapping only) + cursor = engine.execute(""" + SELECT *, CLUSTER(interval, 0) AS tight_cluster FROM features + """) + + # Medium clustering (within 500bp) + cursor = engine.execute(""" + SELECT *, CLUSTER(interval, 500) AS medium_cluster FROM features + """) + + # Loose clustering (within 5kb) + cursor = engine.execute(""" + SELECT *, CLUSTER(interval, 5000) AS loose_cluster FROM features + """) + +**Use case:** Compare clustering at different resolutions for sensitivity analysis. + +Strand-Specific Clustering +-------------------------- + +Cluster by Strand +~~~~~~~~~~~~~~~~~ + +Cluster intervals separately for each strand: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + *, + CLUSTER(interval, stranded=true) AS cluster_id + FROM features + ORDER BY chromosome, strand, start_pos + """) + +**Use case:** Maintain strand separation when clustering transcripts or +strand-specific regulatory elements. + +Strand-Specific with Distance +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Combine strand awareness with distance tolerance: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + *, + CLUSTER(interval, 1000, stranded=true) AS cluster_id + FROM features + ORDER BY chromosome, strand, start_pos + """) + +**Use case:** Cluster nearby same-strand features while keeping opposite +strands separate. + +Cluster Statistics +------------------ + +Count Features per Cluster +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Calculate how many features are in each cluster: + +.. code-block:: python + + cursor = engine.execute(""" + WITH clustered AS ( + SELECT *, CLUSTER(interval) AS cluster_id + FROM features + ) + SELECT + cluster_id, + chromosome, + COUNT(*) AS feature_count, + MIN(start_pos) AS cluster_start, + MAX(end_pos) AS cluster_end + FROM clustered + GROUP BY cluster_id, chromosome + ORDER BY chromosome, cluster_start + """) + +**Use case:** Identify cluster sizes and boundaries. + +Filter by Cluster Size +~~~~~~~~~~~~~~~~~~~~~~ + +Find clusters with a minimum number of features: + +.. code-block:: python + + cursor = engine.execute(""" + WITH clustered AS ( + SELECT *, CLUSTER(interval) AS cluster_id + FROM features + ), + cluster_sizes AS ( + SELECT cluster_id, COUNT(*) AS size + FROM clustered + GROUP BY cluster_id + ) + SELECT c.* + FROM clustered c + JOIN cluster_sizes s ON c.cluster_id = s.cluster_id + WHERE s.size >= 3 + ORDER BY c.cluster_id, c.start_pos + """) + +**Use case:** Focus on regions with multiple overlapping features (hotspots). + +Cluster Summary Statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Calculate statistics for each cluster: + +.. 
code-block:: python + + cursor = engine.execute(""" + WITH clustered AS ( + SELECT *, CLUSTER(interval) AS cluster_id + FROM features + ) + SELECT + cluster_id, + chromosome, + COUNT(*) AS feature_count, + MIN(start_pos) AS cluster_start, + MAX(end_pos) AS cluster_end, + MAX(end_pos) - MIN(start_pos) AS cluster_span, + AVG(score) AS avg_score, + MAX(score) AS max_score + FROM clustered + GROUP BY cluster_id, chromosome + ORDER BY feature_count DESC + """) + +**Use case:** Rank clusters by size, span, or aggregate scores. + +Basic Merging +------------- + +Merge Overlapping Intervals +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Combine overlapping intervals into unified regions: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT MERGE(interval) + FROM features + """) + +**Use case:** Create non-overlapping consensus regions from redundant annotations. + +Merge with Distance +~~~~~~~~~~~~~~~~~~~ + +Merge intervals within a specified distance: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT MERGE(interval, 1000) + FROM features + """) + +**Use case:** Create broader regions by joining nearby features. + +Strand-Specific Merge +~~~~~~~~~~~~~~~~~~~~~ + +Merge intervals separately by strand: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT MERGE(interval, stranded=true) + FROM features + """) + +**Use case:** Create strand-aware consensus regions. + +Merge with Aggregations +----------------------- + +Count Merged Features +~~~~~~~~~~~~~~~~~~~~~ + +Track how many features were merged into each region: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + MERGE(interval), + COUNT(*) AS feature_count + FROM features + """) + +**Use case:** Understand the complexity of each merged region. + +Aggregate Scores +~~~~~~~~~~~~~~~~ + +Calculate statistics for merged regions: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + MERGE(interval), + COUNT(*) AS feature_count, + AVG(score) AS avg_score, + MAX(score) AS max_score, + SUM(score) AS total_score + FROM features + """) + +**Use case:** Summarize signal intensity across merged regions. + +Collect Feature Names +~~~~~~~~~~~~~~~~~~~~~ + +List the names of features that were merged: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + MERGE(interval), + STRING_AGG(name, ',') AS merged_features + FROM features + """) + +**Use case:** Track provenance of merged regions. + +Coverage Calculations +--------------------- + +Total Base Pair Coverage +~~~~~~~~~~~~~~~~~~~~~~~~ + +Calculate total genomic coverage after merging: + +.. code-block:: python + + cursor = engine.execute(""" + WITH merged AS ( + SELECT MERGE(interval) + FROM features + ) + SELECT SUM(end_pos - start_pos) AS total_coverage_bp + FROM merged + """) + +**Use case:** Calculate the total genome fraction covered by features. + +Coverage per Chromosome +~~~~~~~~~~~~~~~~~~~~~~~ + +Calculate coverage for each chromosome: + +.. code-block:: python + + cursor = engine.execute(""" + WITH merged AS ( + SELECT MERGE(interval) + FROM features + ) + SELECT + chromosome, + COUNT(*) AS region_count, + SUM(end_pos - start_pos) AS coverage_bp + FROM merged + GROUP BY chromosome + ORDER BY chromosome + """) + +**Use case:** Compare feature density across chromosomes. + +Coverage Reduction +~~~~~~~~~~~~~~~~~~ + +Compare raw vs merged coverage: + +.. 
code-block:: python + + cursor = engine.execute(""" + WITH raw_stats AS ( + SELECT + COUNT(*) AS raw_count, + SUM(end_pos - start_pos) AS raw_bp + FROM features + ), + merged_stats AS ( + SELECT + COUNT(*) AS merged_count, + SUM(end_pos - start_pos) AS merged_bp + FROM (SELECT MERGE(interval) FROM features) + ) + SELECT + raw_count, + merged_count, + raw_bp, + merged_bp, + ROUND(100.0 * merged_bp / raw_bp, 2) AS coverage_retained_pct + FROM raw_stats, merged_stats + """) + +**Use case:** Quantify the redundancy in your feature set. + +Advanced Patterns +----------------- + +Cluster Then Merge +~~~~~~~~~~~~~~~~~~ + +First cluster features, then analyze each cluster: + +.. code-block:: python + + cursor = engine.execute(""" + WITH clustered AS ( + SELECT *, CLUSTER(interval) AS cluster_id + FROM features + ) + SELECT + cluster_id, + MIN(chromosome) AS chromosome, + MIN(start_pos) AS start_pos, + MAX(end_pos) AS end_pos, + COUNT(*) AS feature_count, + STRING_AGG(name, ',') AS features + FROM clustered + GROUP BY cluster_id + ORDER BY chromosome, start_pos + """) + +**Use case:** Alternative to MERGE that preserves cluster identifiers. + +Hierarchical Clustering +~~~~~~~~~~~~~~~~~~~~~~~ + +Apply multiple clustering levels: + +.. code-block:: python + + cursor = engine.execute(""" + WITH level1 AS ( + SELECT *, CLUSTER(interval, 0) AS cluster_l1 + FROM features + ), + level2 AS ( + SELECT *, CLUSTER(interval, 1000) AS cluster_l2 + FROM level1 + ) + SELECT + cluster_l1, + cluster_l2, + chromosome, + name, + start_pos, + end_pos + FROM level2 + ORDER BY cluster_l2, cluster_l1, start_pos + """) + +**Use case:** Analyze feature relationships at multiple scales. diff --git a/docs/recipes/distance-queries.rst b/docs/recipes/distance-queries.rst new file mode 100644 index 0000000..41f9ede --- /dev/null +++ b/docs/recipes/distance-queries.rst @@ -0,0 +1,376 @@ +Distance and Proximity Queries +============================== + +This section covers patterns for calculating genomic distances and finding +nearest features using GIQL's distance operators. + +.. contents:: + :local: + :depth: 2 + +Calculating Distances +--------------------- + +Distance Between Feature Pairs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Calculate the distance between features in two tables: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + a.name AS feature_a, + b.name AS feature_b, + DISTANCE(a.interval, b.interval) AS distance + FROM features_a a + CROSS JOIN features_b b + WHERE a.chromosome = b.chromosome + ORDER BY a.name, distance + """) + +**Use case:** Generate a distance matrix between regulatory elements and genes. + +.. note:: + + Always include ``WHERE a.chromosome = b.chromosome`` to avoid comparing + features on different chromosomes (which returns NULL for distance). + +Identify Overlapping vs Proximal +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Classify relationships based on distance: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + p.name AS peak, + g.name AS gene, + DISTANCE(p.interval, g.interval) AS dist, + CASE + WHEN DISTANCE(p.interval, g.interval) = 0 THEN 'overlapping' + WHEN DISTANCE(p.interval, g.interval) <= 1000 THEN 'proximal (<1kb)' + WHEN DISTANCE(p.interval, g.interval) <= 10000 THEN 'nearby (<10kb)' + ELSE 'distant' + END AS relationship + FROM peaks p + CROSS JOIN genes g + WHERE p.chromosome = g.chromosome + """) + +**Use case:** Categorize peak-gene relationships for enhancer analysis. 
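+
+The query above repeats the ``DISTANCE`` call once per CASE branch. Computing
+the distance once in a CTE and classifying it in the outer query is equivalent
+and easier to maintain; a sketch against the same ``peaks`` and ``genes`` tables
+(assuming your backend accepts DISTANCE inside CTEs):
+
+.. code-block:: python
+
+    cursor = engine.execute("""
+        WITH dists AS (
+            SELECT
+                p.name AS peak,
+                g.name AS gene,
+                DISTANCE(p.interval, g.interval) AS dist
+            FROM peaks p
+            CROSS JOIN genes g
+            WHERE p.chromosome = g.chromosome
+        )
+        SELECT
+            peak,
+            gene,
+            dist,
+            CASE
+                WHEN dist = 0 THEN 'overlapping'
+                WHEN dist <= 1000 THEN 'proximal (<1kb)'
+                WHEN dist <= 10000 THEN 'nearby (<10kb)'
+                ELSE 'distant'
+            END AS relationship
+        FROM dists
+    """)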
+ +Filter by Maximum Distance +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Find feature pairs within a distance threshold: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + a.name, + b.name, + DISTANCE(a.interval, b.interval) AS dist + FROM features_a a + CROSS JOIN features_b b + WHERE a.chromosome = b.chromosome + AND DISTANCE(a.interval, b.interval) <= 50000 + ORDER BY dist + """) + +**Use case:** Find regulatory elements within 50kb of genes. + +K-Nearest Neighbor Queries +-------------------------- + +Find K Nearest Features +~~~~~~~~~~~~~~~~~~~~~~~ + +For each peak, find the 3 nearest genes: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest + ORDER BY peaks.name, nearest.distance + """) + +**Use case:** Annotate ChIP-seq peaks with nearby genes. + +Nearest Feature to a Specific Location +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Find the 5 nearest genes to a specific genomic coordinate: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT name, distance + FROM NEAREST(genes, reference='chr1:1000000-1001000', k=5) + ORDER BY distance + """) + +**Use case:** Explore the genomic neighborhood of a position of interest. + +Nearest with Distance Constraint +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Find nearest features within a maximum distance: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=100000 + ) AS nearest + ORDER BY peaks.name, nearest.distance + """) + +**Use case:** Find regulatory targets within 100kb, ignoring distant genes. + +Strand-Specific Queries +----------------------- + +Same-Strand Nearest Neighbors +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Find nearest features on the same strand only: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.strand, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=3, + stranded=true + ) AS nearest + ORDER BY peaks.name, nearest.distance + """) + +**Use case:** Find same-strand genes for strand-specific regulatory analysis. + +Directional Queries +------------------- + +Upstream Features +~~~~~~~~~~~~~~~~~ + +Find features upstream (5') of reference positions using signed distances: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=10, + signed=true + ) AS nearest + WHERE nearest.distance < 0 + ORDER BY peaks.name, nearest.distance DESC + """) + +**Use case:** Find genes upstream of regulatory elements. + +.. note:: + + With ``signed=true``, negative distances indicate upstream features + and positive distances indicate downstream features. + +Downstream Features +~~~~~~~~~~~~~~~~~~~ + +Find features downstream (3') of reference positions: + +.. 
code-block:: python + + cursor = engine.execute(""" + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=10, + signed=true + ) AS nearest + WHERE nearest.distance > 0 + ORDER BY peaks.name, nearest.distance + """) + +**Use case:** Identify downstream targets of promoter elements. + +Promoter-Proximal Analysis +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Find features within a specific distance window around the reference: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=10, + signed=true + ) AS nearest + WHERE nearest.distance BETWEEN -2000 AND 500 + ORDER BY peaks.name, ABS(nearest.distance) + """) + +**Use case:** Find genes with peaks in their promoter regions (-2kb to +500bp from TSS). + +Combined Parameters +------------------- + +Strand-Specific with Distance Constraint +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Find nearby same-strand features: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=50000, + stranded=true, + signed=true + ) AS nearest + WHERE nearest.distance BETWEEN -10000 AND 10000 + ORDER BY peaks.name, ABS(nearest.distance) + """) + +**Use case:** Find same-strand genes within ±10kb for promoter-enhancer analysis. + +Distance Statistics +------------------- + +Average Distance to Nearest Gene +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Calculate the average distance from peaks to their nearest gene: + +.. code-block:: python + + cursor = engine.execute(""" + WITH nearest_genes AS ( + SELECT + peaks.name AS peak, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest + ) + SELECT + COUNT(*) AS peak_count, + AVG(distance) AS avg_distance, + MIN(distance) AS min_distance, + MAX(distance) AS max_distance + FROM nearest_genes + """) + +**Use case:** Characterize the genomic distribution of peaks relative to genes. + +Distance Distribution by Chromosome +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Analyze distance patterns per chromosome: + +.. code-block:: python + + cursor = engine.execute(""" + WITH nearest_genes AS ( + SELECT + peaks.chromosome, + peaks.name AS peak, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest + ) + SELECT + chromosome, + COUNT(*) AS peak_count, + AVG(distance) AS avg_distance + FROM nearest_genes + GROUP BY chromosome + ORDER BY chromosome + """) + +**Use case:** Compare regulatory element distribution across chromosomes. + +Window Expansion Patterns +------------------------- + +Expand Search Window +~~~~~~~~~~~~~~~~~~~~ + +Find features within an expanded window around each feature: + +.. code-block:: python + + cursor = engine.execute(""" + WITH expanded AS ( + SELECT + name, + chromosome, + start_pos - 5000 AS search_start, + end_pos + 5000 AS search_end + FROM peaks + ) + SELECT + e.name AS peak, + b.* + FROM expanded e + JOIN features_b b + ON b.chromosome = e.chromosome + AND b.start_pos < e.search_end + AND b.end_pos > e.search_start + """) + +**Use case:** Find all features within 5kb flanking regions. + +.. 
note:: + + This pattern uses raw coordinate manipulation rather than the NEAREST + operator, which is useful when you need custom window shapes. diff --git a/docs/recipes/index.rst b/docs/recipes/index.rst new file mode 100644 index 0000000..f5d7a2c --- /dev/null +++ b/docs/recipes/index.rst @@ -0,0 +1,77 @@ +Recipes +======= + +This section provides practical examples and patterns for common genomic analysis tasks +using GIQL. Each recipe focuses on a specific use case with ready-to-use query patterns. + +.. contents:: + :local: + :depth: 1 + +Getting Started with Recipes +---------------------------- + +All recipes assume you have set up a GIQL engine and registered your table schemas: + +.. code-block:: python + + from giql import GIQLEngine + + with GIQLEngine(target_dialect="duckdb") as engine: + # Load your data + engine.load_csv("features_a", "file_a.bed") + engine.load_csv("features_b", "file_b.bed") + + # Register schemas with genomic column mapping + for table in ["features_a", "features_b"]: + engine.register_table_schema( + table, + { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "name": "VARCHAR", + "score": "FLOAT", + "strand": "VARCHAR", + }, + genomic_column="interval", + ) + + # Now run queries from the recipes below + cursor = engine.execute("...") + +Recipe Categories +----------------- + +:doc:`intersect-queries` + Finding overlapping features, filtering by overlap, counting overlaps, + strand-specific operations, and join patterns. + +:doc:`distance-queries` + Calculating distances between features, finding nearest neighbors, + distance-constrained searches, and directional queries. + +:doc:`clustering-queries` + Clustering overlapping intervals, distance-based clustering, + merging intervals, and aggregating cluster statistics. + +:doc:`advanced-queries` + Multi-range matching, complex filtering with joins, aggregate statistics, + window expansions, and multi-table queries. + +Coming from Bedtools? +--------------------- + +If you're familiar with bedtools and want to replicate specific commands in GIQL, +see the :doc:`bedtools-migration` guide for a complete mapping of bedtools +operations to GIQL equivalents. + +.. toctree:: + :maxdepth: 2 + :hidden: + + intersect-queries + distance-queries + clustering-queries + advanced-queries + bedtools-migration diff --git a/docs/recipes/intersect-queries.rst b/docs/recipes/intersect-queries.rst new file mode 100644 index 0000000..fee0324 --- /dev/null +++ b/docs/recipes/intersect-queries.rst @@ -0,0 +1,379 @@ +Intersection Queries +==================== + +This section covers common patterns for finding overlapping genomic features +using GIQL's spatial operators. + +.. contents:: + :local: + :depth: 2 + +Finding Overlapping Features +---------------------------- + +Basic Overlap Query +~~~~~~~~~~~~~~~~~~~ + +Find all features in table A that overlap with any feature in table B: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT DISTINCT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + """) + +**Use case:** Identify variants that fall within gene regions. + +Get All Overlap Pairs +~~~~~~~~~~~~~~~~~~~~~ + +Return every pair of overlapping features (may produce duplicates if one +feature overlaps multiple others): + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.*, b.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + """) + +**Use case:** Generate a full overlap matrix for downstream analysis. 
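+
+Because ``features_a`` and ``features_b`` often share column names,
+``SELECT a.*, b.*`` can yield duplicate column headers in the cursor output. A
+sketch that aliases only the columns you need instead (column names follow the
+shared recipe schema):
+
+.. code-block:: python
+
+    cursor = engine.execute("""
+        SELECT
+            a.name AS a_name, a.start_pos AS a_start, a.end_pos AS a_end,
+            b.name AS b_name, b.start_pos AS b_start, b.end_pos AS b_end
+        FROM features_a a, features_b b
+        WHERE a.interval INTERSECTS b.interval
+    """)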
+ +Query Against a Specific Region +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Find features overlapping a literal genomic range: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000000-2000000' + """) + +**Use case:** Extract all data for a specific chromosomal region. + +Filtering by Overlap +-------------------- + +Excluding Overlaps +~~~~~~~~~~~~~~~~~~ + +Find features in A that do NOT overlap with any feature in B: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.* + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + WHERE b.chromosome IS NULL + """) + +**Use case:** Find regulatory regions that don't overlap with known genes, +or identify variants outside of exonic regions. + +Features with Any Overlap (Unique) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Return each feature from A only once, regardless of how many B features it overlaps: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT DISTINCT a.* + FROM features_a a + INNER JOIN features_b b ON a.interval INTERSECTS b.interval + """) + +**Use case:** Get a deduplicated list of features that have at least one overlap. + +Counting Overlaps +----------------- + +Count Overlapping Features +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Count how many B features each A feature overlaps: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.*, COUNT(b.name) AS overlap_count + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand + """) + +**Use case:** Calculate how many enhancers each gene overlaps with, +or count variants per feature. + +Filter by Overlap Count +~~~~~~~~~~~~~~~~~~~~~~~ + +Find features that overlap at least N other features: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.* + FROM features_a a + INNER JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand + HAVING COUNT(*) >= 3 + """) + +**Use case:** Identify hotspot regions with high feature density. + +Strand-Specific Operations +-------------------------- + +Same-Strand Overlaps +~~~~~~~~~~~~~~~~~~~~ + +Find overlapping features on the same strand: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.*, b.name AS b_name + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand = b.strand + """) + +**Use case:** Find sense-strand overlaps for transcript analysis. + +Opposite-Strand Overlaps +~~~~~~~~~~~~~~~~~~~~~~~~ + +Find overlapping features on opposite strands: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.*, b.name AS b_name + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand != b.strand + AND a.strand IN ('+', '-') + AND b.strand IN ('+', '-') + """) + +**Use case:** Identify antisense overlaps or convergent transcription. + +Overlap Fraction Requirements +----------------------------- + +Minimum Overlap Fraction of A +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Find overlaps where at least 50% of feature A is covered: + +.. 
code-block:: python + + cursor = engine.execute(""" + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND ( + LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos) + ) >= 0.5 * (a.end_pos - a.start_pos) + """) + +**Use case:** Ensure substantial overlap rather than just touching edges. + +Minimum Overlap Fraction of B +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Find overlaps where at least 50% of feature B is covered: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND ( + LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos) + ) >= 0.5 * (b.end_pos - b.start_pos) + """) + +**Use case:** Find features that substantially cover smaller annotations. + +Reciprocal Overlap +~~~~~~~~~~~~~~~~~~ + +Require both features to have at least 50% mutual overlap: + +.. code-block:: python + + cursor = engine.execute(""" + WITH overlap_calcs AS ( + SELECT + a.*, + b.name AS b_name, + (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp, + (a.end_pos - a.start_pos) AS a_length, + (b.end_pos - b.start_pos) AS b_length + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + ) + SELECT * + FROM overlap_calcs + WHERE overlap_bp >= 0.5 * a_length + AND overlap_bp >= 0.5 * b_length + """) + +**Use case:** Find high-confidence overlaps where features mutually cover each other. + +Join Patterns +------------- + +Left Outer Join +~~~~~~~~~~~~~~~ + +Report all features from A, with B information where available: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT a.*, b.name AS overlapping_feature + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + """) + +**Use case:** Annotate features with overlap information while keeping all records. + +Calculate Overlap Amount +~~~~~~~~~~~~~~~~~~~~~~~~ + +Return the overlap size in base pairs: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + a.*, + b.name AS b_name, + (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + """) + +**Use case:** Quantify the extent of each overlap. + +Overlap with NULL Handling +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Report overlap amount for all A features, with 0 for non-overlapping: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT + a.*, + b.name AS b_name, + CASE + WHEN b.chromosome IS NULL THEN 0 + ELSE LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos) + END AS overlap_bp + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + """) + +**Use case:** Create a complete overlap report including non-overlapping features. + +Multi-Table Operations +---------------------- + +Union Multiple Sources +~~~~~~~~~~~~~~~~~~~~~~ + +Intersect A with features from multiple B tables: + +.. code-block:: python + + # Load and register multiple tables first + engine.load_csv("features_b1", "file1.bed") + engine.load_csv("features_b2", "file2.bed") + engine.load_csv("features_b3", "file3.bed") + # Register schemas for each... 
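+    # For example, a sketch of that registration loop (column types follow the
+    # shared recipe schema; adjust them to match your BED columns):
+    for table in ["features_b1", "features_b2", "features_b3"]:
+        engine.register_table_schema(
+            table,
+            {
+                "chromosome": "VARCHAR",
+                "start_pos": "BIGINT",
+                "end_pos": "BIGINT",
+                "name": "VARCHAR",
+            },
+            genomic_column="interval",
+        )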
+ + cursor = engine.execute(""" + WITH all_b_features AS ( + SELECT * FROM features_b1 + UNION ALL + SELECT * FROM features_b2 + UNION ALL + SELECT * FROM features_b3 + ) + SELECT DISTINCT a.* + FROM features_a a + INNER JOIN all_b_features b ON a.interval INTERSECTS b.interval + """) + +**Use case:** Find features overlapping any region from multiple annotation sources. + +Track Overlap Source +~~~~~~~~~~~~~~~~~~~~ + +Know which source table each overlap came from: + +.. code-block:: python + + cursor = engine.execute(""" + WITH all_b_features AS ( + SELECT *, 'source1' AS source FROM features_b1 + UNION ALL + SELECT *, 'source2' AS source FROM features_b2 + UNION ALL + SELECT *, 'source3' AS source FROM features_b3 + ) + SELECT a.*, b.name AS overlap_name, b.source + FROM features_a a + INNER JOIN all_b_features b ON a.interval INTERSECTS b.interval + """) + +**Use case:** Track which annotation database each overlap originated from. + +Complex Filtering +----------------- + +Overlap with Quality Filters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Combine spatial and attribute filters: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT v.*, g.name AS gene_name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE v.quality >= 30 + AND g.biotype = 'protein_coding' + ORDER BY v.chromosome, v.start_pos + """) + +**Use case:** Find high-quality variants in protein-coding genes. + +Specific Target Genes +~~~~~~~~~~~~~~~~~~~~~ + +Find overlaps with a specific set of genes: + +.. code-block:: python + + cursor = engine.execute(""" + SELECT v.*, g.name AS gene_name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE g.name IN ('BRCA1', 'BRCA2', 'TP53', 'EGFR') + ORDER BY g.name, v.start_pos + """) + +**Use case:** Extract variants in clinically relevant genes. diff --git a/docs/reference/changelog.rst b/docs/reference/changelog.rst new file mode 100644 index 0000000..c541464 --- /dev/null +++ b/docs/reference/changelog.rst @@ -0,0 +1,87 @@ +Changelog +========= + +All notable changes to GIQL are documented here. + +The format is based on `Keep a Changelog <https://keepachangelog.com/en/1.0.0/>`_, +and this project adheres to `Semantic Versioning <https://semver.org/spec/v2.0.0.html>`_. + +.. 
contents:: + :local: + :depth: 1 + +Unreleased +---------- + +*Changes in development, not yet released.* + +Added +~~~~~ + +- Comprehensive documentation with operator reference +- Recipe-based examples for common patterns +- Bedtools migration guide +- Multi-backend support guide +- Performance optimization guide + +Changed +~~~~~~~ + +- Documentation restructured with operator-first organization + +0.1.0 - Initial Release +----------------------- + +*Initial release of GIQL.* + +Added +~~~~~ + +**Core Features:** + +- SQL dialect for genomic interval queries +- Transpilation to standard SQL +- Multi-database backend support (DuckDB, SQLite) + +**Spatial Operators:** + +- ``INTERSECTS`` - Test range overlap +- ``CONTAINS`` - Test containment +- ``WITHIN`` - Test if range is within another + +**Distance Operators:** + +- ``DISTANCE`` - Calculate genomic distance +- ``NEAREST`` - K-nearest neighbor queries + +**Aggregation Operators:** + +- ``CLUSTER`` - Assign cluster IDs to overlapping intervals +- ``MERGE`` - Combine overlapping intervals + +**Set Quantifiers:** + +- ``ANY`` - Match any of multiple ranges +- ``ALL`` - Match all of multiple ranges + +**API:** + +- ``GIQLEngine`` - Main engine class +- ``execute()`` - Execute GIQL queries +- ``transpile()`` - Convert GIQL to SQL +- ``register_table_schema()`` - Register table schemas +- ``load_csv()`` - Load CSV files + +Version History +--------------- + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Version + - Date + - Highlights + * - 0.1.0 + - TBD + - Initial release with core operators diff --git a/docs/reference/operator-matrix.rst b/docs/reference/operator-matrix.rst new file mode 100644 index 0000000..d761604 --- /dev/null +++ b/docs/reference/operator-matrix.rst @@ -0,0 +1,199 @@ +Operator Compatibility Matrix +============================= + +This reference shows which GIQL operators are supported by each database backend. + +Backend Support Overview +------------------------ + +.. list-table:: + :header-rows: 1 + :widths: 30 15 15 15 25 + + * - Operator + - DuckDB + - SQLite + - PostgreSQL + - Notes + * - **Spatial Operators** + - + - + - + - + * - INTERSECTS + - ✅ + - ✅ + - 🚧 + - + * - CONTAINS + - ✅ + - ✅ + - 🚧 + - + * - WITHIN + - ✅ + - ✅ + - 🚧 + - + * - **Distance Operators** + - + - + - + - + * - DISTANCE + - ✅ + - ✅ + - 🚧 + - + * - NEAREST + - ✅ + - ⚠️ + - 🚧 + - SQLite: slower for large k + * - **Aggregation Operators** + - + - + - + - + * - CLUSTER + - ✅ + - ✅ + - 🚧 + - + * - MERGE + - ✅ + - ✅ + - 🚧 + - + * - **Set Quantifiers** + - + - + - + - + * - ANY + - ✅ + - ✅ + - 🚧 + - + * - ALL + - ✅ + - ✅ + - 🚧 + - + +Legend +------ + +.. list-table:: + :widths: 10 90 + + * - ✅ + - **Full support** - Operator works with full functionality + * - ⚠️ + - **Partial support** - Operator works but with limitations + * - 🚧 + - **Planned** - Support planned for future release + * - ❌ + - **Not supported** - Operator not available for this backend + +Operator Details by Backend +--------------------------- + +DuckDB +~~~~~~ + +All operators are fully supported on DuckDB. DuckDB is the recommended backend +for most use cases due to its excellent analytical query performance. 
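+
+Because the dialect is fixed when the engine is constructed, the same GIQL text
+can be compared across backends by transpiling it under each one. A sketch
+reusing the ``transpile()`` API (as in the quickstart, no schema registration is
+required just to inspect the generated SQL):
+
+.. code-block:: python
+
+    from giql import GIQLEngine
+
+    query = "SELECT * FROM features WHERE interval INTERSECTS 'chr1:1000-2000'"
+    for dialect in ("duckdb", "sqlite"):
+        with GIQLEngine(target_dialect=dialect) as engine:
+            print(dialect, engine.transpile(query))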
+ +**Strengths:** + +- Efficient columnar storage +- Parallel query execution +- Full LATERAL join support (used by NEAREST) +- Rich window function support (used by CLUSTER) + +SQLite +~~~~~~ + +All operators work on SQLite, with some performance considerations: + +**NEAREST operator:** + +- Works correctly but may be slower for large k values +- Performance depends on table size and index availability +- Consider using ``max_distance`` to limit search space + +**CLUSTER and MERGE:** + +- Full functionality +- May be slower than DuckDB for very large datasets + +PostgreSQL (Planned) +~~~~~~~~~~~~~~~~~~~~ + +PostgreSQL support is planned for a future release. Expected to have full +operator support. + +SQL Feature Requirements +------------------------ + +GIQL operators require certain SQL features from the underlying database: + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - SQL Feature + - Used By + * - Basic predicates (AND, OR, comparison) + - All spatial operators + * - CASE expressions + - DISTANCE, conditional logic + * - LATERAL joins + - NEAREST + * - Window functions + - CLUSTER + * - Aggregate functions + - MERGE, COUNT, etc. + * - Common Table Expressions (WITH) + - Complex queries, MERGE + +Version Compatibility +--------------------- + +.. list-table:: + :header-rows: 1 + :widths: 25 25 50 + + * - Backend + - Minimum Version + - Notes + * - DuckDB + - 0.8.0+ + - Recommended: latest stable + * - SQLite + - 3.25.0+ + - Requires window function support + * - PostgreSQL + - 12+ + - Planned + +Checking Compatibility +---------------------- + +Verify operator support at runtime: + +.. code-block:: python + + from giql import GIQLEngine + + with GIQLEngine(target_dialect="duckdb") as engine: + # Transpile a query to verify it works + try: + sql = engine.transpile(""" + SELECT * FROM features + WHERE interval INTERSECTS 'chr1:1000-2000' + """) + print("INTERSECTS supported") + except Exception as e: + print(f"INTERSECTS not supported: {e}") diff --git a/docs/reference/syntax-reference.rst b/docs/reference/syntax-reference.rst new file mode 100644 index 0000000..48cfb14 --- /dev/null +++ b/docs/reference/syntax-reference.rst @@ -0,0 +1,329 @@ +Syntax Reference +================ + +Quick reference for GIQL syntax and operators. + +.. contents:: + :local: + :depth: 2 + +Genomic Range Literals +---------------------- + +Format +~~~~~~ + +Genomic ranges are specified as string literals: + +.. code-block:: text + + 'chromosome:start-end' + +Examples +~~~~~~~~ + +.. code-block:: sql + + 'chr1:1000-2000' -- Range on chr1 from 1000 to 2000 + 'chr1:1000' -- Point at position 1000 + 'chrX:50000-100000' -- Range on chrX + 'chr1:0-1000000' -- First megabase of chr1 + +Coordinate System +~~~~~~~~~~~~~~~~~ + +- **0-based start**: First base is position 0 +- **Half-open interval**: [start, end) - start inclusive, end exclusive +- Range ``chr1:100-200`` covers bases 100 through 199 + +Spatial Operators +----------------- + +INTERSECTS +~~~~~~~~~~ + +Test if ranges overlap. + +.. code-block:: sql + + -- Against literal + interval INTERSECTS 'chr1:1000-2000' + + -- Column to column + a.interval INTERSECTS b.interval + + -- In JOIN + JOIN table ON a.interval INTERSECTS b.interval + +CONTAINS +~~~~~~~~ + +Test if one range fully contains another. + +.. 
code-block:: sql + + -- Range contains point + interval CONTAINS 'chr1:1500' + + -- Range contains range + interval CONTAINS 'chr1:1200-1800' + + -- Column to column + gene.interval CONTAINS exon.interval + +WITHIN +~~~~~~ + +Test if one range is fully within another. + +.. code-block:: sql + + -- Range within literal + interval WITHIN 'chr1:1000-5000' + + -- Column to column + exon.interval WITHIN gene.interval + +Distance Operators +------------------ + +DISTANCE +~~~~~~~~ + +Calculate distance between two positions. + +.. code-block:: sql + + DISTANCE(a.interval, b.interval) + +Returns: + +- ``0`` for overlapping ranges +- Positive integer (gap in bp) for non-overlapping +- ``NULL`` for different chromosomes + +NEAREST +~~~~~~~ + +Find k-nearest neighbors. + +.. code-block:: sql + + -- Basic syntax + CROSS JOIN LATERAL NEAREST( + target_table, + reference=source.interval, + k=N + ) AS alias + + -- With parameters + NEAREST( + target_table, + reference=interval, + k=5, + max_distance=100000, + stranded=true, + signed=true + ) + + -- Standalone + SELECT * FROM NEAREST(table, reference='chr1:1000-2000', k=5) + +Parameters: + +- ``k``: Number of neighbors (default: 1) +- ``max_distance``: Maximum distance threshold +- ``stranded``: Same-strand only (default: false) +- ``signed``: Signed distances (default: false) + +Aggregation Operators +--------------------- + +CLUSTER +~~~~~~~ + +Assign cluster IDs to overlapping intervals. + +.. code-block:: sql + + -- Basic + CLUSTER(interval) AS cluster_id + + -- With distance + CLUSTER(interval, 1000) AS cluster_id + + -- Strand-specific + CLUSTER(interval, stranded=true) AS cluster_id + + -- Combined + CLUSTER(interval, 1000, stranded=true) AS cluster_id + +MERGE +~~~~~ + +Combine overlapping intervals. + +.. code-block:: sql + + -- Basic + SELECT MERGE(interval) FROM table + + -- With distance + SELECT MERGE(interval, 1000) FROM table + + -- Strand-specific + SELECT MERGE(interval, stranded=true) FROM table + + -- With aggregations + SELECT MERGE(interval), COUNT(*), AVG(score) FROM table + +Set Quantifiers +--------------- + +ANY +~~~ + +Match any of multiple ranges. + +.. code-block:: sql + + interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000') + interval CONTAINS ANY('chr1:1500', 'chr1:2500') + interval WITHIN ANY('chr1:0-10000', 'chr2:0-10000') + +ALL +~~~ + +Match all of multiple ranges. + +.. code-block:: sql + + interval CONTAINS ALL('chr1:1500', 'chr1:1600', 'chr1:1700') + interval INTERSECTS ALL('chr1:1000-1100', 'chr1:1050-1150') + +Query Patterns +-------------- + +Basic Filter +~~~~~~~~~~~~ + +.. code-block:: sql + + SELECT * FROM table + WHERE interval INTERSECTS 'chr1:1000-2000' + +Join +~~~~ + +.. code-block:: sql + + SELECT a.*, b.name + FROM table_a a + JOIN table_b b ON a.interval INTERSECTS b.interval + +Left Outer Join +~~~~~~~~~~~~~~~ + +.. code-block:: sql + + SELECT a.*, b.name + FROM table_a a + LEFT JOIN table_b b ON a.interval INTERSECTS b.interval + +Exclusion (NOT IN) +~~~~~~~~~~~~~~~~~~ + +.. code-block:: sql + + SELECT a.* + FROM table_a a + LEFT JOIN table_b b ON a.interval INTERSECTS b.interval + WHERE b.chromosome IS NULL + +Count Overlaps +~~~~~~~~~~~~~~ + +.. code-block:: sql + + SELECT a.*, COUNT(b.name) AS overlap_count + FROM table_a a + LEFT JOIN table_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chromosome, a.start_pos, a.end_pos, ... + +K-Nearest Neighbors +~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: sql + + SELECT source.*, nearest.name, nearest.distance + FROM source + CROSS JOIN LATERAL NEAREST(target, reference=source.interval, k=5) AS nearest + +Clustering +~~~~~~~~~~ + +.. code-block:: sql + + SELECT *, CLUSTER(interval) AS cluster_id + FROM table + ORDER BY chromosome, start_pos + +Merging +~~~~~~~ + +.. code-block:: sql + + SELECT MERGE(interval), COUNT(*) AS count + FROM table + +Engine Methods +-------------- + +execute() +~~~~~~~~~ + +Execute a GIQL query and return a cursor. + +.. code-block:: python + + cursor = engine.execute("SELECT * FROM table WHERE interval INTERSECTS 'chr1:1000-2000'") + +transpile() +~~~~~~~~~~~ + +Convert GIQL to SQL without executing. + +.. code-block:: python + + sql = engine.transpile("SELECT * FROM table WHERE interval INTERSECTS 'chr1:1000-2000'") + +register_table_schema() +~~~~~~~~~~~~~~~~~~~~~~~ + +Register a table's schema for genomic operations. + +.. code-block:: python + + engine.register_table_schema( + "table_name", + { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "name": "VARCHAR", + }, + genomic_column="interval", + chrom_col="chromosome", # optional, default: "chromosome" + start_col="start_pos", # optional, default: "start_pos" + end_col="end_pos", # optional, default: "end_pos" + ) + +load_csv() +~~~~~~~~~~ + +Load a CSV file into a table. + +.. code-block:: python + + engine.load_csv("table_name", "file.csv") + engine.load_csv("table_name", "file.tsv") # DuckDB's read_csv_auto infers the delimiter diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..78a122d --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,8 @@ +# Sphinx and extensions +sphinx>=7.0.0 +sphinx-rtd-theme>=2.0.0 +sphinx-autodoc-typehints>=1.25.0 + +# Project dependencies needed for autodoc +sqlglot>=23.0.0 +pandas>=2.0.0 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..25874b1 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,91 @@ +[build-system] +build-backend = "hatchling.build" +requires = [ + "debugpy", + "hatchling", + "packaging", + "gitpython", + "toml", + "typing-extensions", +] + +[project] +authors = [ + { name = "Nezar Abdennur", email = "nabdennur@gmail.com" }, + { name = "Conrad Bzura", email = "conradbzura@gmail.com" }, +] +dependencies = [ + "click>=8.3.0", + "duckdb>=1.4.0", + "oxbow>=0.4.0", + "pandas>=2.0.0", + "psycopg2-binary>=2.9.10", + "sqlglot>=20.0.0", + "sqlparse>=0.4.0", +] +description = "Genomic Interval Query Language - SQL dialect for genomic range queries" +dynamic = ["version"] +maintainers = [ + { name = "Nezar Abdennur", email = "nabdennur@gmail.com" }, + { name = "Conrad Bzura", email = "conradbzura@gmail.com" }, +] +name = "giql" +readme = "README.md" +requires-python = ">=3.11" + +[project.scripts] +giql = "giql.cli:cli" + +[project.optional-dependencies] +all = [ + "duckdb>=1.4.0", + "mysql-connector-python>=8.0.0", + "psycopg2-binary>=2.9.0", +] +dev = ["pytest-cov>=4.0.0", "pytest>=7.0.0", "ruff>=0.1.0", "hypothesis", "pybedtools"] +duckdb = ["duckdb>=1.4.0"] +mysql = ["mysql-connector-python>=8.0.0"] +postgres = ["psycopg2-binary>=2.9.0"] +sqlite = [] + +[tool.hatch.metadata.hooks.custom] +path = "build-hooks/metadata.py" + +[tool.pytest.ini_options] +addopts = "--cov --cov-config=.coveragerc" + +[tool.ruff] +line-length = 89 + +[tool.ruff.format] +docstring-code-format = true +quote-style = "double" + +[tool.ruff.lint] +select = ["E", "F", "I"] + +[tool.ruff.lint.isort] +combine-as-imports = false +force-single-line = 
true +known-first-party = ["giql"] + +[tool.pixi.workspace] +channels = ["conda-forge", "bioconda"] +platforms = ["osx-arm64", "osx-64", "linux-64"] + +[tool.pixi.dependencies] +python = ">=3.11" +bedtools = ">=2.31.0" +pybedtools = ">=0.9.0" +pytest = ">=7.0.0" +pytest-cov = ">=4.0.0" +click = ">=8.3.0" +duckdb = ">=1.4.0" +pandas = ">=2.0.0" +pyarrow = ">=19.0.0" +psycopg2-binary = ">=2.9.10" +sqlglot = ">=20.0.0" +pip = "*" +oxbow = ">=0.4.0" +sqlparse = ">=0.4.0" +hypothesis = ">=6.148.2,<7" diff --git a/src/giql/__init__.py b/src/giql/__init__.py new file mode 100644 index 0000000..9ee8e16 --- /dev/null +++ b/src/giql/__init__.py @@ -0,0 +1,52 @@ +"""GIQL - Genomic Interval Query Language. + +A SQL dialect for genomic range queries with multi-database support. + +This package provides: + - GIQL dialect extending SQL with spatial operators + - Query engine supporting multiple backends (DuckDB, SQLite) + - Range parser for genomic coordinate strings + - Schema management for genomic data +""" + +from giql.constants import DEFAULT_CHROM_COL as DEFAULT_CHROM_COL +from giql.constants import DEFAULT_END_COL as DEFAULT_END_COL +from giql.constants import DEFAULT_GENOMIC_COL as DEFAULT_GENOMIC_COL +from giql.constants import DEFAULT_START_COL as DEFAULT_START_COL +from giql.constants import DEFAULT_STRAND_COL as DEFAULT_STRAND_COL +from giql.dialect import GIQLDialect as GIQLDialect +from giql.engine import DialectType as DialectType +from giql.engine import GIQLEngine as GIQLEngine +from giql.generators import BaseGIQLGenerator as BaseGIQLGenerator +from giql.generators import GIQLDuckDBGenerator as GIQLDuckDBGenerator +from giql.protocols import CursorLike as CursorLike +from giql.range_parser import CoordinateSystem as CoordinateSystem +from giql.range_parser import IntervalType as IntervalType +from giql.range_parser import ParsedRange as ParsedRange +from giql.range_parser import RangeParser as RangeParser +from giql.schema import ColumnInfo as ColumnInfo +from giql.schema import SchemaInfo as SchemaInfo +from giql.schema import TableSchema as TableSchema + +__version__ = "0.1.0" + + +__all__ = [ + "BaseGIQLGenerator", + "GIQLDuckDBGenerator", + "GIQLEngine", + "DialectType", + "GIQLDialect", + "RangeParser", + "ParsedRange", + "CoordinateSystem", + "IntervalType", + "SchemaInfo", + "TableSchema", + "ColumnInfo", + "DEFAULT_CHROM_COL", + "DEFAULT_START_COL", + "DEFAULT_END_COL", + "DEFAULT_STRAND_COL", + "DEFAULT_GENOMIC_COL", +] diff --git a/src/giql/cli.py b/src/giql/cli.py new file mode 100644 index 0000000..d714075 --- /dev/null +++ b/src/giql/cli.py @@ -0,0 +1,683 @@ +"""Command-line interface for GIQL. + +This module provides a CLI that mirrors bedtools intersect functionality +using GIQL's genomic query capabilities. +""" + +import sys +from pathlib import Path + +import click +import duckdb +from oxbow import from_bam +from oxbow import from_bed +from oxbow import from_gff +from oxbow import from_gtf +from oxbow import from_vcf + +from giql import GIQLEngine + + +@click.group() +@click.version_option() +def cli(): + """GIQL - Genomic Interval Query Language. + + SQL-based toolkit for genomic range queries. + """ + pass + + +def _detect_file_format(file_path: Path) -> str: + """Detect genomic file format from file extension. 
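+ + For example, ``variants.vcf.gz`` and ``variants.vcf`` both resolve to ``'vcf'``.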
+ + :param file_path: + Path to the file + :return: + Format identifier: 'bed', 'bam', 'vcf', 'gff', 'gtf' + :raises click.ClickException: + If format cannot be determined + """ + # Handle compressed files + suffixes = file_path.suffixes + if suffixes[-1] == ".gz": + # Remove .gz and check the actual format + ext = suffixes[-2] if len(suffixes) >= 2 else "" + else: + ext = file_path.suffix + + ext = ext.lower() + + format_map = { + ".bed": "bed", + ".bam": "bam", + ".vcf": "vcf", + ".gff": "gff", + ".gff3": "gff", + ".gtf": "gtf", + } + + if ext in format_map: + return format_map[ext] + + raise click.ClickException( + f"Unsupported file format: {ext}. Supported formats: BED, BAM, VCF, GFF, GTF" + ) + + +def _load_genomic_file( + conn: duckdb.DuckDBPyConnection, file_path: Path, table_name: str +) -> dict[str, str]: + """Load genomic file using appropriate oxbow function. + + :param conn: + DuckDB connection + :param file_path: + Path to genomic file + :param table_name: + Name for the table to create + :return: + Dictionary mapping column names to types + :raises click.ClickException: + If file cannot be loaded + """ + fmt = _detect_file_format(file_path) + compression = "gzip" if file_path.suffix == ".gz" else None + + try: + match fmt: + case "bed": + df = from_bed(str(file_path), compression=compression).to_duckdb(conn) + case "bam": + df = from_bam(str(file_path)).to_duckdb(conn) + case "vcf": + df = from_vcf(str(file_path), compression=compression).to_duckdb(conn) + case "gff": + df = from_gff(str(file_path), compression=compression).to_duckdb(conn) + case "gtf": + df = from_gtf(str(file_path), compression=compression).to_duckdb(conn) + case _: + raise click.ClickException(f"Unsupported format: {fmt}") + + conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df") + + # Get column information + col_info = conn.execute(f"DESCRIBE {table_name}").fetchall() + return {col[0]: col[1] for col in col_info} + + except Exception as e: + raise click.ClickException(f"Failed to load {file_path}: {e}") + + +def _expand_rest_columns(df): + """Expand 'rest' columns from BED files into separate columns. + + BED files store extra fields beyond chrom/start/end in a 'rest' column + as a tab-delimited string. This function expands those into separate columns + to match bedtools output format. 
+ + :param df: + DataFrame with potential 'rest' columns + :return: + DataFrame with rest columns expanded + """ + import pandas as pd + + # pandas.read_sql can return duplicate column names when joining + # Find all 'rest' column positions + rest_indices = [i for i, col in enumerate(df.columns) if col == "rest"] + + if not rest_indices: + return df + + # Build new dataframe with expanded columns + # We need to handle duplicate column names, so we can't use a dict + new_data = {} + new_col_names = [] + + for i, col in enumerate(df.columns): + if col == "rest" and i in rest_indices: + # Expand this rest column + col_data = df.iloc[:, i] + expanded = col_data.fillna("").astype(str).str.split("\t", expand=True) + + # Add all expanded columns with unique names + for j in range(expanded.shape[1]): + col_name = f"field_{j + 4}" + # Make unique if duplicate + base_name = col_name + counter = 0 + while col_name in new_col_names: + counter += 1 + col_name = f"{base_name}_{counter}" + new_col_names.append(col_name) + new_data[col_name] = expanded[j] + else: + # Keep non-rest columns as-is + # Make unique names for duplicates + col_name = col + base_name = col_name + counter = 0 + while col_name in new_col_names: + counter += 1 + col_name = f"{base_name}_{counter}" + new_col_names.append(col_name) + new_data[col_name] = df.iloc[:, i] + + # Rebuild dataframe with explicit column order + result = pd.DataFrame(new_data, columns=new_col_names) + return result + + +def _detect_genomic_columns(columns: dict[str, str]) -> dict[str, str | None]: + """Detect genomic coordinate columns from available columns. + + :param columns: + Dictionary of column name -> type + :return: + Dictionary with keys: chrom_col, start_col, end_col, strand_col + """ + col_names = {c.lower(): c for c in columns.keys()} + + # Chromosome column patterns (in priority order) + chrom_col = None + for pattern in ["chrom", "seqid", "chr", "chromosome", "contig", "seqname"]: + if pattern in col_names: + chrom_col = col_names[pattern] + break + + # Start column patterns + start_col = None + for pattern in [ + "start", + "chromstart", + "pos", + "begin", + "txstart", + "cdsstart", + "thickstart", + ]: + if pattern in col_names: + start_col = col_names[pattern] + break + + # End column patterns + end_col = None + for pattern in [ + "end", + "chromend", + "stop", + "txend", + "cdsend", + "thickend", + ]: + if pattern in col_names: + end_col = col_names[pattern] + break + + # Strand column patterns + strand_col = None + for pattern in ["strand", "str", "orientation"]: + if pattern in col_names: + strand_col = col_names[pattern] + break + + return { + "chrom_col": chrom_col, + "start_col": start_col, + "end_col": end_col, + "strand_col": strand_col, + } + + +@cli.command() +@click.option( + "-a", + "--file-a", + required=True, + type=click.Path(exists=True), + help="BAM/BED/GFF/VCF file 'A'. Each feature in A is compared to B.", +) +@click.option( + "-b", + "--file-b", + required=True, + multiple=True, + type=click.Path(exists=True), + help="One or more BAM/BED/GFF/VCF files for comparison.", +) +@click.option( + "-wa", + "--write-a", + is_flag=True, + help="Write the original entry in A for each overlap.", +) +@click.option( + "-wb", + "--write-b", + is_flag=True, + help="Write the original entry in B for each overlap.", +) +@click.option( + "-loj", + "--left-outer-join", + is_flag=True, + help="Perform left outer join. 
Report all A features with NULL B when no overlap.", +) +@click.option( + "-wo", + "--write-overlap", + is_flag=True, + help="Write the number of overlapping base pairs between features.", +) +@click.option( + "-wao", + "--write-all-overlap", + is_flag=True, + help="Like -wo but includes A features with zero overlap.", +) +@click.option( + "-u", + "--unique", + is_flag=True, + help="Report each A feature only once if any overlap exists in B.", +) +@click.option( + "-c", + "--count", + is_flag=True, + help="For each entry in A, report the number of overlaps in B.", +) +@click.option( + "-v", + "--invert", + is_flag=True, + help="Only report entries in A that have no overlap in B.", +) +@click.option( + "-f", + "--fraction-a", + type=float, + help="Minimum overlap as fraction of A.", +) +@click.option( + "-F", + "--fraction-b", + type=float, + help="Minimum overlap as fraction of B.", +) +@click.option( + "-r", + "--reciprocal", + is_flag=True, + help="Require reciprocal overlap fraction for both A and B.", +) +@click.option( + "-e", + "--either", + is_flag=True, + help="Require that -f OR -F be satisfied (not both).", +) +@click.option( + "-s", + "--same-strand", + is_flag=True, + help="Require same strand for overlaps.", +) +@click.option( + "-S", + "--opposite-strand", + is_flag=True, + help="Require opposite strand for overlaps.", +) +@click.option( + "--header", + is_flag=True, + help="Print the header from A before results.", +) +@click.option( + "--names", + multiple=True, + help="Aliases for B files (instead of file numbers).", +) +@click.option( + "-sorted", + "--sorted-input", + is_flag=True, + help="For compatibility with bedtools (currently ignored).", +) +@click.option( + "--chunksize", + type=int, + help="Process results in chunks of N rows (streaming mode for large datasets).", +) +def intersect( + file_a, + file_b, + write_a, + write_b, + left_outer_join, + write_overlap, + write_all_overlap, + unique, + count, + invert, + fraction_a, + fraction_b, + reciprocal, + either, + same_strand, + opposite_strand, + header, + names, + sorted_input, + chunksize, +): + """Find overlaps between genomic features. + + Similar to bedtools intersect, this command finds overlapping intervals + between files A and B using GIQL's spatial operators. + + Supports BED, BAM, VCF, GFF, and GTF formats (gzip compressed or uncompressed). + """ + # Validate conflicting options + if same_strand and opposite_strand: + raise click.UsageError("Cannot use -s and -S together") + + output_modes = [ + write_a, + write_b, + left_outer_join, + write_overlap, + write_all_overlap, + unique, + count, + invert, + ] + if sum(output_modes) > 1: + raise click.UsageError("Can only specify one output mode") + + # Create DuckDB connection + conn = duckdb.connect() + + # Initialize engine with existing connection + engine = GIQLEngine(target_dialect="duckdb", connection=conn) + + try: + # Load file A + file_a_path = Path(file_a) + table_a = "file_a" + columns_a = _load_genomic_file(conn, file_a_path, table_a) + + # Detect genomic columns + genomic_cols_a = _detect_genomic_columns(columns_a) + + if not all( + [ + genomic_cols_a["chrom_col"], + genomic_cols_a["start_col"], + genomic_cols_a["end_col"], + ] + ): + raise click.ClickException( + f"Could not detect genomic columns in {file_a}. 
" + f"Found columns: {list(columns_a.keys())}" + ) + + # Register schema for file A + engine.register_table_schema( + table_a, + columns_a, + genomic_column="interval", + chrom_col=genomic_cols_a["chrom_col"], + start_col=genomic_cols_a["start_col"], + end_col=genomic_cols_a["end_col"], + strand_col=genomic_cols_a["strand_col"], + ) + + # Process file(s) B + results = [] + for idx, b_file in enumerate(file_b): + b_path = Path(b_file) + table_b = f"file_b_{idx}" + + # Load file B + columns_b = _load_genomic_file(conn, b_path, table_b) + + # Detect genomic columns in B + genomic_cols_b = _detect_genomic_columns(columns_b) + + if not all( + [ + genomic_cols_b["chrom_col"], + genomic_cols_b["start_col"], + genomic_cols_b["end_col"], + ] + ): + raise click.ClickException( + f"Could not detect genomic columns in {b_file}" + ) + + # Register schema for file B + engine.register_table_schema( + table_b, + columns_b, + genomic_column="region", + chrom_col=genomic_cols_b["chrom_col"], + start_col=genomic_cols_b["start_col"], + end_col=genomic_cols_b["end_col"], + strand_col=genomic_cols_b["strand_col"], + ) + + # Build query based on options + query = _build_intersect_query( + table_a=table_a, + table_b=table_b, + chrom_a=genomic_cols_a["chrom_col"], + start_a=genomic_cols_a["start_col"], + end_a=genomic_cols_a["end_col"], + strand_a=genomic_cols_a["strand_col"], + chrom_b=genomic_cols_b["chrom_col"], + start_b=genomic_cols_b["start_col"], + end_b=genomic_cols_b["end_col"], + strand_b=genomic_cols_b["strand_col"], + write_a=write_a, + write_b=write_b, + left_outer_join=left_outer_join, + write_overlap=write_overlap, + write_all_overlap=write_all_overlap, + unique=unique, + count=count, + invert=invert, + same_strand=same_strand, + opposite_strand=opposite_strand, + fraction_a=fraction_a, + fraction_b=fraction_b, + reciprocal=reciprocal, + either=either, + ) + + # Execute query and get cursor + cursor = engine.execute(query) + + # Get column names + col_names = [desc[0] for desc in cursor.description] + + # Output header if requested (only once, before first row) + if header and idx == 0: + print("\t".join(col_names)) + + # Stream results row by row + while True: + row = cursor.fetchone() + if row is None: + break + # Expand rest columns inline + output_fields = [] + for i, value in enumerate(row): + col_name = col_names[i] + if col_name == "rest" and value: + # Expand rest column - split on tabs + rest_fields = str(value).split("\t") + output_fields.extend(rest_fields) + else: + output_fields.append(str(value) if value is not None else "") + + # Add file identifier if needed + if names and idx < len(names): + output_fields.append(names[idx]) + elif len(file_b) > 1: + output_fields.append(b_path.name) + + # Output row as TSV + print("\t".join(output_fields)) + + finally: + engine.close() + + +def _build_intersect_query( + table_a: str, + table_b: str, + chrom_a: str, + start_a: str, + end_a: str, + strand_a: str | None, + chrom_b: str, + start_b: str, + end_b: str, + strand_b: str | None, + write_a: bool = False, + write_b: bool = False, + left_outer_join: bool = False, + write_overlap: bool = False, + write_all_overlap: bool = False, + unique: bool = False, + count: bool = False, + invert: bool = False, + same_strand: bool = False, + opposite_strand: bool = False, + fraction_a: float | None = None, + fraction_b: float | None = None, + reciprocal: bool = False, + either: bool = False, +) -> str: + """Build GIQL query based on intersect options.""" + + # Build strand filter if needed + strand_filter = 
"" + if same_strand and strand_a and strand_b: + strand_filter = f' AND a."{strand_a}" = b."{strand_b}"' + elif opposite_strand and strand_a and strand_b: + strand_filter = f' AND a."{strand_a}" != b."{strand_b}"' + + # Build fraction filter if needed + fraction_filter = "" + if fraction_a or fraction_b: + filters = [] + + if fraction_a: + # Overlap must be at least fraction_a of A's length + overlap_expr = ( + f'LEAST(a."{end_a}", b."{end_b}") - ' + f'GREATEST(a."{start_a}", b."{start_b}")' + ) + a_length = f'(a."{end_a}" - a."{start_a}")' + filters.append(f"({overlap_expr}::FLOAT / {a_length} >= {fraction_a})") + + if fraction_b: + # Overlap must be at least fraction_b of B's length + overlap_expr = ( + f'LEAST(a."{end_a}", b."{end_b}") - ' + f'GREATEST(a."{start_a}", b."{start_b}")' + ) + b_length = f'(b."{end_b}" - b."{start_b}")' + filters.append(f"({overlap_expr}::FLOAT / {b_length} >= {fraction_b})") + + # Combine filters based on reciprocal/either flags + if reciprocal and len(filters) == 2: + # Both must be satisfied (AND) + fraction_filter = f" AND ({filters[0]} AND {filters[1]})" + elif either and len(filters) == 2: + # Either must be satisfied (OR) + fraction_filter = f" AND ({filters[0]} OR {filters[1]})" + elif filters: + # Just one filter or default behavior + fraction_filter = f" AND {' AND '.join(filters)}" + + if invert: + # Only features in A with no overlap in B + where_clause = f"a.interval INTERSECTS b.region{strand_filter}{fraction_filter}" + return f""" + SELECT a.* + FROM {table_a} a + WHERE NOT EXISTS ( + SELECT 1 FROM {table_b} b + WHERE {where_clause} + ) + """ + + if count: + # Count overlaps + # Get all columns from table A for GROUP BY + on_clause = f"a.interval INTERSECTS b.region{strand_filter}{fraction_filter}" + return f""" + SELECT a.*, COUNT(b.\"{chrom_b}\") as overlap_count + FROM {table_a} a + LEFT JOIN {table_b} b ON {on_clause} + GROUP BY ALL + """ + + if unique: + # Report each A feature only once if overlaps exist + on_clause = f"a.interval INTERSECTS b.region{strand_filter}{fraction_filter}" + return f""" + SELECT DISTINCT a.* + FROM {table_a} a + JOIN {table_b} b ON {on_clause} + """ + + if left_outer_join or write_all_overlap: + # Left outer join + join_type = "LEFT JOIN" + else: + join_type = "JOIN" + + # Build select clause + if write_a and not write_b: + select_clause = "a.*" + elif write_b and not write_a: + select_clause = "b.*" + else: + # Default: write both A and B + select_clause = "a.*, b.*" + + # Add overlap calculation if requested + if write_overlap or write_all_overlap: + # Calculate overlap size: min(end_a, end_b) - max(start_a, start_b) + overlap_expr = f""" + CASE + WHEN b.\"{chrom_b}\" IS NULL THEN 0 + ELSE GREATEST(0, + LEAST(a.\"{end_a}\", b.\"{end_b}\") - + GREATEST(a.\"{start_a}\", b.\"{start_b}\") + ) + END as overlap_bp + """ + select_clause = f"{select_clause}, {overlap_expr}" + + # Build ON clause + on_clause = f"a.interval INTERSECTS b.region{strand_filter}{fraction_filter}" + + # Build base query + query = f""" + SELECT {select_clause} + FROM {table_a} a + {join_type} {table_b} b ON {on_clause} + """ + + return query + + +if __name__ == "__main__": + cli() diff --git a/src/giql/constants.py b/src/giql/constants.py new file mode 100644 index 0000000..daa5896 --- /dev/null +++ b/src/giql/constants.py @@ -0,0 +1,11 @@ +"""Default constants for GIQL. + +This module defines default column names and other constants used throughout GIQL. 
+""" + +# Default genomic column names +DEFAULT_CHROM_COL = "chromosome" +DEFAULT_START_COL = "start_pos" +DEFAULT_END_COL = "end_pos" +DEFAULT_STRAND_COL = "strand" +DEFAULT_GENOMIC_COL = "interval" diff --git a/src/giql/dialect.py b/src/giql/dialect.py new file mode 100644 index 0000000..6c70104 --- /dev/null +++ b/src/giql/dialect.py @@ -0,0 +1,130 @@ +"""Custom SQL dialect with genomic extensions. + +This module defines the GIQL dialect, which extends standard SQL with +spatial operators for genomic interval queries. +""" + +from typing import Final + +from sqlglot.dialects import Dialect +from sqlglot.parser import Parser +from sqlglot.tokens import Tokenizer +from sqlglot.tokens import TokenType + +from giql.expressions import Contains +from giql.expressions import GIQLCluster +from giql.expressions import GIQLDistance +from giql.expressions import GIQLMerge +from giql.expressions import GIQLNearest +from giql.expressions import Intersects +from giql.expressions import SpatialSetPredicate +from giql.expressions import Within + +# Token type constants +INTERSECTS: Final = "INTERSECTS" +CONTAINS: Final = "CONTAINS" +WITHIN: Final = "WITHIN" + +# Register custom token types +setattr(TokenType, INTERSECTS, INTERSECTS) +setattr(TokenType, CONTAINS, CONTAINS) +setattr(TokenType, WITHIN, WITHIN) + + +class GIQLDialect(Dialect): + """Generic SQL dialect with genomic spatial operators.""" + + class Tokenizer(Tokenizer): + """Tokenizer with genomic keywords. + + Extends the base tokenizer to recognize GIQL spatial operators + (INTERSECTS, CONTAINS, WITHIN). + """ + + KEYWORDS = { + **Tokenizer.KEYWORDS, + INTERSECTS: getattr(TokenType, INTERSECTS), + CONTAINS: getattr(TokenType, CONTAINS), + WITHIN: getattr(TokenType, WITHIN), + } + + class Parser(Parser): + """Parser with genomic predicate support.""" + + FUNCTIONS = { + **Parser.FUNCTIONS, + "CLUSTER": GIQLCluster.from_arg_list, + "MERGE": GIQLMerge.from_arg_list, + "DISTANCE": GIQLDistance.from_arg_list, + "NEAREST": GIQLNearest.from_arg_list, + } + + def _parse_comparison(self): + """Override to handle spatial operators. + + :return: + Parsed spatial expression or falls back to parent's comparison parsing + """ + return self._parse_spatial() or super()._parse_comparison() + + def _parse_spatial(self): + """Parse spatial predicates. + + Handles: + - column INTERSECTS 'chr1:1000-2000' + - column INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') + - column CONTAINS 'chr1:1500' + - column WITHIN 'chr1:1000-5000' + + :return: + Parsed spatial expression or None if no spatial operator found + """ + start_index = self._index + this = self._parse_term() + + if self._match(getattr(TokenType, INTERSECTS)): + return self._parse_spatial_predicate(this, INTERSECTS, Intersects) + elif self._match(getattr(TokenType, CONTAINS)): + return self._parse_spatial_predicate(this, CONTAINS, Contains) + elif self._match(getattr(TokenType, WITHIN)): + return self._parse_spatial_predicate(this, WITHIN, Within) + + # No spatial operator found - retreat and return None to allow fallback + self._retreat(start_index) + return None + + def _parse_spatial_predicate(self, left, operator, expr_class): + """Parse right side of spatial predicate. 
+ + :param left: + Left side expression (column reference) + :param operator: + Spatial operator token (INTERSECTS, CONTAINS, WITHIN) + :param expr_class: + Expression class to instantiate (Intersects, Contains, Within) + :return: + Parsed spatial predicate expression + """ + # Check for ANY/ALL quantifier + if self._match_set((TokenType.ANY, TokenType.ALL, TokenType.SOME)): + assert self._prev is not None, "Expected token after successful match" + quantifier = self._prev.text.upper() + if quantifier == "SOME": + quantifier = "ANY" + + # Parse range list + self._match_l_paren() + ranges = self._parse_csv(self._parse_expression) + self._match_r_paren() + + return self.expression( + SpatialSetPredicate, + this=left, + operator=operator, + quantifier=quantifier, + ranges=ranges, + ) + else: + # Simple spatial predicate + right = self._parse_term() + return self.expression(expr_class, this=left, expression=right) diff --git a/src/giql/engine.py b/src/giql/engine.py new file mode 100644 index 0000000..d9a013c --- /dev/null +++ b/src/giql/engine.py @@ -0,0 +1,370 @@ +"""Multi-backend query engine for GIQL. + +This module provides the main query engine that supports multiple SQL databases +through transpilation of GIQL syntax to standard SQL. +""" + +from typing import Literal + +import pandas as pd +from sqlglot import parse_one + +from giql.constants import DEFAULT_CHROM_COL +from giql.constants import DEFAULT_END_COL +from giql.constants import DEFAULT_GENOMIC_COL +from giql.constants import DEFAULT_START_COL +from giql.constants import DEFAULT_STRAND_COL +from giql.dialect import GIQLDialect +from giql.generators import BaseGIQLGenerator +from giql.generators import GIQLDuckDBGenerator +from giql.protocols import CursorLike +from giql.range_parser import CoordinateSystem +from giql.range_parser import IntervalType +from giql.schema import ColumnInfo +from giql.schema import SchemaInfo +from giql.schema import TableSchema +from giql.transformer import ClusterTransformer +from giql.transformer import MergeTransformer + +DialectType = Literal["duckdb", "sqlite"] + + +class GIQLEngine: + """Multi-backend GIQL query engine. + + Supports multiple SQL databases through transpilation of GIQL syntax + to standard SQL. Can work with DuckDB, SQLite, and other backends. 
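+ + Queries are transpiled to the target dialect (see ``transpile()``) and executed on the underlying connection (see ``execute()``), which returns a DB-API-style cursor for lazy iteration.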
+ + Examples + -------- + Query a pandas DataFrame with DuckDB:: + + import pandas as pd + + df = pd.DataFrame( + { + "id": [1, 2, 3], + "chromosome": ["chr1", "chr1", "chr2"], + "start_pos": [1500, 10500, 500], + "end_pos": [1600, 10600, 600], + } + ) + with GIQLEngine(target_dialect="duckdb") as engine: + engine.conn.register("variants", df) + cursor = engine.execute( + "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" + ) + for row in cursor: + print(row) + + Load from CSV:: + + with GIQLEngine(target_dialect="duckdb") as engine: + engine.load_csv("variants", "variants.csv") + cursor = engine.execute( + "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" + ) + # Process rows lazily + while True: + row = cursor.fetchone() + if row is None: + break + print(row) + + Using SQLite backend:: + + with GIQLEngine(target_dialect="sqlite", db_path="data.db") as engine: + cursor = engine.execute( + "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" + ) + # Materialize all results at once + results = cursor.fetchall() + """ + + def __init__( + self, + target_dialect: DialectType | str = "duckdb", + connection=None, + db_path: str = ":memory:", + verbose: bool = False, + **dialect_options, + ): + """Initialize engine. + + :param target_dialect: + Target SQL dialect ('duckdb', 'sqlite', 'standard') + :param connection: + Existing database connection (optional) + :param db_path: + Database path or connection string + :param verbose: + Print transpiled SQL + :param dialect_options: + Additional options for specific dialects + """ + self.target_dialect = target_dialect + self.verbose = verbose + self.schema_info = SchemaInfo() + self.dialect_options = dialect_options + + # Initialize connection + if connection: + self.conn = connection + self.owns_connection = False + else: + self.conn = self._create_connection(db_path) + self.owns_connection = True + + # Get appropriate generator + self.generator = self._get_generator() + + # Initialize query transformers + self.cluster_transformer = ClusterTransformer(self.schema_info) + self.merge_transformer = MergeTransformer(self.schema_info) + + def _create_connection(self, db_path: str): + """Create database connection based on target dialect. + + :param db_path: + Path to database file or connection string + :return: + Connection object for the specified database backend + :raises ImportError: + If the required database driver is not installed + :raises ValueError: + If the dialect is not supported + """ + if self.target_dialect == "duckdb": + try: + import duckdb + + return duckdb.connect(db_path) + except ImportError: + raise ImportError("DuckDB not installed.") + + elif self.target_dialect == "sqlite": + import sqlite3 + + return sqlite3.connect(db_path) + + else: + raise ValueError( + f"Unsupported dialect: {self.target_dialect}. Supported: duckdb, sqlite" + ) + + def _get_generator(self): + """Get generator for target dialect. 
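+ + Falls back to ``BaseGIQLGenerator`` when no dialect-specific generator is registered.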
+ + :return: + SQL generator instance configured for the target dialect + """ + generators = { + "duckdb": GIQLDuckDBGenerator, + "sqlite": BaseGIQLGenerator, + "standard": BaseGIQLGenerator, + } + + generator_class = generators.get(self.target_dialect, BaseGIQLGenerator) + return generator_class(schema_info=self.schema_info, **self.dialect_options) + + def register_table_schema( + self, + table_name: str, + columns: dict[str, str], + genomic_column: str = DEFAULT_GENOMIC_COL, + chrom_col: str = DEFAULT_CHROM_COL, + start_col: str = DEFAULT_START_COL, + end_col: str = DEFAULT_END_COL, + strand_col: str | None = DEFAULT_STRAND_COL, + coordinate_system: str = "0based", + interval_type: str = "half_open", + ): + """Register schema for a table. + + This method tells the engine how genomic ranges are stored in the table, + mapping logical genomic column names to physical column names. + + :param table_name: + Table name + :param columns: + Dict of column_name -> type + :param genomic_column: + Logical name for genomic position + :param chrom_col: + Physical chromosome column + :param start_col: + Physical start position column + :param end_col: + Physical end position column + :param strand_col: + Physical strand column (optional) + :param coordinate_system: + Coordinate system: "0based" or "1based" (default: "0based") + :param interval_type: + Interval endpoint handling: "half_open" or "closed" (default: "half_open") + """ + # Convert string parameters to enums + coord_sys = ( + CoordinateSystem.ONE_BASED + if coordinate_system == "1based" + else CoordinateSystem.ZERO_BASED + ) + int_type = ( + IntervalType.CLOSED if interval_type == "closed" else IntervalType.HALF_OPEN + ) + + column_infos = {} + + for col_name, col_type in columns.items(): + column_infos[col_name] = ColumnInfo( + name=col_name, type=col_type, is_genomic=False + ) + + # Add virtual genomic column with mappings to physical columns + column_infos[genomic_column] = ColumnInfo( + name=genomic_column, + type="GENOMIC_RANGE", # Virtual type + is_genomic=True, + chrom_col=chrom_col, + start_col=start_col, + end_col=end_col, + strand_col=strand_col, + coordinate_system=coord_sys, + interval_type=int_type, + ) + + table_schema = TableSchema(table_name, column_infos) + self.schema_info.register_table(table_name, table_schema) + + def load_csv(self, table_name: str, file_path: str): + """Load CSV file into database. + + :param table_name: + Name to assign to the table + :param file_path: + Path to the CSV file + """ + if self.target_dialect == "duckdb": + self.conn.execute( + f"CREATE TABLE {table_name} " + f"AS SELECT * FROM read_csv_auto('{file_path}')" + ) + elif self.target_dialect == "sqlite": + # Use pandas for SQLite + df = pd.read_csv(file_path) + df.to_sql(table_name, self.conn, if_exists="replace", index=False) + + if self.verbose: + print(f"Loaded {table_name} from {file_path}") + + def load_parquet(self, table_name: str, file_path: str): + """Load Parquet file into database. + + :param table_name: + Name to assign to the table + :param file_path: + Path to the Parquet file + """ + if self.target_dialect == "duckdb": + self.conn.execute( + f"CREATE TABLE {table_name} AS SELECT * FROM read_parquet('{file_path}')" + ) + else: + df = pd.read_parquet(file_path) + df.to_sql(table_name, self.conn, if_exists="replace", index=False) + + if self.verbose: + print(f"Loaded {table_name} from {file_path}") + + def transpile(self, giql: str) -> str: + """Transpile a GIQL query to the engine's target SQL dialect. 
+ + Parses the GIQL syntax and transpiles it to the target SQL dialect + without executing it. Useful for debugging or generating SQL for + external use. + + :param giql: + Query string with GIQL genomic extensions + :return: + Transpiled SQL query string in the target dialect + :raises ValueError: + If the query cannot be parsed or transpiled + """ + # Parse with GIQL dialect + try: + ast = parse_one(giql, dialect=GIQLDialect) + except Exception as e: + raise ValueError(f"Parse error: {e}\nQuery: {giql}") + + # Transform query (MERGE first, then CLUSTER) + try: + # Apply MERGE transformation (which may internally use CLUSTER) + ast = self.merge_transformer.transform(ast) + # Apply CLUSTER transformation for any standalone CLUSTER expressions + ast = self.cluster_transformer.transform(ast) + except Exception as e: + raise ValueError(f"Transformation error: {e}") + + # Transpile to target dialect + try: + target_sql = self.generator.generate(ast) + except Exception as e: + raise ValueError(f"Transpilation error: {e}") + + if self.verbose: + print(f"\n{'=' * 60}") + print(f"Target Dialect: {self.target_dialect}") + print("\nOriginal GIQL:") + print(giql) + print("\nTranspiled SQL:") + print(target_sql) + print(f"{'=' * 60}\n") + + return target_sql + + def execute(self, giql: str) -> CursorLike: + """Execute a GIQL query and return a database cursor. + + Parses the GIQL syntax, transpiles to target SQL dialect, + and executes the query returning a cursor for lazy iteration. + + :param giql: + Query string with GIQL genomic extensions + :return: + Database cursor (DB-API 2.0 compatible) that can be iterated + :raises ValueError: + If the query cannot be parsed, transpiled, or executed + """ + # Transpile GIQL to target SQL + target_sql = self.transpile(giql) + + # Execute and return cursor + try: + return self.conn.execute(target_sql) + except Exception as e: + raise ValueError(f"Execution error: {e}\nSQL: {target_sql}") + + def execute_raw(self, sql: str) -> pd.DataFrame: + """Execute raw SQL directly, bypassing GIQL parsing. + + :param sql: + Raw SQL query string + :return: + Query results as a pandas DataFrame + """ + return pd.read_sql(sql, self.conn) + + def close(self): + """Close database connection. + + Only closes connections created by the engine. If an external + connection was provided during initialization, it is not closed. + """ + if self.owns_connection and self.conn: + self.conn.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): # noqa: ANN001 + self.close() diff --git a/src/giql/expressions.py b/src/giql/expressions.py new file mode 100644 index 0000000..60801eb --- /dev/null +++ b/src/giql/expressions.py @@ -0,0 +1,276 @@ +"""Custom AST expression nodes for genomic operations. + +This module defines custom SQLGlot expression nodes for GIQL spatial operators. +""" + +from sqlglot import exp + + +class GenomicRange(exp.Expression): + """Represents a parsed genomic range. + + Examples: + 'chr1:1000-2000' + 'chr1:[1000,2000)' + 'chr1:[1001,2000]' + """ + + arg_types = { + "chromosome": True, + "start": True, + "end": True, + "strand": False, + "coord_system": False, + } + + +class SpatialPredicate(exp.Binary): + """Base class for spatial predicates.""" + + pass + + +class Intersects(SpatialPredicate): + """INTERSECTS spatial predicate. + + Example: column INTERSECTS 'chr1:1000-2000' + """ + + pass + + +class Contains(SpatialPredicate): + """CONTAINS spatial predicate. 
+ + Example: column CONTAINS 'chr1:1500' + """ + + pass + + +class Within(SpatialPredicate): + """WITHIN spatial predicate. + + Example: column WITHIN 'chr1:1000-5000' + """ + + pass + + +class SpatialSetPredicate(exp.Expression): + """Spatial predicates with set quantifiers. + + Examples: + column INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') + column CONTAINS ALL('chr1:1500', 'chr1:1600') + """ + + arg_types = { + "this": True, + "operator": True, + "quantifier": True, + "ranges": True, + } + + +class GIQLCluster(exp.Func): + """CLUSTER window function for assigning cluster IDs to overlapping intervals. + + Implicitly partitions by chromosome and orders by start position. + + Examples: + CLUSTER(interval) + CLUSTER(interval, 1000) + CLUSTER(interval, stranded=true) + CLUSTER(interval, 1000, stranded=true) + """ + + arg_types = { + "this": True, # genomic column + "distance": False, # maximum distance between features + "stranded": False, # strand-specific clustering + } + + @classmethod + def from_arg_list(cls, args): + """Parse argument list, handling named parameters. + + :param args: + List of arguments from parser + :return: + GIQLCluster instance with properly mapped arguments + """ + kwargs = {} + positional_args = [] + + # Separate named (EQ) and positional arguments + for arg in args: + if isinstance(arg, exp.EQ): + # Named parameter: extract name and value + param_name = ( + arg.this.name if isinstance(arg.this, exp.Column) else str(arg.this) + ) + kwargs[param_name.lower()] = arg.expression + else: + positional_args.append(arg) + + # Map positional arguments + if len(positional_args) > 0: + kwargs["this"] = positional_args[0] + if len(positional_args) > 1: + kwargs["distance"] = positional_args[1] + + return cls(**kwargs) + + +class GIQLMerge(exp.Func): + """MERGE aggregate function for combining overlapping intervals. + + Merges overlapping or bookended intervals into single intervals. + Built on top of CLUSTER operation. + + Examples: + MERGE(interval) + MERGE(interval, 1000) + MERGE(interval, stranded=true) + """ + + arg_types = { + "this": True, # genomic column + "distance": False, # maximum distance between features + "stranded": False, # strand-specific merging + } + + @classmethod + def from_arg_list(cls, args): + """Parse argument list, handling named parameters. + + :param args: List of arguments from parser + :return: GIQLMerge instance with properly mapped arguments + """ + kwargs = {} + positional_args = [] + + # Separate named (EQ) and positional arguments + for arg in args: + if isinstance(arg, exp.EQ): + # Named parameter: extract name and value + param_name = ( + arg.this.name if isinstance(arg.this, exp.Column) else str(arg.this) + ) + kwargs[param_name.lower()] = arg.expression + else: + positional_args.append(arg) + + # Map positional arguments + if len(positional_args) > 0: + kwargs["this"] = positional_args[0] + if len(positional_args) > 1: + kwargs["distance"] = positional_args[1] + + return cls(**kwargs) + + +class GIQLDistance(exp.Func): + """DISTANCE function for calculating genomic distances between intervals. + + Generates SQL CASE expression that computes distance between two genomic + intervals, with optional strand-specific and signed (directional) modes. 
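+ + Evaluates to 0 for overlapping intervals, the gap size in base pairs for non-overlapping intervals on the same chromosome, and NULL when the intervals are on different chromosomes.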
+ + Examples: + DISTANCE(a.interval, b.interval) + DISTANCE(a.interval, 'chr1:1000-2000') + DISTANCE(a.interval, b.interval, stranded=true) + DISTANCE(a.interval, b.interval, signed=true) + DISTANCE(a.interval, b.interval, stranded=true, signed=true) + """ + + arg_types = { + "this": True, # Required: interval_a (column ref or literal range) + "expression": True, # Required: interval_b (column ref or literal range) + "stranded": False, # Optional: boolean for strand-specific distance + "signed": False, # Optional: boolean for directional distance + } + + @classmethod + def from_arg_list(cls, args): + """Parse argument list, handling named parameters. + + :param args: + List of arguments from parser + :return: + GIQLDistance instance with properly mapped arguments + """ + kwargs = {} + positional_args = [] + + # Separate named (EQ) and positional arguments + for arg in args: + if isinstance(arg, exp.EQ): + # Named parameter: extract name and value + param_name = ( + arg.this.name if isinstance(arg.this, exp.Column) else str(arg.this) + ) + kwargs[param_name.lower()] = arg.expression + else: + positional_args.append(arg) + + # Map positional arguments + if len(positional_args) >= 1: + kwargs["this"] = positional_args[0] + if len(positional_args) >= 2: + kwargs["expression"] = positional_args[1] + + return cls(**kwargs) + + +class GIQLNearest(exp.Func): + """NEAREST function for finding k-nearest genomic features. + + Generates SQL for k-nearest neighbor queries using LATERAL joins + (PostgreSQL/DuckDB) or window functions (SQLite). + + Examples: + NEAREST(genes, k=3) + NEAREST(genes, reference=peaks.interval, k=5) + NEAREST(genes, reference='chr1:1000-2000', k=3) + NEAREST(genes, k=5, max_distance=100000, stranded=true) + """ + + arg_types = { + "this": True, # Required: target table name + "reference": False, # Optional: position reference (column or literal) + "k": False, # Optional: number of neighbors (default=1) + "max_distance": False, # Optional: distance threshold + "stranded": False, # Optional: strand-specific search + "signed": False, # Optional: directional distance + } + + @classmethod + def from_arg_list(cls, args): + """Parse argument list, handling named parameters. + + :param args: + List of arguments from parser + :return: + GIQLNearest instance with properly mapped arguments + """ + kwargs = {} + positional_args = [] + + # Separate named (EQ) and positional arguments + for arg in args: + if isinstance(arg, exp.EQ): + # Named parameter: extract name and value + param_name = ( + arg.this.name if isinstance(arg.this, exp.Column) else str(arg.this) + ) + kwargs[param_name.lower()] = arg.expression + else: + positional_args.append(arg) + + # Map positional arguments + if len(positional_args) >= 1: + kwargs["this"] = positional_args[0] + + return cls(**kwargs) diff --git a/src/giql/generators/__init__.py b/src/giql/generators/__init__.py new file mode 100644 index 0000000..b04bd93 --- /dev/null +++ b/src/giql/generators/__init__.py @@ -0,0 +1,9 @@ +""" +SQL generators for different database dialects. +""" + +from giql.generators.base import BaseGIQLGenerator +from giql.generators.duckdb import GIQLDuckDBGenerator +from giql.generators.sqlite import GIQLSQLiteGenerator + +__all__ = ["BaseGIQLGenerator", "GIQLDuckDBGenerator", "GIQLSQLiteGenerator"] diff --git a/src/giql/generators/base.py b/src/giql/generators/base.py new file mode 100644 index 0000000..8cf0923 --- /dev/null +++ b/src/giql/generators/base.py @@ -0,0 +1,871 @@ +"""Base generator that outputs standard SQL. 
+ +Works with any SQL database that supports: +- Basic comparison operators (<, >, =, AND, OR) +- String literals +- Numeric comparisons + +This generator uses only SQL-92 compatible constructs, ensuring compatibility +with virtually all SQL databases. +""" + +from typing import Optional + +from sqlglot import exp +from sqlglot.generator import Generator + +from giql.constants import DEFAULT_CHROM_COL +from giql.constants import DEFAULT_END_COL +from giql.constants import DEFAULT_START_COL +from giql.constants import DEFAULT_STRAND_COL +from giql.expressions import Contains +from giql.expressions import GIQLDistance +from giql.expressions import GIQLNearest +from giql.expressions import Intersects +from giql.expressions import SpatialSetPredicate +from giql.expressions import Within +from giql.range_parser import ParsedRange +from giql.range_parser import RangeParser +from giql.schema import SchemaInfo + + +class BaseGIQLGenerator(Generator): + """Base generator for standard SQL output. + + This generator uses only SQL-92 compatible constructs, + ensuring compatibility with virtually all SQL databases. + """ + + # Most databases support LATERAL joins (PostgreSQL 9.3+, DuckDB 0.7.0+) + # SQLite does not support LATERAL, so it overrides this to False + SUPPORTS_LATERAL = True + + def __init__(self, schema_info: Optional[SchemaInfo] = None, **kwargs): + super().__init__(**kwargs) + self.schema_info = schema_info or SchemaInfo() + self._current_table = None # Track current table for column resolution + self._alias_to_table = {} # Map aliases to table names + + def select_sql(self, expression: exp.Select) -> str: + """Override SELECT generation to track table context and aliases.""" + # Build alias-to-table mapping + self._alias_to_table = {} + + # Extract from FROM clause + if expression.args.get("from_"): + from_clause = expression.args["from_"] + if isinstance(from_clause.this, exp.Table): + table_name = from_clause.this.name + self._current_table = table_name + # Check if table has an alias + if from_clause.this.alias: + self._alias_to_table[from_clause.this.alias] = table_name + else: + # No alias, table referenced by name + self._alias_to_table[table_name] = table_name + + # Extract from JOINs + if expression.args.get("joins"): + for join in expression.args["joins"]: + if isinstance(join.this, exp.Table): + table_name = join.this.name + # Check if table has an alias + if join.this.alias: + self._alias_to_table[join.this.alias] = table_name + else: + self._alias_to_table[table_name] = table_name + + # Call parent implementation + return super().select_sql(expression) + + def intersects_sql(self, expression: Intersects) -> str: + """Generate standard SQL for INTERSECTS. + + :param expression: + INTERSECTS expression node + :return: + SQL predicate string + """ + return self._generate_spatial_op(expression, "intersects") + + def contains_sql(self, expression: Contains) -> str: + """Generate standard SQL for CONTAINS. + + :param expression: + CONTAINS expression node + :return: + SQL predicate string + """ + return self._generate_spatial_op(expression, "contains") + + def within_sql(self, expression: Within) -> str: + """Generate standard SQL for WITHIN. + + :param expression: + WITHIN expression node + :return: + SQL predicate string + """ + return self._generate_spatial_op(expression, "within") + + def spatialsetpredicate_sql(self, expression: SpatialSetPredicate) -> str: + """Generate SQL for spatial set predicates (ANY/ALL). 
+ + :param expression: + SpatialSetPredicate expression node + :return: + SQL predicate string + """ + return self._generate_spatial_set(expression) + + def giqlnearest_sql(self, expression: GIQLNearest) -> str: + """Generate SQL for NEAREST function. + + Detects mode (standalone vs correlated) and generates appropriate SQL: + - Standalone: Direct query with ORDER BY + LIMIT + - Correlated (LATERAL): Subquery for k-nearest neighbors + + :param expression: + GIQLNearest expression node + :return: + SQL string for NEAREST operation + """ + # Detect mode + mode = self._detect_nearest_mode(expression) + + # Resolve target table + table_name, (target_chrom, target_start, target_end) = ( + self._resolve_target_table(expression) + ) + + # Resolve reference + ref_chrom, ref_start, ref_end = self._resolve_nearest_reference(expression, mode) + + # Extract parameters + k = expression.args.get("k") + k_value = int(str(k)) if k else 1 # Default k=1 + + max_distance = expression.args.get("max_distance") + max_dist_value = int(str(max_distance)) if max_distance else None + + stranded = expression.args.get("stranded") + is_stranded = stranded and str(stranded).lower() in ("true", "1") + + # Resolve strand columns if stranded mode + ref_strand = None + target_strand = None + if is_stranded: + # Get strand column for reference + reference = expression.args.get("reference") + if reference: + reference_sql = self.sql(reference) + + # Check if reference is a literal string or column reference + if reference_sql.startswith("'") or reference_sql.startswith('"'): + # Literal reference - parse for strand + range_str = reference_sql.strip("'\"") + from giql.range_parser import RangeParser + + parsed_range = RangeParser.parse(range_str).to_zero_based_half_open() + if parsed_range.strand: + ref_strand = f"'{parsed_range.strand}'" + else: + # Column reference - get strand column + ref_cols = self._get_column_refs( + reference_sql, None, include_strand=True + ) + if len(ref_cols) == 4: + ref_strand = ref_cols[3] + else: + # Implicit reference in correlated mode - get strand from outer table + outer_table = self._find_outer_table_in_lateral_join(expression) + if outer_table and self.schema_info: + actual_table = self._alias_to_table.get(outer_table, outer_table) + table_schema = self.schema_info.get_table(actual_table) + if table_schema: + for col_info in table_schema.columns.values(): + if col_info.is_genomic and col_info.strand_col: + ref_strand = f'{outer_table}."{col_info.strand_col}"' + break + + # Get strand column for target table + target_table_info = ( + self.schema_info.get_table(table_name) if self.schema_info else None + ) + if target_table_info: + for col_info in target_table_info.columns.values(): + if col_info.is_genomic and col_info.strand_col: + target_strand = f'{table_name}."{col_info.strand_col}"' + break + + # Determine if we should add 1 for gap distances (bedtools compatibility) + # This depends on the interval types of the tables involved + add_one = False + if self.schema_info: + target_table_info = self.schema_info.get_table(table_name) + if target_table_info: + for col_info in target_table_info.columns.values(): + if col_info.is_genomic: + # Import IntervalType to check + from giql.range_parser import IntervalType + + # Add 1 for closed intervals (bedtools behavior) + if col_info.interval_type == IntervalType.CLOSED: + add_one = True + break + + # Build distance calculation using CASE expression + # For NEAREST: ORDER BY absolute distance, but RETURN signed distance + distance_expr = 
self._generate_distance_case( + ref_chrom, + ref_start, + ref_end, + ref_strand, + f'{table_name}."{target_chrom}"', + f'{table_name}."{target_start}"', + f'{table_name}."{target_end}"', + target_strand, + stranded=is_stranded, + add_one_for_gap=add_one, + ) + + # Use absolute distance for ordering and filtering + abs_distance_expr = f"ABS({distance_expr})" + + # Build WHERE clauses + where_clauses = [ + f'{ref_chrom} = {table_name}."{target_chrom}"' # Chromosome pre-filter + ] + + # Add strand matching for stranded mode + if is_stranded and ref_strand and target_strand: + where_clauses.append(f"{ref_strand} = {target_strand}") + + if max_dist_value is not None: + where_clauses.append(f"({abs_distance_expr}) <= {max_dist_value}") + + where_sql = " AND ".join(where_clauses) + + # Generate SQL based on mode + if mode == "standalone": + # Standalone mode: direct ORDER BY + LIMIT + # Return signed distance, but order by absolute distance + sql = f"""( + SELECT {table_name}.*, {distance_expr} AS distance + FROM {table_name} + WHERE {where_sql} + ORDER BY {abs_distance_expr} + LIMIT {k_value} + )""" + else: + # Correlated mode: requires LATERAL join support + if not self.SUPPORTS_LATERAL: + raise ValueError( + "NEAREST in correlated mode (CROSS JOIN LATERAL) is not supported " + "in SQLite. SQLite does not support LATERAL joins. " + "\n\nAlternatives:" + "\n1. Use standalone mode: SELECT * FROM NEAREST(table, " + "reference='chr1:100-200', k=3)" + "\n2. Use DuckDB for queries requiring LATERAL joins" + "\n3. Manually write equivalent window function query" + ) + + # LATERAL mode: subquery for k-nearest neighbors + # Return signed distance, but order by absolute distance + sql = f"""( + SELECT {table_name}.*, {distance_expr} AS distance + FROM {table_name} + WHERE {where_sql} + ORDER BY {abs_distance_expr} + LIMIT {k_value} + )""" + + return sql.strip() + + def giqldistance_sql(self, expression: GIQLDistance) -> str: + """Generate SQL CASE expression for DISTANCE function. + + :param expression: + GIQLDistance expression node + :return: + SQL CASE expression string calculating genomic distance + """ + # Extract the two interval arguments + interval_a = expression.this + interval_b = expression.args.get("expression") + + # Extract stranded parameter + stranded_expr = expression.args.get("stranded") + stranded = False + if stranded_expr: + if isinstance(stranded_expr, exp.Boolean): + stranded = stranded_expr.this + elif isinstance(stranded_expr, exp.Literal): + stranded = str(stranded_expr.this).upper() == "TRUE" + else: + stranded = str(stranded_expr).upper() in ("TRUE", "1", "YES") + + # Get SQL representations + interval_a_sql = self.sql(interval_a) + interval_b_sql = self.sql(interval_b) + + # Check if we're dealing with column-to-column or column-to-literal + if "." in interval_a_sql and not interval_a_sql.startswith("'"): + # Column reference for interval_a + if stranded: + chrom_a, start_a, end_a, strand_a = self._get_column_refs( + interval_a_sql, None, include_strand=True + ) + else: + chrom_a, start_a, end_a = self._get_column_refs(interval_a_sql, None) + strand_a = None + else: + # Literal range - not implemented yet for interval_a + raise ValueError("Literal range as first argument not yet supported") + + if "." 
in interval_b_sql and not interval_b_sql.startswith("'"): + # Column reference for interval_b + if stranded: + chrom_b, start_b, end_b, strand_b = self._get_column_refs( + interval_b_sql, None, include_strand=True + ) + else: + chrom_b, start_b, end_b = self._get_column_refs(interval_b_sql, None) + strand_b = None + else: + # Literal range - not implemented yet + raise ValueError("Literal range as second argument not yet supported") + + # Determine if we should add 1 for gap distances (bedtools compatibility) + # Check interval types from schema + add_one = False + if self.schema_info: + # Extract table names from column references + # Column refs look like "table.column" or "alias.column" + table_a = interval_a_sql.split(".")[0] if "." in interval_a_sql else None + table_b = interval_b_sql.split(".")[0] if "." in interval_b_sql else None + + # Check if either table uses closed intervals + from giql.range_parser import IntervalType + + for table_name in [table_a, table_b]: + if table_name: + # Remove quotes if present + table_name = table_name.strip('"') + # Check if it's an alias first + actual_table = self._alias_to_table.get(table_name, table_name) + table_info = self.schema_info.get_table(actual_table) + if table_info: + for col_info in table_info.columns.values(): + if col_info.is_genomic: + if col_info.interval_type == IntervalType.CLOSED: + add_one = True + break + + # Generate CASE expression + return self._generate_distance_case( + chrom_a, + start_a, + end_a, + strand_a, + chrom_b, + start_b, + end_b, + strand_b, + stranded=stranded, + add_one_for_gap=add_one, + ) + + def _generate_distance_case( + self, + chrom_a: str, + start_a: str, + end_a: str, + strand_a: str | None, + chrom_b: str, + start_b: str, + end_b: str, + strand_b: str | None, + stranded: bool = False, + add_one_for_gap: bool = False, + ) -> str: + """Generate SQL CASE expression for distance calculation. + + :param chrom_a: Chromosome column for interval A + :param start_a: Start column for interval A + :param end_a: End column for interval A + :param strand_a: Strand column for interval A (None if not stranded) + :param chrom_b: Chromosome column for interval B + :param start_b: Start column for interval B + :param end_b: End column for interval B + :param strand_b: Strand column for interval B (None if not stranded) + :param stranded: Whether to use strand-aware distance calculation + :param add_one_for_gap: Whether to add 1 to non-overlapping distance (bedtools compatibility) + :return: SQL CASE expression + """ + # Distance adjustment for non-overlapping intervals + gap_adj = " + 1" if add_one_for_gap else "" + + if not stranded or strand_a is None or strand_b is None: + # Basic distance calculation without strand awareness + return f"""CASE WHEN {chrom_a} != {chrom_b} THEN NULL WHEN {start_a} < {end_b} AND {end_a} > {start_b} THEN 0 WHEN {end_a} <= {start_b} THEN ({start_b} - {end_a}{gap_adj}) ELSE ({start_a} - {end_b}{gap_adj}) END""" + + # Stranded distance calculation + # Return NULL if either strand is '.', '?', or NULL + # Calculate distance and multiply by -1 if first interval is on '-' strand + return f"""CASE WHEN {chrom_a} != {chrom_b} THEN NULL WHEN {strand_a} IS NULL OR {strand_b} IS NULL THEN NULL WHEN {strand_a} = '.' OR {strand_a} = '?' THEN NULL WHEN {strand_b} = '.' OR {strand_b} = '?' 
THEN NULL WHEN {start_a} < {end_b} AND {end_a} > {start_b} THEN 0 WHEN {end_a} <= {start_b} THEN CASE WHEN {strand_a} = '-' THEN -({start_b} - {end_a}{gap_adj}) ELSE ({start_b} - {end_a}{gap_adj}) END ELSE CASE WHEN {strand_a} = '-' THEN -({start_a} - {end_b}{gap_adj}) ELSE ({start_a} - {end_b}{gap_adj}) END END""" + + def _generate_spatial_op(self, expression: exp.Binary, op_type: str) -> str: + """Generate SQL for a spatial operation. + + :param expression: + AST node (Intersects, Contains, or Within) + :param op_type: + 'intersects', 'contains', or 'within' + :return: + SQL predicate string + """ + left = self.sql(expression, "this") + right_raw = self.sql(expression, "expression") + + # Check if right side is a column reference or a literal range string + if "." in right_raw and not right_raw.startswith("'"): + # Column-to-column join (e.g., a.interval INTERSECTS b.interval) + return self._generate_column_join(left, right_raw, op_type) + else: + # Literal range string (e.g., interval INTERSECTS 'chr1:1000-2000') + try: + range_str = right_raw.strip("'\"") + parsed_range = RangeParser.parse(range_str).to_zero_based_half_open() + return self._generate_range_predicate(left, parsed_range, op_type) + except Exception as e: + raise ValueError( + f"Could not parse genomic range: {right_raw}. Error: {e}" + ) + + def _generate_range_predicate( + self, + column_ref: str, + parsed_range: ParsedRange, + op_type: str, + ) -> str: + """Generate SQL predicate for a range operation. + + :param column_ref: + Column reference (e.g., 'v.interval' or 'interval') + :param parsed_range: + Parsed genomic range + :param op_type: + 'intersects', 'contains', or 'within' + :return: + SQL predicate string + """ + # Get column references + chrom_col, start_col, end_col = self._get_column_refs( + column_ref, self._current_table + ) + + chrom = parsed_range.chromosome + start = parsed_range.start + end = parsed_range.end + + if op_type == "intersects": + # Ranges overlap if: start1 < end2 AND end1 > start2 + return ( + f"({chrom_col} = '{chrom}' " + f"AND {start_col} < {end} " + f"AND {end_col} > {start})" + ) + + elif op_type == "contains": + # Point query: start1 <= point < end1 + if end == start + 1: + return ( + f"({chrom_col} = '{chrom}' " + f"AND {start_col} <= {start} " + f"AND {end_col} > {start})" + ) + # Range query: start1 <= start2 AND end1 >= end2 + else: + return ( + f"({chrom_col} = '{chrom}' " + f"AND {start_col} <= {start} " + f"AND {end_col} >= {end})" + ) + + elif op_type == "within": + # Left within right: start1 >= start2 AND end1 <= end2 + return ( + f"({chrom_col} = '{chrom}' " + f"AND {start_col} >= {start} " + f"AND {end_col} <= {end})" + ) + + raise ValueError(f"Unknown operation: {op_type}") + + def _generate_column_join(self, left_col: str, right_col: str, op_type: str) -> str: + """Generate SQL for column-to-column spatial joins. 
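+
+        Example (illustrative; assumes the default physical column names
+        chromosome/start_pos/end_pos):
+
+            a.interval INTERSECTS b.interval ->
+            (a."chromosome" = b."chromosome"
+             AND a."start_pos" < b."end_pos"
+             AND a."end_pos" > b."start_pos")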
+ + :param left_col: + Left column reference (e.g., 'a.interval') + :param right_col: + Right column reference (e.g., 'b.interval') + :param op_type: + 'intersects', 'contains', or 'within' + :return: + SQL predicate string + """ + # Get column references for both sides + # Pass None to let _get_column_refs extract and resolve table from column ref + l_chrom, l_start, l_end = self._get_column_refs(left_col, None) + r_chrom, r_start, r_end = self._get_column_refs(right_col, None) + + if op_type == "intersects": + # Ranges overlap if: chrom1 = chrom2 AND start1 < end2 AND end1 > start2 + return ( + f"({l_chrom} = {r_chrom} " + f"AND {l_start} < {r_end} " + f"AND {l_end} > {r_start})" + ) + + elif op_type == "contains": + # Left contains right: chrom1 = chrom2 AND start1 <= start2 AND end1 >= end2 + return ( + f"({l_chrom} = {r_chrom} " + f"AND {l_start} <= {r_start} " + f"AND {l_end} >= {r_end})" + ) + + elif op_type == "within": + # Left within right: chrom1 = chrom2 AND start1 >= start2 AND end1 <= end2 + return ( + f"({l_chrom} = {r_chrom} " + f"AND {l_start} >= {r_start} " + f"AND {l_end} <= {r_end})" + ) + + raise ValueError(f"Unknown operation: {op_type}") + + def _generate_spatial_set(self, expression: SpatialSetPredicate) -> str: + """Generate SQL for spatial set predicates (ANY/ALL). + + Examples: + column INTERSECTS ANY(...) -> (condition1 OR condition2 OR ...) + column INTERSECTS ALL(...) -> (condition1 AND condition2 AND ...) + + :param expression: + SpatialSetPredicate expression node + :return: + SQL predicate string + """ + column_ref = self.sql(expression, "this") + operator = expression.args["operator"] + quantifier = expression.args["quantifier"] + ranges = expression.args["ranges"] + + # Parse all ranges + parsed_ranges = [] + for range_expr in ranges: + range_str = self.sql(range_expr).strip("'\"") + parsed_range = RangeParser.parse(range_str).to_zero_based_half_open() + parsed_ranges.append(parsed_range) + + op_type = operator.lower() + + # Generate conditions for each range + conditions = [] + for parsed_range in parsed_ranges: + condition = self._generate_range_predicate(column_ref, parsed_range, op_type) + conditions.append(condition) + + # Combine with AND (for ALL) or OR (for ANY) + combinator = " OR " if quantifier.upper() == "ANY" else " AND " + return "(" + combinator.join(conditions) + ")" + + def _detect_nearest_mode( + self, expression: GIQLNearest, parent_expression: Optional[exp.Expression] = None + ) -> str: + """Detect whether NEAREST is in standalone or correlated mode. + + :param expression: + GIQLNearest expression node + :param parent_expression: + Parent AST node (optional, used to detect LATERAL context) + :return: + "standalone" or "correlated" + """ + # Check if reference parameter is explicitly provided + reference = expression.args.get("reference") + + if reference: + # Explicit reference means standalone mode + return "standalone" + + # No explicit reference - check for LATERAL context + # In correlated mode, NEAREST appears in a LATERAL join context + # For now, default to correlated mode if no reference specified + # (validation will catch missing reference errors later) + return "correlated" + + def _find_outer_table_in_lateral_join( + self, expression: GIQLNearest + ) -> Optional[str]: + """Find the outer table name in a LATERAL join context. + + Walks up the AST to find the JOIN clause and extracts the outer table + that the LATERAL subquery is correlated with. 
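+
+        Example (illustrative query shape; alias names are hypothetical):
+
+            SELECT p.*, n.* FROM peaks p
+            CROSS JOIN LATERAL NEAREST(genes, k=3) n
+
+        would resolve the outer table to "p".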
+ + :param expression: + GIQLNearest expression node + :return: + Table name or alias of the outer table, or None if not found + """ + # Walk up the AST to find the JOIN + current = expression + while current: + parent = current.parent + if not parent: + break + + # Check if parent is a Lateral expression + if isinstance(parent, exp.Lateral): + # Continue up to find the Join + current = parent + continue + + # Check if parent is a Join + if isinstance(parent, exp.Join): + # The outer table is in the parent Select's FROM clause + # or in previous joins + select = parent.parent + if isinstance(select, exp.Select): + # Get the FROM clause + from_expr = select.args.get("from_") + if from_expr: + # Extract table from FROM + table_expr = from_expr.this + if isinstance(table_expr, exp.Table): + # Return alias if it exists, otherwise table name + return table_expr.alias or table_expr.name + elif isinstance(table_expr, exp.Alias): + return table_expr.alias + break + + current = parent + + return None + + def _resolve_nearest_reference( + self, expression: GIQLNearest, mode: str + ) -> tuple[str, str, str] | tuple[str, str, str, str]: + """Resolve the reference position for NEAREST queries. + + :param expression: + GIQLNearest expression node + :param mode: + "standalone" or "correlated" + :return: + Tuple of (chromosome, start, end) or (chromosome, start, end, strand) + Returns SQL expressions (column refs for correlated, literals for standalone) + :raises ValueError: + If reference is missing in standalone mode or invalid format + """ + reference = expression.args.get("reference") + + if mode == "standalone": + if not reference: + raise ValueError( + "NEAREST in standalone mode requires explicit reference parameter" + ) + + # Get SQL representation of reference + reference_sql = self.sql(reference) + + # Check if it's a literal range string + if reference_sql.startswith("'") or reference_sql.startswith('"'): + # Parse literal genomic range + range_str = reference_sql.strip("'\"") + try: + parsed_range = RangeParser.parse(range_str).to_zero_based_half_open() + # Return as SQL literals + return ( + f"'{parsed_range.chromosome}'", + str(parsed_range.start), + str(parsed_range.end), + ) + except Exception as e: + raise ValueError( + f"Could not parse reference genomic range: " + f"{range_str}. Error: {e}" + ) + else: + # Column reference - resolve via _get_column_refs + return self._get_column_refs(reference_sql, None) + + else: # correlated mode + if reference: + # Explicit reference in correlated mode (e.g., peaks.interval) + reference_sql = self.sql(reference) + return self._get_column_refs(reference_sql, None) + else: + # Implicit reference - resolve from outer table in LATERAL join + outer_table = self._find_outer_table_in_lateral_join(expression) + if not outer_table: + raise ValueError( + "Could not find outer table in LATERAL join context. " + "Please specify reference parameter explicitly." + ) + + # Look up the table's schema to find the genomic column + # Check if outer_table is an alias + actual_table = self._alias_to_table.get(outer_table, outer_table) + table_schema = self.schema_info.get_table(actual_table) + + if not table_schema: + raise ValueError( + f"Outer table '{outer_table}' not found in schema. " + "Please specify reference parameter explicitly." 
+ ) + + # Find the genomic column in the table schema + genomic_col_name = None + for col_info in table_schema.columns.values(): + if col_info.is_genomic: + genomic_col_name = col_info.name + break + + if not genomic_col_name: + raise ValueError( + f"No genomic column found in table '{outer_table}'. " + "Please specify reference parameter explicitly." + ) + + # Build column references using the outer table and genomic column + reference_sql = f"{outer_table}.{genomic_col_name}" + return self._get_column_refs(reference_sql, None) + + def _resolve_target_table( + self, expression: GIQLNearest + ) -> tuple[str, tuple[str, str, str]]: + """Resolve the target table name and its genomic column references. + + :param expression: + GIQLNearest expression node + :return: + Tuple of (table_name, (chromosome_col, start_col, end_col)) + :raises ValueError: + If target table is not found or doesn't have genomic columns + """ + # Extract target table from 'this' argument + target = expression.this + + if isinstance(target, exp.Table): + table_name = target.name + elif isinstance(target, exp.Column): + # If it's a column reference, extract table name + table_name = target.table if target.table else str(target.this) + else: + # Try to extract as string + table_name = str(target) + + # Look up table in schema + if not self.schema_info: + raise ValueError( + f"Cannot resolve target table '{table_name}': schema_info not available" + ) + + table_schema = self.schema_info.get_table(table_name) + if not table_schema: + raise ValueError( + f"Target table '{table_name}' not found in schema. " + f"Available tables: {list(self.schema_info.tables.keys())}" + ) + + # Find genomic column in target table + genomic_col = None + for col_info in table_schema.columns.values(): + if col_info.is_genomic: + genomic_col = col_info + break + + if not genomic_col: + raise ValueError( + f"Target table '{table_name}' does not have a genomic column" + ) + + # Get physical column names + chrom_col = genomic_col.chrom_col or DEFAULT_CHROM_COL + start_col = genomic_col.start_col or DEFAULT_START_COL + end_col = genomic_col.end_col or DEFAULT_END_COL + + return table_name, (chrom_col, start_col, end_col) + + def _get_column_refs( + self, + column_ref: str, + table_name: str | None = None, + include_strand: bool = False, + ) -> tuple[str, str, str] | tuple[str, str, str, str]: + """Get physical column names for genomic data. + + :param column_ref: + Logical column reference (e.g., 'v.interval' or 'interval') + :param table_name: + Table name to look up schema (optional, overrides extraction from column_ref) + :param include_strand: + If True, return 4-tuple with strand column; otherwise return 3-tuple + :return: + Tuple of (chromosome_col, start_col, end_col) or + (chromosome_col, start_col, end_col, strand_col) if include_strand=True + """ + # Default column names + chrom_col = DEFAULT_CHROM_COL + start_col = DEFAULT_START_COL + end_col = DEFAULT_END_COL + strand_col = DEFAULT_STRAND_COL + + # Extract table alias/name from column reference if present + table_alias = None + if "." 
in column_ref: + table_alias, _ = column_ref.rsplit(".", 1) + # If no explicit table_name provided, resolve alias to table name + if not table_name: + # Look up actual table name from alias + table_name = self._alias_to_table.get(table_alias, self._current_table) + + # Try to get custom column names from schema + if table_name and self.schema_info: + table_schema = self.schema_info.get_table(table_name) + if table_schema: + # Find the genomic column + for col_info in table_schema.columns.values(): + if col_info.is_genomic: + if col_info.chrom_col: + chrom_col = col_info.chrom_col + if col_info.start_col: + start_col = col_info.start_col + if col_info.end_col: + end_col = col_info.end_col + if col_info.strand_col: + strand_col = col_info.strand_col + break + + # Format with table alias if present + if table_alias: + base_cols = ( + f'{table_alias}."{chrom_col}"', + f'{table_alias}."{start_col}"', + f'{table_alias}."{end_col}"', + ) + if include_strand: + return base_cols + (f'{table_alias}."{strand_col}"',) + return base_cols + else: + base_cols = ( + f'"{chrom_col}"', + f'"{start_col}"', + f'"{end_col}"', + ) + if include_strand: + return base_cols + (f'"{strand_col}"',) + return base_cols diff --git a/src/giql/generators/duckdb.py b/src/giql/generators/duckdb.py new file mode 100644 index 0000000..d68c3fd --- /dev/null +++ b/src/giql/generators/duckdb.py @@ -0,0 +1,22 @@ +"""DuckDB-specific generator with optimizations. + +This module provides DuckDB-specific optimizations for GIQL query generation. +""" + +from sqlglot.dialects.duckdb import DuckDB + +from giql.generators.base import BaseGIQLGenerator + + +class GIQLDuckDBGenerator(BaseGIQLGenerator, DuckDB.Generator): + """DuckDB-specific optimizations. + + Can leverage: + - Efficient list operations + - STRUCT types + - Columnar optimizations + """ + + def __init__(self, schema_info=None, **kwargs): + BaseGIQLGenerator.__init__(self, schema_info=schema_info, **kwargs) + DuckDB.Generator.__init__(self, **kwargs) diff --git a/src/giql/generators/sqlite.py b/src/giql/generators/sqlite.py new file mode 100644 index 0000000..7396679 --- /dev/null +++ b/src/giql/generators/sqlite.py @@ -0,0 +1,25 @@ +"""SQLite-specific generator. + +This module provides SQLite-specific SQL generation for GIQL queries. +SQLite does not support LATERAL joins, so NEAREST uses window functions instead. +""" + +from sqlglot.dialects.sqlite import SQLite + +from giql.generators.base import BaseGIQLGenerator + + +class GIQLSQLiteGenerator(BaseGIQLGenerator, SQLite.Generator): + """SQLite-specific SQL generator. + + Key differences from other dialects: + - No LATERAL join support - uses window functions for NEAREST + - Window functions available since SQLite 3.25.0 (2018-09-15) + """ + + # SQLite does not support LATERAL joins + SUPPORTS_LATERAL = False + + def __init__(self, schema_info=None, **kwargs): + BaseGIQLGenerator.__init__(self, schema_info=schema_info, **kwargs) + SQLite.Generator.__init__(self, **kwargs) diff --git a/src/giql/protocols.py b/src/giql/protocols.py new file mode 100644 index 0000000..9002051 --- /dev/null +++ b/src/giql/protocols.py @@ -0,0 +1,81 @@ +"""Protocol definitions for GIQL. + +This module defines protocols for type checking and interface compatibility. +""" + +from typing import Any +from typing import Protocol +from typing import Sequence + + +class CursorLike(Protocol): + """Protocol for DB-API 2.0 compatible cursors. 
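+
+    A minimal consumption sketch (illustrative): code that only needs this
+    protocol can stay driver-agnostic, e.g.
+
+        def first_column(cursor: "CursorLike") -> list:
+            return [row[0] for row in cursor.fetchall()]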
+ + Based on PEP 249: https://peps.python.org/pep-0249/ + + This protocol defines the minimal interface required for database cursors + that can be used with GIQL. All DB-API 2.0 compliant drivers (SQLite, + PostgreSQL, MySQL, DuckDB) implement this interface. + """ + + @property + def description( + self, + ) -> ( + Sequence[ + tuple[str, Any, Any | None, Any | None, Any | None, Any | None, Any | None] + ] + | None + ): + """Column descriptions. + + A sequence of 7-tuples describing each column: + (name, type_code, display_size, internal_size, precision, scale, null_ok) + + Only 'name' is required; other values may be None. + Returns None if no operation has been performed yet. + """ + ... + + @property + def rowcount(self) -> int: + """Number of rows affected by last operation. + + Returns -1 if no operation has been performed or if the count + cannot be determined. + """ + ... + + def fetchone(self) -> tuple[Any, ...] | None: + """Fetch the next row of a query result set. + + Returns a tuple representing the next row, or None when no more + rows are available. + """ + ... + + def fetchmany(self, size: int = 1) -> list[tuple[Any, ...]]: + """Fetch the next set of rows of a query result set. + + Returns a list of tuples. An empty list is returned when no more + rows are available. + + :param size: + Number of rows to fetch (default: 1) + """ + ... + + def fetchall(self) -> list[tuple[Any, ...]]: + """Fetch all remaining rows of a query result set. + + Returns a list of tuples. An empty list is returned when no rows + are available. + """ + ... + + def close(self) -> None: + """Close the cursor. + + Makes the cursor unusable for further operations. + """ + ... diff --git a/src/giql/range_parser.py b/src/giql/range_parser.py new file mode 100644 index 0000000..7ad1c1c --- /dev/null +++ b/src/giql/range_parser.py @@ -0,0 +1,188 @@ +"""Parse genomic range strings into structured data. + +Supported formats: + - Simple: 'chr1:1000-2000' + - Explicit half-open: 'chr1:[1000,2000)' + - Explicit closed: 'chr1:[1001,2000]' + - With strand: 'chr1:1000-2000:+' + - Points: 'chr1:1500' +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass +from enum import Enum +from typing import Literal +from typing import Optional + + +class CoordinateSystem(Enum): + """Coordinate system for genomic ranges.""" + + ZERO_BASED = "0based" + ONE_BASED = "1based" + + +class IntervalType(Enum): + """Interval endpoint handling.""" + + HALF_OPEN = "half_open" # [start, end) + CLOSED = "closed" # [start, end] + + +@dataclass +class ParsedRange: + """Structured representation of a genomic range.""" + + chromosome: str + start: int + end: int + interval_type: IntervalType + strand: Optional[Literal["+", "-"]] = None + + def to_zero_based_half_open(self) -> ParsedRange: + """Convert to canonical 0-based half-open representation. + + Conversions: + - Closed [1000, 1999] -> Half-open [1000, 2000) + + :return: ParsedRange in 0-based half-open format + """ + if self.interval_type == IntervalType.HALF_OPEN: + return self + + # Closed to half-open: make end exclusive + return ParsedRange( + chromosome=self.chromosome, + start=self.start, + end=self.end + 1, + interval_type=IntervalType.HALF_OPEN, + strand=self.strand, + ) + + def length(self) -> int: + """Calculate range length. 
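+
+        Example: the half-open range chr1:[1000,2000) has length
+        2000 - 1000 = 1000, while the closed range chr1:[1000,2000]
+        has length 2000 - 1000 + 1 = 1001.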
+
+        :return: Length of the genomic range in base pairs
+        """
+        if self.interval_type == IntervalType.HALF_OPEN:
+            return self.end - self.start
+        else:
+            return self.end - self.start + 1
+
+
+class RangeParser:
+    """Parse genomic range strings."""
+
+    # chr1:1000-2000 or chr1:1000-2000:+
+    SIMPLE_PATTERN = re.compile(
+        r"^(?P<chr>[\w.]+):(?P<start>\d+)-(?P<end>\d+)(?::(?P<strand>[+-]))?$"
+    )
+
+    # chr1:[1000,2000) or chr1:[1000,2000]:+
+    EXPLICIT_PATTERN = re.compile(
+        r"^(?P<chr>[\w.]+):\[(?P<start>\d+),(?P<end>\d+)(?P<bracket>[\)\]])(?::(?P<strand>[+-]))?$"
+    )
+
+    # chr1:1500
+    POINT_PATTERN = re.compile(r"^(?P<chr>[\w.]+):(?P<pos>\d+)$")
+
+    @classmethod
+    def parse(cls, range_str: str) -> ParsedRange:
+        """Parse a genomic range string.
+
+        :param range_str: String like 'chr1:1000-2000'
+        :return: ParsedRange object
+        :raises ValueError: If the string cannot be parsed
+        """
+        range_str = range_str.strip().strip("'\"")
+
+        # Try point format
+        match = cls.POINT_PATTERN.match(range_str)
+        if match:
+            return cls._parse_point(match)
+
+        # Try explicit format
+        match = cls.EXPLICIT_PATTERN.match(range_str)
+        if match:
+            return cls._parse_explicit(match)
+
+        # Try simple format
+        match = cls.SIMPLE_PATTERN.match(range_str)
+        if match:
+            return cls._parse_simple(match)
+
+        raise ValueError(f"Invalid genomic range format: {range_str}")
+
+    @classmethod
+    def _parse_point(cls, match) -> ParsedRange:
+        """Parse point format: chr1:1500 -> [1500, 1501).
+
+        :param match: Regex match object
+        :return: ParsedRange representing a single base position
+        """
+        chromosome = match.group("chr")
+        position = int(match.group("pos"))
+
+        return ParsedRange(
+            chromosome=chromosome,
+            start=position,
+            end=position + 1,
+            interval_type=IntervalType.HALF_OPEN,
+            strand=None,
+        )
+
+    @classmethod
+    def _parse_explicit(cls, match) -> ParsedRange:
+        """Parse explicit format: chr1:[1000,2000).
+
+        :param match: Regex match object
+        :return: ParsedRange with explicit interval type
+        :raises ValueError: If start >= end
+        """
+        chromosome = match.group("chr")
+        start = int(match.group("start"))
+        end = int(match.group("end"))
+        bracket = match.group("bracket")
+        strand = match.group("strand")
+
+        if start >= end:
+            raise ValueError(f"Start must be less than end: {start} >= {end}")
+
+        interval_type = IntervalType.HALF_OPEN if bracket == ")" else IntervalType.CLOSED
+
+        return ParsedRange(
+            chromosome=chromosome,
+            start=start,
+            end=end,
+            interval_type=interval_type,
+            strand=strand,
+        )
+
+    @classmethod
+    def _parse_simple(cls, match) -> ParsedRange:
+        """Parse simple format: chr1:1000-2000.
+
+        :param match:
+            Regex match object
+        :return:
+            ParsedRange in half-open format
+        :raises ValueError:
+            If start >= end
+        """
+        chromosome = match.group("chr")
+        start = int(match.group("start"))
+        end = int(match.group("end"))
+        strand = match.group("strand")
+
+        if start >= end:
+            raise ValueError(f"Start must be less than end: {start} >= {end}")
+
+        return ParsedRange(
+            chromosome=chromosome,
+            start=start,
+            end=end,
+            interval_type=IntervalType.HALF_OPEN,
+            strand=strand,
+        )
diff --git a/src/giql/schema.py b/src/giql/schema.py
new file mode 100644
index 0000000..1b6e0d5
--- /dev/null
+++ b/src/giql/schema.py
@@ -0,0 +1,83 @@
+"""Schema information for transpilation.
+
+This module manages schema metadata for tables, including how genomic
+ranges are physically stored in the database.
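+
+For example (illustrative; the field values are hypothetical), an interval
+stored as three physical columns can be described as:
+
+    ColumnInfo(
+        name="interval",
+        type="GENOMIC",
+        is_genomic=True,
+        chrom_col="chromosome",
+        start_col="start_pos",
+        end_col="end_pos",
+        strand_col="strand",
+    )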
+""" + +from dataclasses import dataclass +from typing import Dict +from typing import Optional + +from giql.range_parser import CoordinateSystem +from giql.range_parser import IntervalType + + +@dataclass +class ColumnInfo: + """Information about a column.""" + + name: str + type: str + is_genomic: bool = False + # For genomic columns stored as separate fields + chrom_col: Optional[str] = None + start_col: Optional[str] = None + end_col: Optional[str] = None + strand_col: Optional[str] = None + # Coordinate system configuration for genomic columns + coordinate_system: CoordinateSystem = CoordinateSystem.ZERO_BASED + interval_type: IntervalType = IntervalType.HALF_OPEN + + +@dataclass +class TableSchema: + """Schema for a table.""" + + name: str + columns: Dict[str, ColumnInfo] + + +class SchemaInfo: + """Manages schema information for transpilation. + + Tracks how genomic ranges are stored: + - Separate columns (chromosome, start_pos, end_pos) + - STRUCT types + - Custom types + """ + + def __init__(self): + self.tables: Dict[str, TableSchema] = {} + + def register_table(self, name: str, schema: TableSchema): + """Register a table schema. + + :param name: Table name + :param schema: TableSchema object + """ + self.tables[name] = schema + + def get_table(self, name: str) -> Optional[TableSchema]: + """Get table schema by name. + + :param name: + Table name + :return: + TableSchema object or None if not found + """ + return self.tables.get(name) + + def get_column_info(self, table: str, column: str) -> Optional[ColumnInfo]: + """Get column information. + + :param table: + Table name + :param column: + Column name + :return: + ColumnInfo object or None if not found + """ + table_schema = self.get_table(table) + if table_schema: + return table_schema.columns.get(column) + return None diff --git a/src/giql/transformer.py b/src/giql/transformer.py new file mode 100644 index 0000000..2d9705f --- /dev/null +++ b/src/giql/transformer.py @@ -0,0 +1,582 @@ +"""Query transformers for GIQL operations. + +This module contains transformers that rewrite queries containing GIQL-specific +operations (like CLUSTER and MERGE) into equivalent SQL with CTEs. +""" + +from sqlglot import exp + +from giql.constants import DEFAULT_CHROM_COL +from giql.constants import DEFAULT_END_COL +from giql.constants import DEFAULT_START_COL +from giql.constants import DEFAULT_STRAND_COL +from giql.expressions import GIQLCluster +from giql.expressions import GIQLMerge +from giql.schema import SchemaInfo + + +class ClusterTransformer: + """Transforms queries containing CLUSTER into CTE-based queries. + + CLUSTER cannot be a simple window function because it requires nested + window functions (LAG inside SUM). Instead, we transform: + + SELECT *, CLUSTER(interval) AS cluster_id FROM features + + Into: + + WITH lag_calc AS ( + SELECT *, LAG(end_pos) OVER (...) AS prev_end FROM features + ) + SELECT *, SUM(CASE WHEN prev_end >= start_pos ...) AS cluster_id + FROM lag_calc + """ + + def __init__(self, schema_info: SchemaInfo): + """Initialize transformer. + + :param schema_info: + Schema information for column mapping + """ + self.schema_info = schema_info + + def _get_table_name(self, query: exp.Select) -> str | None: + """Extract table name from query's FROM clause. 
+ + :param query: + Query to extract table name from + :return: + Table name if FROM contains a simple table, None otherwise + """ + from_clause = query.args.get("from_") + if not from_clause: + return None + + if isinstance(from_clause.this, exp.Table): + return from_clause.this.name + + return None + + def _get_genomic_columns(self, query: exp.Select) -> tuple[str, str, str, str]: + """Get genomic column names from schema info or defaults. + + :param query: + Query to extract table and column info from + :return: + Tuple of (chrom_col, start_col, end_col, strand_col) + """ + table_name = self._get_table_name(query) + + # Default column names + chrom_col = DEFAULT_CHROM_COL + start_col = DEFAULT_START_COL + end_col = DEFAULT_END_COL + strand_col = DEFAULT_STRAND_COL + + if table_name: + table_schema = self.schema_info.get_table(table_name) + if table_schema: + # Find the genomic column + for col_info in table_schema.columns.values(): + if col_info.is_genomic: + if col_info.chrom_col: + chrom_col = col_info.chrom_col + if col_info.start_col: + start_col = col_info.start_col + if col_info.end_col: + end_col = col_info.end_col + if col_info.strand_col: + strand_col = col_info.strand_col + break + + return chrom_col, start_col, end_col, strand_col + + def transform(self, query: exp.Expression) -> exp.Expression: + """Transform query if it contains CLUSTER expressions. + + :param query: + Parsed query AST + :return: + Transformed query AST + """ + if not isinstance(query, exp.Select): + return query + + # First, recursively transform any CTEs that might contain CLUSTER + if query.args.get("with_"): + cte = query.args["with_"] + for cte_expr in cte.expressions: + if isinstance(cte_expr, exp.CTE): + # Transform the CTE's subquery + cte_expr.set("this", self.transform(cte_expr.this)) + + # Recursively transform subqueries in FROM clause + if query.args.get("from_"): + from_clause = query.args["from_"] + self._transform_subqueries_in_node(from_clause) + + # Recursively transform subqueries in JOIN clauses + if query.args.get("joins"): + for join in query.args["joins"]: + self._transform_subqueries_in_node(join) + + # Recursively transform subqueries in WHERE clause + if query.args.get("where"): + self._transform_subqueries_in_node(query.args["where"]) + + # Find all CLUSTER expressions in the SELECT clause + cluster_exprs = self._find_cluster_expressions(query) + + if not cluster_exprs: + return query + + # Transform query for each CLUSTER expression + for cluster_expr in cluster_exprs: + query = self._transform_for_cluster(query, cluster_expr) + + return query + + def _transform_subqueries_in_node(self, node: exp.Expression): + """Recursively transform subqueries within an expression node. + + :param node: + Expression node to search for subqueries + """ + # Find and transform any Subquery nodes + for subquery in node.find_all(exp.Subquery): + if isinstance(subquery.this, exp.Select): + transformed = self.transform(subquery.this) + subquery.set("this", transformed) + + def _find_cluster_expressions(self, query: exp.Select) -> list[GIQLCluster]: + """Find all CLUSTER expressions in query. 
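+
+        Matches both bare and aliased forms, e.g. CLUSTER(interval) and
+        CLUSTER(interval) AS cluster_id.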
+ + :param query: + Query to search + :return: + List of CLUSTER expressions + """ + cluster_exprs = [] + + for expression in query.expressions: + # Check if this is a CLUSTER expression or an alias containing one + if isinstance(expression, GIQLCluster): + cluster_exprs.append(expression) + elif isinstance(expression, exp.Alias): + if isinstance(expression.this, GIQLCluster): + cluster_exprs.append(expression.this) + + return cluster_exprs + + def _transform_for_cluster( + self, query: exp.Select, cluster_expr: GIQLCluster + ) -> exp.Select: + """Transform query to compute CLUSTER using CTEs. + + :param query: + Original query + :param cluster_expr: + CLUSTER expression to transform + :return: + Transformed query with CTEs + """ + # Extract CLUSTER parameters + distance_expr = cluster_expr.args.get("distance") + + # Handle distance parameter - could be int literal or None + if distance_expr: + if isinstance(distance_expr, exp.Literal): + distance = int(distance_expr.this) + else: + # Try to extract as string and convert + try: + distance = int(str(distance_expr.this)) + except (ValueError, AttributeError): + distance = 0 + else: + distance = 0 + + stranded_expr = cluster_expr.args.get("stranded") + if stranded_expr: + # Handle different types of boolean expressions + if isinstance(stranded_expr, exp.Boolean): + stranded = stranded_expr.this + elif isinstance(stranded_expr, exp.Literal): + stranded = str(stranded_expr.this).upper() == "TRUE" + else: + # Try to extract the value as a string + stranded = str(stranded_expr).upper() in ("TRUE", "1", "YES") + else: + stranded = False + + # Get column names from schema_info or use defaults + chrom_col, start_col, end_col, strand_col = self._get_genomic_columns(query) + + # Build partition clause + partition_cols = [exp.column(chrom_col, quoted=True)] + if stranded: + partition_cols.append(exp.column(strand_col, quoted=True)) + + # Build ORDER BY for window + order_by = [exp.Ordered(this=exp.column(start_col, quoted=True))] + + # Create LAG window spec + lag_window = exp.Window( + this=exp.Anonymous( + this="LAG", expressions=[exp.column(end_col, quoted=True)] + ), + partition_by=partition_cols, + order=exp.Order(expressions=order_by), + ) + + # Add distance offset if specified + if distance > 0: + lag_with_distance = exp.Add( + this=lag_window, expression=exp.Literal.number(distance) + ) + else: + lag_with_distance = lag_window + + # Create CASE expression for is_new_cluster + case_expr = exp.Case( + ifs=[ + exp.If( + this=exp.GTE( + this=lag_with_distance, + expression=exp.column(start_col, quoted=True), + ), + true=exp.Literal.number(0), + ) + ], + default=exp.Literal.number(1), + ) + + # Build CTE SELECT expressions (all original except CLUSTER, plus is_new_cluster) + cte_expressions = [] + for expression in query.expressions: + # Skip CLUSTER expressions + if isinstance(expression, GIQLCluster): + continue + elif isinstance(expression, exp.Alias) and isinstance( + expression.this, GIQLCluster + ): + continue + else: + cte_expressions.append(expression) + + # Ensure required columns for window functions are included + required_cols = {chrom_col, start_col, end_col} + if stranded: + required_cols.add(strand_col) + + # Check if required columns are already in the select list + selected_cols = set() + for expr in cte_expressions: + if isinstance(expr, exp.Column): + selected_cols.add(expr.name) + elif isinstance(expr, exp.Alias): + # Don't count aliases as the source column + pass + elif isinstance(expr, exp.Star): + # SELECT * includes all 
columns + selected_cols = required_cols # Assume all are covered + break + + # Add missing required columns + for col in required_cols - selected_cols: + cte_expressions.append(exp.column(col, quoted=True)) + + # Add is_new_cluster calculation + cte_expressions.append(exp.alias_(case_expr, "is_new_cluster", quoted=False)) + + # Build CTE query + cte_select = exp.Select() + cte_select.select(*cte_expressions, copy=False) + + # Copy FROM, WHERE, GROUP BY, HAVING from original (but not ORDER BY) + # Use copy() to avoid sharing references between queries + if query.args.get("from_"): + from_clause = query.args["from_"].copy() + cte_select.set("from_", from_clause) + if query.args.get("where"): + cte_select.set("where", query.args["where"].copy()) + if query.args.get("group"): + cte_select.set("group", query.args["group"].copy()) + if query.args.get("having"): + cte_select.set("having", query.args["having"].copy()) + + # Create outer query with SUM over is_new_cluster + sum_window = exp.Window( + this=exp.Sum(this=exp.column("is_new_cluster")), + partition_by=partition_cols, + order=exp.Order(expressions=order_by), + ) + + # Build outer SELECT expressions (replace CLUSTER with SUM) + new_expressions = [] + for expression in query.expressions: + if isinstance(expression, GIQLCluster): + new_expressions.append(sum_window) + elif isinstance(expression, exp.Alias) and isinstance( + expression.this, GIQLCluster + ): + # Keep the alias but replace the expression + new_expressions.append( + exp.alias_(sum_window, expression.alias, quoted=False) + ) + else: + new_expressions.append(expression) + + # Build new query + new_query = exp.Select() + new_query.select(*new_expressions, copy=False) + + # Wrap CTE in subquery and set as FROM clause + subquery = exp.Subquery( + this=cte_select, + alias=exp.TableAlias(this=exp.Identifier(this="lag_calc")), + ) + new_query.from_(subquery, copy=False) + + # Copy ORDER BY from original to outer query + if query.args.get("order"): + new_query.order_by(*query.args["order"].expressions, copy=False) + + return new_query + + +class MergeTransformer: + """Transforms queries containing MERGE into GROUP BY queries. + + MERGE combines overlapping intervals using CLUSTER + aggregation: + + SELECT MERGE(interval) FROM features + + Into: + + WITH clustered AS ( + SELECT *, CLUSTER(interval) AS __giql_cluster_id FROM features + ) + SELECT + chromosome, + MIN(start_pos) AS start_pos, + MAX(end_pos) AS end_pos + FROM clustered + GROUP BY chromosome, __giql_cluster_id + ORDER BY chromosome, start_pos + """ + + def __init__(self, schema_info: SchemaInfo): + """Initialize transformer. + + :param schema_info: + Schema information for column mapping + """ + self.schema_info = schema_info + self.cluster_transformer = ClusterTransformer(schema_info) + + def transform(self, query: exp.Expression) -> exp.Expression: + """Transform query if it contains MERGE expressions. 
+ + :param query: + Parsed query AST + :return: + Transformed query AST + """ + if not isinstance(query, exp.Select): + return query + + # First, recursively transform any CTEs that might contain MERGE + if query.args.get("with_"): + cte = query.args["with_"] + for cte_expr in cte.expressions: + if isinstance(cte_expr, exp.CTE): + # Transform the CTE's subquery + cte_expr.set("this", self.transform(cte_expr.this)) + + # Recursively transform subqueries in FROM clause + if query.args.get("from_"): + from_clause = query.args["from_"] + self._transform_subqueries_in_node(from_clause) + + # Recursively transform subqueries in JOIN clauses + if query.args.get("joins"): + for join in query.args["joins"]: + self._transform_subqueries_in_node(join) + + # Recursively transform subqueries in WHERE clause + if query.args.get("where"): + self._transform_subqueries_in_node(query.args["where"]) + + # Find all MERGE expressions in the SELECT clause + merge_exprs = self._find_merge_expressions(query) + + if not merge_exprs: + return query + + # For now, support only one MERGE expression + if len(merge_exprs) > 1: + raise ValueError("Multiple MERGE expressions not yet supported") + + merge_expr = merge_exprs[0] + return self._transform_for_merge(query, merge_expr) + + def _transform_subqueries_in_node(self, node: exp.Expression): + """Recursively transform subqueries within an expression node. + + :param node: + Expression node to search for subqueries + """ + # Find and transform any Subquery nodes + for subquery in node.find_all(exp.Subquery): + if isinstance(subquery.this, exp.Select): + transformed = self.transform(subquery.this) + subquery.set("this", transformed) + + def _find_merge_expressions(self, query: exp.Select) -> list[GIQLMerge]: + """Find all MERGE expressions in query. + + :param query: + Query to search + :return: + List of MERGE expressions + """ + merge_exprs = [] + + for expression in query.expressions: + if isinstance(expression, GIQLMerge): + merge_exprs.append(expression) + elif isinstance(expression, exp.Alias): + if isinstance(expression.this, GIQLMerge): + merge_exprs.append(expression.this) + + return merge_exprs + + def _transform_for_merge( + self, query: exp.Select, merge_expr: GIQLMerge + ) -> exp.Select: + """Transform query to compute MERGE using CLUSTER + GROUP BY. 
+ + :param query: + Original query + :param merge_expr: + MERGE expression to transform + :return: + Transformed query with clustering and aggregation + """ + # Extract MERGE parameters (same as CLUSTER) + distance_expr = merge_expr.args.get("distance") + stranded_expr = merge_expr.args.get("stranded") + + # Get column names from schema_info or use defaults + ( + chrom_col, + start_col, + end_col, + strand_col, + ) = self.cluster_transformer._get_genomic_columns(query) + + # Build CLUSTER expression with same parameters + cluster_kwargs = {"this": merge_expr.this} + if distance_expr: + cluster_kwargs["distance"] = distance_expr + if stranded_expr: + cluster_kwargs["stranded"] = stranded_expr + + cluster_expr = GIQLCluster(**cluster_kwargs) + + # Create intermediate query with CLUSTER + # Start with original query's FROM/WHERE/etc + cluster_query = exp.Select() + cluster_query.select(exp.Star(), copy=False) + cluster_query.select( + exp.alias_(cluster_expr, "__giql_cluster_id", quoted=False), + append=True, + copy=False, + ) + + # Copy FROM, WHERE from original + # Use copy() to avoid sharing references between queries + if query.args.get("from_"): + cluster_query.set("from_", query.args["from_"].copy()) + if query.args.get("where"): + cluster_query.set("where", query.args["where"].copy()) + + # Apply CLUSTER transformation to get the CTE-based query + cluster_query = self.cluster_transformer.transform(cluster_query) + + # Build GROUP BY columns + group_by_cols = [exp.column(chrom_col)] + + # Handle stranded parameter + if stranded_expr: + if isinstance(stranded_expr, exp.Boolean): + stranded = stranded_expr.this + elif isinstance(stranded_expr, exp.Literal): + stranded = str(stranded_expr.this).upper() == "TRUE" + else: + stranded = str(stranded_expr).upper() in ("TRUE", "1", "YES") + else: + stranded = False + + if stranded: + group_by_cols.append(exp.column(strand_col, quoted=True)) + + group_by_cols.append(exp.column("__giql_cluster_id")) + + # Build SELECT expressions for merged intervals + select_exprs = [] + + # Add group-by columns (non-aggregated) + select_exprs.append(exp.column(chrom_col, quoted=True)) + if stranded: + select_exprs.append(exp.column(strand_col, quoted=True)) + + # Add merged interval bounds + select_exprs.append( + exp.alias_( + exp.Min(this=exp.column(start_col, quoted=True)), start_col, quoted=False + ) + ) + select_exprs.append( + exp.alias_( + exp.Max(this=exp.column(end_col, quoted=True)), end_col, quoted=False + ) + ) + + # Process other columns from original SELECT + for expression in query.expressions: + # Skip the MERGE expression itself + if isinstance(expression, GIQLMerge): + continue + elif isinstance(expression, exp.Alias) and isinstance( + expression.this, GIQLMerge + ): + continue + # Include other columns (they should be aggregates or in GROUP BY) + else: + select_exprs.append(expression) + + # Build final query + final_query = exp.Select() + final_query.select(*select_exprs, copy=False) + + # FROM the clustered subquery + subquery = exp.Subquery( + this=cluster_query, + alias=exp.TableAlias(this=exp.Identifier(this="clustered")), + ) + final_query.from_(subquery, copy=False) + + # Add GROUP BY + final_query.group_by(*group_by_cols, copy=False) + + # Add ORDER BY (chromosome, start) + final_query.order_by( + exp.Ordered(this=exp.column(chrom_col, quoted=True)), copy=False + ) + final_query.order_by( + exp.Ordered(this=exp.column(start_col, quoted=True)), append=True, copy=False + ) + + return final_query diff --git a/tests/__init__.py 
b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..36b4f05 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,180 @@ +""" +Pytest fixtures for integration tests. +""" + +import pandas as pd +import pytest + +from giql import GIQLEngine + + +@pytest.fixture(scope="session") +def to_df(): + """Fixture providing a helper to convert cursors to DataFrames. + + Returns a function that materializes cursor results for testing. + Session-scoped since it's a pure function with no state. + + Usage: + result = to_df(engine.execute("SELECT ...")) + """ + + def _to_df(cursor): + if cursor.description: + columns = [desc[0] for desc in cursor.description] + return pd.DataFrame(cursor.fetchall(), columns=columns) + return pd.DataFrame() + + return _to_df + + +@pytest.fixture +def sample_variants_csv(tmp_path): + """Create sample variants CSV.""" + csv_content = """ + id,chromosome,start_pos,end_pos,ref,alt,quality + 1,chr1,1500,1600,A,T,30.0 + 2,chr1,10500,10600,G,C,40.0 + 3,chr1,15000,15100,T,A,25.0 + 4,chr2,500,600,C,G,35.0 + 5,chr2,5500,5600,A,T,20.0 + 6,chr1,25000,25100,G,A,35.0 + 7,chr2,15000,15100,T,C,28.0 + 8,chr3,1000,1100,A,G,32.0 + """ + csv_path = tmp_path / "variants.csv" + csv_path.write_text(csv_content.strip()) + return str(csv_path) + + +@pytest.fixture +def sample_genes_csv(tmp_path): + """Create sample genes CSV.""" + csv_content = """ + gene_id,name,chromosome,start_pos,end_pos,strand + 1,GENE1,chr1,1000,2000,+ + 2,GENE2,chr1,10000,11000,- + 3,GENE3,chr1,14000,16000,+ + 4,GENE4,chr2,400,700,+ + 5,GENE5,chr2,5000,6000,- + """ + csv_path = tmp_path / "genes.csv" + csv_path.write_text(csv_content.strip()) + return str(csv_path) + + +@pytest.fixture(params=["duckdb", "sqlite"]) +def engine_with_variants(request, sample_variants_csv): + """Create engine with loaded variants data for different dialects.""" + dialect = request.param + + engine = GIQLEngine(target_dialect=dialect, verbose=False) + engine.load_csv("variants", sample_variants_csv) + engine.register_table_schema( + "variants", + { + "id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "ref": "VARCHAR", + "alt": "VARCHAR", + "quality": "FLOAT", + }, + genomic_column="interval", + ) + + yield engine + engine.close() + + +@pytest.fixture +def duckdb_engine_with_data(sample_variants_csv, sample_genes_csv): + """DuckDB engine with both variants and genes loaded.""" + engine = GIQLEngine(target_dialect="duckdb", verbose=False) + engine.load_csv("variants", sample_variants_csv) + engine.load_csv("genes", sample_genes_csv) + + engine.register_table_schema( + "variants", + { + "id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "ref": "VARCHAR", + "alt": "VARCHAR", + "quality": "FLOAT", + }, + genomic_column="interval", + ) + + engine.register_table_schema( + "genes", + { + "gene_id": "INTEGER", + "name": "VARCHAR", + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "strand": "VARCHAR", + }, + genomic_column="interval", + ) + + yield engine + engine.close() + + +@pytest.fixture +def sample_peaks_csv(tmp_path): + """Create sample ChIP-seq peaks CSV for NEAREST testing.""" + csv_content = """ + peak_id,chromosome,start_pos,end_pos,signal + 1,chr1,5000,5200,100.5 + 2,chr1,12000,12100,85.2 + 3,chr1,20000,20500,120.8 + 4,chr2,3000,3100,95.3 + 5,chr2,8000,8200,110.7 + """ + csv_path = tmp_path / "peaks.csv" + 
csv_path.write_text(csv_content.strip()) + return str(csv_path) + + +@pytest.fixture +def engine_with_peaks_and_genes(request, sample_peaks_csv, sample_genes_csv): + """Create engine with peaks and genes loaded for NEAREST testing.""" + dialect = request.param if hasattr(request, "param") else "duckdb" + + engine = GIQLEngine(target_dialect=dialect, verbose=False) + engine.load_csv("peaks", sample_peaks_csv) + engine.load_csv("genes", sample_genes_csv) + + engine.register_table_schema( + "peaks", + { + "peak_id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "signal": "FLOAT", + }, + genomic_column="interval", + ) + + engine.register_table_schema( + "genes", + { + "gene_id": "INTEGER", + "name": "VARCHAR", + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "strand": "VARCHAR", + }, + genomic_column="interval", + ) + + yield engine + engine.close() diff --git a/tests/integration/bedtools/__init__.py b/tests/integration/bedtools/__init__.py new file mode 100644 index 0000000..0a2c30b --- /dev/null +++ b/tests/integration/bedtools/__init__.py @@ -0,0 +1,5 @@ +"""Bedtools integration tests for GIQL. + +This package contains integration tests that validate GIQL query results +against bedtools command outputs using simulated genomic datasets. +""" diff --git a/tests/integration/bedtools/conftest.py b/tests/integration/bedtools/conftest.py new file mode 100644 index 0000000..af9387f --- /dev/null +++ b/tests/integration/bedtools/conftest.py @@ -0,0 +1,46 @@ +"""Pytest fixtures for bedtools integration tests. + +This module provides shared fixtures for: +- DuckDB connections +- Interval generators +""" + +import pytest + +from .utils.data_models import IntervalGeneratorConfig +from .utils.interval_generator import IntervalGenerator + + +@pytest.fixture(scope="function") +def duckdb_connection(): + """Provide clean DuckDB connection for each test. + + Yields: + DuckDB connection to in-memory database + + Note: + Each test gets a fresh database with no shared state. + Connection is automatically closed after test. + """ + try: + import duckdb + except ImportError: + pytest.skip("DuckDB not installed. Install with: pip install duckdb") + + conn = duckdb.connect(":memory:") + yield conn + conn.close() + + +@pytest.fixture(scope="function") +def interval_generator(): + """Provide configured interval generator. + + Returns: + IntervalGenerator with deterministic seed + + Note: + Uses seed=42 for reproducible test data. + """ + config = IntervalGeneratorConfig(seed=42) + return IntervalGenerator(config) diff --git a/tests/integration/bedtools/test_intersect.py b/tests/integration/bedtools/test_intersect.py new file mode 100644 index 0000000..cfc4394 --- /dev/null +++ b/tests/integration/bedtools/test_intersect.py @@ -0,0 +1,313 @@ +"""Integration tests for GIQL INTERSECTS operator. + +These tests validate that GIQL's INTERSECTS operator produces identical +results to bedtools intersect command. 
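+
+GIQL compiles INTERSECTS to the half-open overlap test
+(a.start < b.end AND a.end > b.start), which matches bedtools'
+overlap semantics on 0-based, half-open BED intervals.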
+""" + +from giql import GIQLEngine + +from .utils.bed_export import load_intervals +from .utils.bedtools_wrapper import intersect +from .utils.comparison import compare_results +from .utils.data_models import GenomicInterval + + +def _setup_giql_engine(duckdb_connection): + """Helper to set up GIQL engine with table schemas.""" + engine = GIQLEngine(target_dialect="duckdb", verbose=False) + engine.conn = duckdb_connection + + schema = { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "name": "VARCHAR", + "score": "BIGINT", + "strand": "VARCHAR", + } + + engine.register_table_schema("intervals_a", schema, genomic_column="interval") + engine.register_table_schema("intervals_b", schema, genomic_column="interval") + + return engine + + +def test_intersect_basic_overlap(duckdb_connection, interval_generator): + """Test INTERSECTS predicate finds overlapping intervals. + + Given: + Two tables with genomic intervals where some intervals overlap + When: + A GIQL query uses INTERSECTS predicate in WHERE clause + Then: + Results match bedtools intersect output exactly + """ + # Arrange: Create overlapping intervals + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + GenomicInterval("chr1", 150, 250, "a2", 200, "+"), + GenomicInterval("chr1", 300, 400, "a3", 150, "-"), + ] + intervals_b = [ + GenomicInterval("chr1", 180, 220, "b1", 100, "+"), + GenomicInterval("chr1", 350, 450, "b2", 200, "-"), + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools + bedtools_result = intersect( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute GIQL query + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) + + +def test_intersect_partial_overlap(duckdb_connection, interval_generator): + """Test INTERSECTS with partially overlapping intervals. 
+ + Given: + Intervals with partial overlaps + When: + INTERSECTS query is executed + Then: + Results match bedtools partial overlap behavior + """ + # Arrange + intervals_a = [ + GenomicInterval("chr1", 100, 250, "a1", 100, "+"), + GenomicInterval("chr1", 300, 400, "a2", 200, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 200, 350, "b1", 150, "+"), + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools + bedtools_result = intersect( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute GIQL query + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) + + +def test_intersect_no_overlap(duckdb_connection, interval_generator): + """Test INTERSECTS with non-overlapping intervals. + + Given: + Two sets of intervals with no overlaps + When: + INTERSECTS query is executed + Then: + No results returned (matches bedtools empty output) + """ + # Arrange + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 300, 400, "b1", 150, "+"), + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools + bedtools_result = intersect( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute GIQL query + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) + + +def test_intersect_adjacent_intervals(duckdb_connection, interval_generator): + """Test INTERSECTS with adjacent (touching) intervals. 
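+
+    With half-open coordinates, [100,200) and [200,300) share no base:
+    the overlap test requires a.end > b.start, and 200 > 200 is false.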
+ + Given: + Intervals that touch but don't overlap + When: + INTERSECTS query is executed + Then: + No results returned (adjacent != overlapping) + """ + # Arrange: Adjacent intervals (end of a1 == start of b1) + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 200, 300, "b1", 150, "+"), + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools + bedtools_result = intersect( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute GIQL query + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) + + +def test_intersect_multiple_chromosomes(duckdb_connection, interval_generator): + """Test INTERSECTS across multiple chromosomes. + + Given: + Intervals on different chromosomes + When: + INTERSECTS query is executed + Then: + Only same-chromosome overlaps are returned + """ + # Arrange + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + GenomicInterval("chr2", 150, 250, "a2", 200, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 150, 250, "b1", 150, "+"), + GenomicInterval("chr2", 200, 300, "b2", 100, "+"), + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools + bedtools_result = intersect( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute GIQL query + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) diff --git a/tests/integration/bedtools/test_merge.py b/tests/integration/bedtools/test_merge.py new file mode 100644 index 0000000..51fea31 --- /dev/null +++ b/tests/integration/bedtools/test_merge.py @@ -0,0 +1,224 @@ +"""Integration tests for GIQL MERGE operator. 
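+
+Each test transpiles the same canonical query shape:
+
+    SELECT MERGE(interval)
+    FROM intervals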
+ +These tests validate that GIQL's MERGE operator produces identical +results to bedtools merge command. +""" + +from giql import GIQLEngine + +from .utils.bed_export import load_intervals +from .utils.bedtools_wrapper import merge +from .utils.comparison import compare_results +from .utils.data_models import GenomicInterval + + +def _setup_giql_engine(duckdb_connection): + """Helper to set up GIQL engine with table schema.""" + engine = GIQLEngine(target_dialect="duckdb", verbose=False) + engine.conn = duckdb_connection + + schema = { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "name": "VARCHAR", + "score": "BIGINT", + "strand": "VARCHAR", + } + + engine.register_table_schema( + "intervals", + schema, + genomic_column="interval", + ) + + return engine + + +def test_merge_adjacent_intervals(duckdb_connection): + """Test MERGE with adjacent intervals. + + Given: + A set of adjacent intervals + When: + MERGE operator is applied + Then: + Adjacent intervals are merged into single intervals + """ + # Arrange + intervals = [ + GenomicInterval("chr1", 100, 200, "i1", 100, "+"), + GenomicInterval("chr1", 200, 300, "i2", 150, "+"), + GenomicInterval("chr1", 300, 400, "i3", 200, "+"), + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals], + ) + + # Act: Execute bedtools operation using pybedtools + bedtools_result = merge( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals] + ) + + # Act: Execute GIQL query + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT MERGE(interval) + FROM intervals + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) + + +def test_merge_overlapping_intervals(duckdb_connection): + """Test MERGE with overlapping intervals. + + Given: + A set of overlapping intervals + When: + MERGE operator is applied + Then: + Overlapping intervals are merged + """ + # Arrange + intervals = [ + GenomicInterval("chr1", 100, 250, "i1", 100, "+"), + GenomicInterval("chr1", 200, 350, "i2", 150, "+"), + GenomicInterval("chr1", 300, 400, "i3", 200, "+"), + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals], + ) + + # Act: Execute bedtools operation using pybedtools + bedtools_result = merge( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals] + ) + + # Act: Execute GIQL query + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT MERGE(interval) + FROM intervals + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) + + +def test_merge_separated_intervals(duckdb_connection): + """Test MERGE with separated intervals. 
+ + Given: + Intervals with gaps between them + When: + MERGE operator is applied + Then: + Separated intervals remain separate + """ + # Arrange + intervals = [ + GenomicInterval("chr1", 100, 200, "i1", 100, "+"), + GenomicInterval("chr1", 300, 400, "i2", 150, "+"), + GenomicInterval("chr1", 500, 600, "i3", 200, "+"), + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals], + ) + + # Act: Execute bedtools operation using pybedtools + bedtools_result = merge( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals] + ) + + # Act: Execute GIQL query + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT MERGE(interval) + FROM intervals + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) + + +def test_merge_multiple_chromosomes(duckdb_connection): + """Test MERGE across multiple chromosomes. + + Given: + Intervals on different chromosomes + When: + MERGE operator is applied + Then: + Merging occurs per chromosome + """ + # Arrange + intervals = [ + GenomicInterval("chr1", 100, 200, "i1", 100, "+"), + GenomicInterval("chr1", 180, 300, "i2", 150, "+"), + GenomicInterval("chr2", 100, 200, "i3", 100, "+"), + GenomicInterval("chr2", 180, 300, "i4", 150, "+"), + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals], + ) + + # Act: Execute bedtools operation using pybedtools + bedtools_result = merge( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals] + ) + + # Act: Execute GIQL query + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT MERGE(interval) + FROM intervals + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) diff --git a/tests/integration/bedtools/test_nearest.py b/tests/integration/bedtools/test_nearest.py new file mode 100644 index 0000000..a185993 --- /dev/null +++ b/tests/integration/bedtools/test_nearest.py @@ -0,0 +1,267 @@ +"""Integration tests for GIQL NEAREST operator. + +These tests validate that GIQL's NEAREST operator produces identical +results to bedtools closest command. 
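+
+A typical query pairs each interval in table A with its nearest
+neighbor in table B:
+
+    SELECT a.*, b.*
+    FROM intervals_a a, NEAREST(intervals_b, k=1) b
+    ORDER BY a.chromosome, a.start_pos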
+""" + +from giql import GIQLEngine + +from .utils.bed_export import load_intervals +from .utils.bedtools_wrapper import closest +from .utils.comparison import compare_results +from .utils.data_models import GenomicInterval + + +def _setup_giql_engine(duckdb_connection): + """Helper to set up GIQL engine with table schemas.""" + engine = GIQLEngine(target_dialect="duckdb", verbose=False) + engine.conn = duckdb_connection + + schema = { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "name": "VARCHAR", + "score": "BIGINT", + "strand": "VARCHAR", + } + + engine.register_table_schema( + "intervals_a", + schema, + genomic_column="interval", + interval_type="closed", # Match bedtools distance calculation + ) + engine.register_table_schema( + "intervals_b", + schema, + genomic_column="interval", + interval_type="closed", # Match bedtools distance calculation + ) + + return engine + + +def test_nearest_non_overlapping(duckdb_connection): + """Test NEAREST with non-overlapping intervals. + + Given: + Two sets of non-overlapping intervals + When: + NEAREST operator is applied + Then: + Each interval in A finds its closest neighbor in B + """ + # Arrange + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + GenomicInterval("chr1", 500, 600, "a2", 150, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 250, 300, "b1", 100, "+"), + GenomicInterval("chr1", 350, 400, "b2", 150, "+"), + GenomicInterval("chr1", 700, 800, "b3", 200, "+"), + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools + bedtools_result = closest( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute GIQL query + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT a.*, b.* + FROM intervals_a a, NEAREST(intervals_b, k=1) b + ORDER BY a.chromosome, a.start_pos + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) + + +def test_nearest_multiple_candidates(duckdb_connection): + """Test NEAREST with equidistant intervals. 
+ + Given: + Interval in A with multiple equidistant intervals in B + When: + NEAREST operator is applied + Then: + Bedtools reports one of the equidistant intervals (tie-breaking behavior) + """ + # Arrange: a1 is equidistant from b1 and b2 + intervals_a = [ + GenomicInterval("chr1", 300, 400, "a1", 100, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 100, 200, "b1", 100, "+"), # Distance: 100 bp + GenomicInterval("chr1", 500, 600, "b2", 150, "+"), # Distance: 100 bp + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools + bedtools_result = closest( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute GIQL query + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT a.*, b.* + FROM intervals_a a, NEAREST(intervals_b, k=1) b + ORDER BY a.chromosome, a.start_pos + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results (allowing tie-breaking differences) + assert len(giql_result) == len(bedtools_result) + # The nearest interval is either b1 or b2 (both equidistant) + assert giql_result[0][3] == "a1" # Interval A name + assert giql_result[0][9] in ("b1", "b2") # Nearest could be either + + +def test_nearest_cross_chromosome(duckdb_connection): + """Test NEAREST across multiple chromosomes. + + Given: + Intervals on different chromosomes + When: + NEAREST operator is applied + Then: + Each interval finds nearest only on same chromosome + """ + # Arrange + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + GenomicInterval("chr2", 100, 200, "a2", 150, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 300, 400, "b1", 100, "+"), + GenomicInterval("chr2", 300, 400, "b2", 150, "+"), + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools + bedtools_result = closest( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute GIQL query + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT a.*, b.* + FROM intervals_a a, NEAREST(intervals_b, k=1) b + ORDER BY a.chromosome, a.start_pos + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) + + +def test_nearest_boundary_cases(duckdb_connection): + """Test NEAREST with boundary cases. 
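+
+    With half-open coordinates, a1 = [100, 200) and b1 = [200, 300) touch
+    at position 200, which bedtools closest reports as distance 0.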
+ + Given: + Adjacent intervals (touching but not overlapping) + When: + NEAREST operator is applied + Then: + Adjacent intervals are reported as nearest (distance = 0) + """ + # Arrange: a1 ends where b1 starts (adjacent, distance = 0) + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 200, 300, "b1", 150, "+"), # Adjacent to a1 + GenomicInterval("chr1", 500, 600, "b2", 200, "+"), # Far away + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools + bedtools_result = closest( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute GIQL query + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT a.*, b.* + FROM intervals_a a, NEAREST(intervals_b, k=1) b + ORDER BY a.chromosome, a.start_pos + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) diff --git a/tests/integration/bedtools/test_strand_aware.py b/tests/integration/bedtools/test_strand_aware.py new file mode 100644 index 0000000..11075c6 --- /dev/null +++ b/tests/integration/bedtools/test_strand_aware.py @@ -0,0 +1,471 @@ +"""Integration tests for GIQL strand-aware operations. + +These tests validate that GIQL correctly handles strand-specific interval +operations, matching bedtools behavior with -s and -S flags. +""" + +from giql import GIQLEngine + +from .utils.bed_export import load_intervals +from .utils.bedtools_wrapper import closest +from .utils.bedtools_wrapper import intersect +from .utils.bedtools_wrapper import merge +from .utils.comparison import compare_results +from .utils.data_models import GenomicInterval + + +def _setup_giql_engine(duckdb_connection): + """Helper to set up GIQL engine with table schemas.""" + engine = GIQLEngine(target_dialect="duckdb", verbose=False) + engine.conn = duckdb_connection + + schema = { + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "name": "VARCHAR", + "score": "BIGINT", + "strand": "VARCHAR", + } + + for table_name in ["intervals_a", "intervals_b", "intervals"]: + engine.register_table_schema( + table_name, + schema, + genomic_column="interval", + interval_type="closed", # Match bedtools distance calculation + ) + + return engine + + +def test_intersect_same_strand(duckdb_connection): + """Test INTERSECTS with same-strand requirement. 
+ + Given: + Intervals on both same and opposite strands + When: + INTERSECTS with same-strand requirement is applied + Then: + Only same-strand overlaps are reported + """ + # Arrange + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + GenomicInterval("chr1", 300, 400, "a2", 150, "-"), + ] + intervals_b = [ + GenomicInterval("chr1", 150, 250, "b1", 100, "+"), # Overlaps a1 (same +) + GenomicInterval("chr1", 350, 450, "b2", 150, "-"), # Overlaps a2 (same -) + GenomicInterval("chr1", 150, 250, "b3", 200, "-"), # Overlaps a1 (opposite) + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools with same-strand requirement + bedtools_result = intersect( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + strand_mode="same", + ) + + # Act: Execute GIQL query with same-strand filter + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand = b.strand + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) + + +def test_intersect_opposite_strand(duckdb_connection): + """Test INTERSECTS with opposite-strand requirement. 
+ + Given: + Intervals on both same and opposite strands + When: + INTERSECTS with opposite-strand requirement is applied + Then: + Only opposite-strand overlaps are reported + """ + # Arrange + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + GenomicInterval("chr1", 300, 400, "a2", 150, "-"), + ] + intervals_b = [ + GenomicInterval("chr1", 150, 250, "b1", 100, "-"), # Overlaps a1 (opposite) + GenomicInterval("chr1", 350, 450, "b2", 150, "+"), # Overlaps a2 (opposite) + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools with opposite-strand requirement + bedtools_result = intersect( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + strand_mode="opposite", + ) + + # Act: Execute GIQL query with opposite-strand filter + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand != b.strand + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) + + +def test_intersect_ignore_strand(duckdb_connection): + """Test INTERSECTS ignoring strand information. 
+ + Given: + Intervals with various strand combinations + When: + INTERSECTS without strand requirements is applied + Then: + All overlaps are reported regardless of strand + """ + # Arrange + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 150, 250, "b1", 100, "+"), # Same strand + GenomicInterval("chr1", 150, 250, "b2", 150, "-"), # Opposite strand + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools without strand requirements + bedtools_result = intersect( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute GIQL query without strand filter + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) + + +def test_intersect_mixed_strands(duckdb_connection): + """Test INTERSECTS with mixed strand scenarios. + + Given: + Complex scenario with +, -, and unstranded intervals + When: + INTERSECTS with same-strand requirement is applied + Then: + Results correctly handle strand matching logic + """ + # Arrange + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + GenomicInterval("chr1", 300, 400, "a2", 150, "-"), + GenomicInterval("chr1", 500, 600, "a3", 200, "."), # Unstranded + ] + intervals_b = [ + GenomicInterval("chr1", 150, 250, "b1", 100, "+"), + GenomicInterval("chr1", 350, 450, "b2", 150, "-"), + GenomicInterval("chr1", 550, 650, "b3", 200, "."), + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools with same-strand requirement + bedtools_result = intersect( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + strand_mode="same", + ) + + # Act: Execute GIQL query with same-strand filter + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand = b.strand + AND a.strand != '.' + AND b.strand != '.' 
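+          -- Exclude unstranded ('.') records to mirror bedtools -s, which
+          -- treats '.' as an unknown strand and reports no match for it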
+ """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) + + +def test_nearest_same_strand(duckdb_connection): + """Test NEAREST with same-strand requirement. + + Given: + Intervals with candidates on same and opposite strands + When: + NEAREST with same-strand requirement is applied + Then: + Only same-strand nearest intervals are reported + """ + # Arrange + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 250, 300, "b1", 100, "+"), # Nearest on same strand + GenomicInterval("chr1", 220, 240, "b2", 150, "-"), # Closer but opposite + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools with same-strand requirement + bedtools_result = closest( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + strand_mode="same", + ) + + # Act: Execute GIQL query with same-strand NEAREST + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT a.*, b.* + FROM intervals_a a, NEAREST(intervals_b, k=1, stranded=true) b + ORDER BY a.chromosome, a.start_pos + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) + + +def test_nearest_opposite_strand(duckdb_connection): + """Test NEAREST with opposite-strand requirement. 
+ + Given: + Intervals with candidates on same and opposite strands + When: + NEAREST with opposite-strand requirement is applied + Then: + Only opposite-strand nearest intervals are reported + """ + # Arrange + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 250, 300, "b1", 100, "-"), # Nearest opposite strand + GenomicInterval("chr1", 220, 240, "b2", 150, "+"), # Closer but same strand + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools with opposite-strand requirement + bedtools_result = closest( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + strand_mode="opposite", + ) + + # Note: GIQL may not have direct opposite-strand support + # This test documents the expected behavior + assert len(bedtools_result) == 1 + assert bedtools_result[0][3] == "a1" + assert bedtools_result[0][9] == "b1" + + +def test_nearest_ignore_strand(duckdb_connection): + """Test NEAREST ignoring strand information. + + Given: + Intervals on different strands + When: + NEAREST without strand requirements is applied + Then: + Closest interval is found regardless of strand + """ + # Arrange + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 250, 300, "b1", 100, "+"), + GenomicInterval("chr1", 220, 240, "b2", 150, "-"), # Closer + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals_a", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute bedtools operation using pybedtools without strand requirements + bedtools_result = closest( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], + ) + + # Act: Execute GIQL query without strand filter + engine = _setup_giql_engine(duckdb_connection) + giql_query = """ + SELECT a.*, b.* + FROM intervals_a a, NEAREST(intervals_b, k=1) b + ORDER BY a.chromosome, a.start_pos + """ + sql = engine.transpile(giql_query) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Assert: Compare GIQL and bedtools results + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, ( + f"GIQL results don't match bedtools:\n" + f"Differences: {comparison.differences}\n" + f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" + ) + + +def test_merge_strand_specific(duckdb_connection): + """Test MERGE with strand-specific behavior. 
+ + Given: + Overlapping intervals on different strands + When: + MERGE with strand-specific flag is applied + Then: + Intervals are merged per-strand (same-strand intervals merge together) + """ + # Arrange - overlapping intervals on both strands + intervals = [ + GenomicInterval("chr1", 100, 200, "i1", 100, "+"), + GenomicInterval("chr1", 150, 250, "i2", 150, "+"), # Overlaps i1 (same +) + GenomicInterval("chr1", 120, 180, "i3", 200, "-"), # Overlaps i1 (opposite) + GenomicInterval("chr1", 160, 240, "i4", 100, "-"), # Overlaps i2 (opposite) + ] + + # Load into DuckDB + load_intervals( + duckdb_connection, + "intervals", + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals], + ) + + # Act: Execute bedtools operation using pybedtools with strand-specific merging + bedtools_result = merge( + [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals], + strand_mode="same", + ) + + # Note: GIQL MERGE with strand grouping would require GROUP BY strand + # This test documents the expected behavior + assert len(bedtools_result) >= 2 # At least one per strand diff --git a/tests/integration/bedtools/utils/__init__.py b/tests/integration/bedtools/utils/__init__.py new file mode 100644 index 0000000..99a414e --- /dev/null +++ b/tests/integration/bedtools/utils/__init__.py @@ -0,0 +1 @@ +"""Utilities for bedtools integration testing.""" diff --git a/tests/integration/bedtools/utils/bed_export.py b/tests/integration/bedtools/utils/bed_export.py new file mode 100644 index 0000000..cd5a5c8 --- /dev/null +++ b/tests/integration/bedtools/utils/bed_export.py @@ -0,0 +1,40 @@ +"""DuckDB loading utilities for genomic intervals. + +This module provides functions for loading genomic intervals into DuckDB tables. +""" + +from typing import List +from typing import Tuple + + +def load_intervals( + conn, + table_name: str, + intervals: List[Tuple[str, int, int, str | None, int | None, str | None]], +): + """Load intervals into DuckDB table. + + Args: + conn: DuckDB connection + table_name: Name of table to create + intervals: List of (chrom, start, end, name, score, strand) tuples + where name, score, and strand can be None + + Note: + Creates a new table with GIQL's default column names for genomic data: + chromosome, start_pos, end_pos, name, score, strand + """ + # Create table with GIQL's default column names + conn.execute(f""" + CREATE TABLE {table_name} ( + chromosome VARCHAR, + start_pos INTEGER, + end_pos INTEGER, + name VARCHAR, + score INTEGER, + strand VARCHAR + ) + """) + + # Insert intervals + conn.executemany(f"INSERT INTO {table_name} VALUES (?,?,?,?,?,?)", intervals) diff --git a/tests/integration/bedtools/utils/bedtools_wrapper.py b/tests/integration/bedtools/utils/bedtools_wrapper.py new file mode 100644 index 0000000..699185b --- /dev/null +++ b/tests/integration/bedtools/utils/bedtools_wrapper.py @@ -0,0 +1,293 @@ +"""Pybedtools wrapper for genomic interval operations. 
+
+This module provides functions for:
+- Creating BedTool objects from interval data
+- Executing bedtools operations via pybedtools
+- Converting results to comparable formats
+"""
+
+from typing import List
+from typing import Tuple
+
+import pybedtools
+
+# Strand mode values accepted by the wrapper functions below; they map to
+# the bedtools -s (same strand) and -S (opposite strand) flags
+STRAND_SAME = "same"
+STRAND_OPPOSITE = "opposite"
+
+
+class BedtoolsError(Exception):
+    """Raised when bedtools operation fails."""
+
+    pass
+
+
+def create_bedtool(intervals: List[Tuple]) -> pybedtools.BedTool:
+    """Create BedTool object from interval tuples.
+
+    Args:
+        intervals: List of tuples, each containing:
+            - (chrom, start, end) for BED3 format
+            - (chrom, start, end, name, score, strand) for BED6 format
+
+    Returns:
+        pybedtools.BedTool object
+
+    Example:
+        >>> intervals = [("chr1", 100, 200, "a1", 100, "+")]
+        >>> bt = create_bedtool(intervals)
+    """
+    # Convert tuples to BED format strings
+    bed_strings = []
+    for interval in intervals:
+        if len(interval) == 3:
+            # BED3 format
+            bed_strings.append(f"{interval[0]}\t{interval[1]}\t{interval[2]}")
+        elif len(interval) >= 6:
+            # BED6 format
+            chrom, start, end, name, score, strand = interval[:6]
+            # Handle None values
+            name = name if name is not None else "."
+            score = score if score is not None else 0
+            strand = strand if strand is not None else "."
+            bed_strings.append(f"{chrom}\t{start}\t{end}\t{name}\t{score}\t{strand}")
+        else:
+            raise ValueError(f"Invalid interval format: {interval}")
+
+    bed_string = "\n".join(bed_strings)
+    return pybedtools.BedTool(bed_string, from_string=True)
+
+
+def intersect(
+    intervals_a: List[Tuple],
+    intervals_b: List[Tuple],
+    strand_mode: str | None = None,
+) -> List[Tuple]:
+    """Find overlapping intervals using bedtools intersect.
+
+    Args:
+        intervals_a: First set of intervals
+        intervals_b: Second set of intervals
+        strand_mode: Strand requirement ('same', 'opposite', or None for ignore)
+
+    Returns:
+        List of tuples matching intervals_a format
+
+    Example:
+        >>> a = [("chr1", 100, 200, "a1", 100, "+")]
+        >>> b = [("chr1", 150, 250, "b1", 100, "+")]
+        >>> result = intersect(a, b)
+    """
+    try:
+        bt_a = create_bedtool(intervals_a)
+        bt_b = create_bedtool(intervals_b)
+
+        # Build kwargs for intersect
+        # Use -u (unique) to return each interval from A only once
+        # This matches GIQL's DISTINCT behavior
+        kwargs = {"u": True}
+        if strand_mode == "same":
+            kwargs["s"] = True
+        elif strand_mode == "opposite":
+            kwargs["S"] = True
+
+        # Perform intersection
+        result = bt_a.intersect(bt_b, **kwargs)
+
+        # Convert to tuples
+        return bedtool_to_tuples(result)
+
+    except Exception as e:
+        raise BedtoolsError(f"Intersect operation failed: {e}")
+
+
+def merge(intervals: List[Tuple], strand_mode: str | None = None) -> List[Tuple]:
+    """Merge overlapping intervals using bedtools merge.
+
+    Args:
+        intervals: List of intervals to merge
+        strand_mode: Strand requirement ('same' to merge per-strand, None for ignore)
+
+    Returns:
+        List of tuples in BED3 format (chrom, start, end)
+
+    Example:
+        >>> intervals = [
+        ...     ("chr1", 100, 200, "a1", 100, "+"),
+        ...     ("chr1", 180, 300, "a2", 100, "+"),
+        ... 
] + >>> result = merge(intervals) + >>> # Returns: [("chr1", 100, 300)] + """ + try: + bt = create_bedtool(intervals) + + # Sort before merging (required by bedtools merge) + bt_sorted = bt.sort() + + # Build kwargs for merge + kwargs = {} + if strand_mode == "same": + kwargs["s"] = True + + # Perform merge + result = bt_sorted.merge(**kwargs) + + # Convert to tuples (merge returns BED3 format) + return bedtool_to_tuples(result, format="bed3") + + except Exception as e: + raise BedtoolsError(f"Merge operation failed: {e}") + + +def closest( + intervals_a: List[Tuple], + intervals_b: List[Tuple], + strand_mode: str | None = None, + k: int = 1, +) -> List[Tuple]: + """Find closest intervals using bedtools closest. + + Args: + intervals_a: Query intervals + intervals_b: Database intervals to search + strand_mode: Strand requirement ('same', 'opposite', or None for ignore) + k: Number of closest intervals to report (default: 1) + + Returns: + List of tuples with format: (a_fields..., b_fields..., distance) + + Example: + >>> a = [("chr1", 100, 200, "a1", 100, "+")] + >>> b = [("chr1", 300, 400, "b1", 100, "+")] + >>> result = closest(a, b) + >>> # Returns intervals from a and b with distance + """ + try: + bt_a = create_bedtool(intervals_a) + bt_b = create_bedtool(intervals_b) + + # Sort inputs (required for -t flag) + bt_a = bt_a.sort() + bt_b = bt_b.sort() + + # Build kwargs for closest + kwargs = {"d": True, "t": "first"} # Report distance, break ties by taking first + if k > 1: + kwargs["k"] = k + if strand_mode == "same": + kwargs["s"] = True + elif strand_mode == "opposite": + kwargs["S"] = True + + # Perform closest + result = bt_a.closest(bt_b, **kwargs) + + # Convert to tuples (closest returns concatenated fields + distance) + return bedtool_to_tuples(result, format="closest") + + except Exception as e: + raise BedtoolsError(f"Closest operation failed: {e}") + + +def bedtool_to_tuples(bedtool: pybedtools.BedTool, format: str = "bed6") -> List[Tuple]: + """Convert BedTool object to list of tuples. + + Args: + bedtool: pybedtools.BedTool object + format: Expected format ('bed3', 'bed6', or 'closest') + + Returns: + List of tuples matching the format + + Note: + - bed3: (chrom, start, end) + - bed6: (chrom, start, end, name, score, strand) + - closest: (chrom_a, start_a, end_a, name_a, score_a, strand_a, + chrom_b, start_b, end_b, name_b, score_b, strand_b, distance) + """ + rows = [] + + for interval in bedtool: + fields = interval.fields + + if format == "bed3": + chrom = fields[0] + start = int(fields[1]) + end = int(fields[2]) + rows.append((chrom, start, end)) + + elif format == "bed6": + if len(fields) < 6: + # Pad with defaults if needed + while len(fields) < 6: + if len(fields) == 3: + fields.append(".") # name + elif len(fields) == 4: + fields.append("0") # score + elif len(fields) == 5: + fields.append(".") # strand + + chrom = fields[0] + start = int(fields[1]) + end = int(fields[2]) + name = fields[3] if fields[3] != "." else None + score = int(fields[4]) if fields[4] != "." else None + strand = fields[5] if fields[5] != "." 
else None
+
+            rows.append((chrom, start, end, name, score, strand))
+
+        elif format == "closest":
+            # Closest returns: a_fields + b_fields + distance
+            # For BED6: 6 fields for a, 6 fields for b, 1 distance = 13 total
+            if len(fields) >= 13:
+                # Parse all fields as-is, converting appropriate ones to int
+                row = []
+                for i, field in enumerate(fields):
+                    # Positions (1, 2, 7, 8) and distance (12) should be int
+                    if i in (1, 2, 7, 8, 12):
+                        row.append(int(field))
+                    # Scores (4, 10) should be int if not "."
+                    elif i in (4, 10):
+                        row.append(int(field) if field != "." else None)
+                    # Names (3, 9) and strands (5, 11) should be None if "."
+                    elif i in (3, 5, 9, 11):
+                        row.append(field if field != "." else None)
+                    else:
+                        row.append(field)
+                rows.append(tuple(row))
+            else:
+                raise ValueError(
+                    f"Unexpected number of fields for closest: {len(fields)}"
+                )
+
+        else:
+            raise ValueError(f"Unsupported format: {format}")
+
+    return rows
+
+
+def add_strand_flag(kwargs: dict, strand_mode: str | None) -> dict:
+    """Add strand flag to bedtools kwargs.
+
+    Args:
+        kwargs: Base kwargs dictionary
+        strand_mode: Strand requirement ('same', 'opposite', or None for ignore)
+
+    Returns:
+        Updated kwargs dictionary with strand flag
+
+    Example:
+        >>> kwargs = add_strand_flag({}, "same")
+        >>> # Returns: {"s": True}
+    """
+    updated_kwargs = kwargs.copy()
+
+    if strand_mode == "same":
+        updated_kwargs["s"] = True
+    elif strand_mode == "opposite":
+        updated_kwargs["S"] = True
+    # None or other values = ignore strand (no flag added)
+
+    return updated_kwargs
diff --git a/tests/integration/bedtools/utils/comparison.py b/tests/integration/bedtools/utils/comparison.py
new file mode 100644
index 0000000..caa4bd2
--- /dev/null
+++ b/tests/integration/bedtools/utils/comparison.py
@@ -0,0 +1,134 @@
+"""Result comparison logic for GIQL vs bedtools outputs.
+
+This module provides functions for:
+- Comparing GIQL and bedtools results with appropriate tolerance
+- Order-independent row sorting
+- Epsilon-based float comparison
+"""
+
+from typing import Any
+from typing import List
+from typing import Tuple
+
+from .data_models import ComparisonResult
+
+
+def _sort_key(row: Tuple) -> Tuple:
+    """Generate sort key for order-independent comparison.
+
+    Args:
+        row: Result row tuple
+
+    Returns:
+        Sortable tuple (handles None values)
+    """
+    # Sort on (is_None, str(value)) pairs so rows mixing None, int, and str
+    # values never raise a TypeError during sorting
+    return tuple((v is None, str(v)) for v in row)
+
+
+def _values_match(val1: Any, val2: Any, epsilon: float = 1e-9) -> bool:
+    """Compare two values with appropriate tolerance.
+
+    Args:
+        val1: First value
+        val2: Second value
+        epsilon: Tolerance for floating-point comparisons
+
+    Returns:
+        True if values match within tolerance
+    """
+    # Handle None values
+    if val1 is None and val2 is None:
+        return True
+    if val1 is None or val2 is None:
+        return False
+
+    # Float comparison with epsilon
+    if isinstance(val1, float) or isinstance(val2, float):
+        try:
+            return abs(float(val1) - float(val2)) <= epsilon
+        except (ValueError, TypeError):
+            return False
+
+    # Exact match for other types
+    return val1 == val2
+
+
+def compare_results(
+    giql_rows: List[Tuple], bedtools_rows: List[Tuple], epsilon: float = 1e-9
+) -> ComparisonResult:
+    """Compare GIQL and bedtools results with appropriate tolerance.
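+
+    Example:
+        >>> compare_results([("chr1", 100, 200)], [("chr1", 100, 200)]).match
+        True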
+ + Comparison rules: + - Integer positions/counts: exact match required + - Floating-point values: epsilon tolerance + - Row ordering: order-independent (sorts both result sets) + + Args: + giql_rows: Rows from GIQL query execution + bedtools_rows: Rows from bedtools output + epsilon: Tolerance for floating-point comparisons + + Returns: + ComparisonResult with match status and differences + """ + giql_count = len(giql_rows) + bedtools_count = len(bedtools_rows) + + # Sort both result sets for order-independent comparison + giql_sorted = sorted(giql_rows, key=_sort_key) + bedtools_sorted = sorted(bedtools_rows, key=_sort_key) + + differences = [] + + # Check row counts + if giql_count != bedtools_count: + differences.append( + f"Row count mismatch: GIQL has {giql_count} rows, " + f"bedtools has {bedtools_count} rows" + ) + + # Compare rows + max_rows = max(giql_count, bedtools_count) + for i in range(max_rows): + # Check if row exists in both + if i >= giql_count: + differences.append( + f"Row {i}: Missing in GIQL, present in bedtools: {bedtools_sorted[i]}" + ) + continue + if i >= bedtools_count: + differences.append( + f"Row {i}: Present in GIQL, missing in bedtools: {giql_sorted[i]}" + ) + continue + + giql_row = giql_sorted[i] + bedtools_row = bedtools_sorted[i] + + # Check column counts + if len(giql_row) != len(bedtools_row): + differences.append( + f"Row {i}: Column count mismatch " + f"(GIQL: {len(giql_row)} cols, bedtools: {len(bedtools_row)} cols)" + ) + continue + + # Compare each column + for col_idx, (giql_val, bedtools_val) in enumerate(zip(giql_row, bedtools_row)): + if not _values_match(giql_val, bedtools_val, epsilon): + differences.append( + f"Row {i}, col {col_idx}: " + f"GIQL={giql_val!r} != bedtools={bedtools_val!r}" + ) + + # Determine match status + match = len(differences) == 0 + + return ComparisonResult( + match=match, + giql_row_count=giql_count, + bedtools_row_count=bedtools_count, + differences=differences, + comparison_metadata={"epsilon": epsilon, "sorted": True}, + ) diff --git a/tests/integration/bedtools/utils/data_models.py b/tests/integration/bedtools/utils/data_models.py new file mode 100644 index 0000000..dad0832 --- /dev/null +++ b/tests/integration/bedtools/utils/data_models.py @@ -0,0 +1,259 @@ +"""Data models for bedtools integration testing. + +This module defines the core data structures used throughout the test suite: +- GenomicInterval: Represents a single genomic interval +- SimulatedDataset: Collection of intervals for testing +- ComparisonResult: Result of comparing GIQL vs bedtools outputs +- IntervalGeneratorConfig: Configuration for dataset generation +- BedtoolsVersion: Bedtools version information +""" + +import re +from dataclasses import dataclass +from dataclasses import field +from pathlib import Path +from typing import Dict +from typing import List + + +@dataclass +class GenomicInterval: + """Represents a single genomic interval with all BED file fields. 
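+
+    Coordinates follow the BED convention (0-based, half-open), so the
+    first 100 bases of chr1 are GenomicInterval("chr1", 0, 100).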
+
+    Attributes:
+        chrom: Chromosome name (e.g., "chr1", "chr2", "chrX")
+        start: Start position (0-based, inclusive)
+        end: End position (0-based, exclusive)
+        name: Optional interval name/identifier
+        score: Optional score value (0-1000)
+        strand: Optional strand ("+", "-", or ".")
+    """
+
+    chrom: str
+    start: int
+    end: int
+    name: str | None = None
+    score: int | None = None
+    strand: str | None = None
+
+    def __post_init__(self):
+        """Validate interval fields."""
+        if self.start >= self.end:
+            raise ValueError(
+                f"Invalid interval: start ({self.start}) >= end ({self.end})"
+            )
+        if self.start < 0:
+            raise ValueError(f"Invalid interval: start ({self.start}) < 0")
+        if self.strand and self.strand not in ["+", "-", "."]:
+            raise ValueError(f"Invalid strand: {self.strand}")
+        if self.score is not None and not (0 <= self.score <= 1000):
+            raise ValueError(f"Invalid score: {self.score}")
+
+    def to_bed_line(self, format="bed6") -> str:
+        """Convert to BED format line.
+
+        Args:
+            format: Output format ('bed3' or 'bed6')
+
+        Returns:
+            Tab-separated BED format string
+        """
+        if format == "bed3":
+            return f"{self.chrom}\t{self.start}\t{self.end}"
+        elif format == "bed6":
+            name = self.name or "."
+            score = self.score if self.score is not None else 0
+            strand = self.strand or "."
+            return f"{self.chrom}\t{self.start}\t{self.end}\t{name}\t{score}\t{strand}"
+        else:
+            raise ValueError(f"Unsupported BED format: {format}")
+
+
+@dataclass
+class SimulatedDataset:
+    """Collection of genomic intervals with controlled properties for testing.
+
+    Attributes:
+        name: Dataset identifier (e.g., "intervals_a", "intervals_b")
+        intervals: List of genomic intervals
+        scenario_type: Scenario descriptor (e.g., "overlapping", "adjacent")
+        metadata: Generation parameters (seed, chromosome_count, etc.)
+    """
+
+    name: str
+    intervals: List[GenomicInterval]
+    scenario_type: str
+    metadata: dict = field(default_factory=dict)
+
+    def __post_init__(self):
+        """Validate dataset has at least one interval."""
+        if len(self.intervals) == 0:
+            raise ValueError("Dataset must contain at least one interval")
+
+    def to_bed_file(self, path: Path, format="bed6"):
+        """Export to BED file.
+
+        Args:
+            path: Output file path
+            format: BED format ('bed3' or 'bed6')
+        """
+        with open(path, "w") as f:
+            for interval in self.intervals:
+                f.write(interval.to_bed_line(format) + "\n")
+
+    def to_duckdb_table(self, conn, table_name: str):
+        """Load into DuckDB table.
+
+        Args:
+            conn: DuckDB connection
+            table_name: Name of table to create
+        """
+        rows = [
+            (i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in self.intervals
+        ]
+        # Quote "start" and "end": END is a reserved SQL keyword
+        conn.execute(f"""
+            CREATE TABLE {table_name} (
+                chrom VARCHAR,
+                "start" INTEGER,
+                "end" INTEGER,
+                name VARCHAR,
+                score INTEGER,
+                strand VARCHAR
+            )
+        """)
+        conn.executemany(f"INSERT INTO {table_name} VALUES (?,?,?,?,?,?)", rows)
+
+
+@dataclass
+class ComparisonResult:
+    """Result of comparing GIQL and bedtools outputs.
+
+    Attributes:
+        match: Whether results match
+        giql_row_count: Number of rows from GIQL query
+        bedtools_row_count: Number of rows from bedtools output
+        differences: Specific differences found (if match=False)
+        comparison_metadata: Epsilon used, sort order, etc.
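+
+    Example:
+        >>> result = ComparisonResult(match=True, giql_row_count=2,
+        ...                           bedtools_row_count=2)
+        >>> bool(result)
+        True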
+ """ + + match: bool + giql_row_count: int + bedtools_row_count: int + differences: List[str] = field(default_factory=list) + comparison_metadata: dict = field(default_factory=dict) + + def __bool__(self) -> bool: + """Allow direct boolean evaluation in assertions.""" + return self.match + + def failure_message(self) -> str: + """Generate detailed failure message for test output. + + Returns: + Formatted failure message with differences + """ + if self.match: + return "✓ Results match" + + msg = [ + f"✗ Results do not match", + f" GIQL rows: {self.giql_row_count}", + f" Bedtools rows: {self.bedtools_row_count}", + ] + + if self.differences: + msg.append(" Differences:") + for diff in self.differences[:10]: # Limit to first 10 + msg.append(f" - {diff}") + if len(self.differences) > 10: + msg.append(f" ... and {len(self.differences) - 10} more") + + return "\n".join(msg) + + +@dataclass +class IntervalGeneratorConfig: + """Configuration for simulated dataset generation. + + Attributes: + chromosome_count: Number of chromosomes to generate + intervals_per_chromosome: Intervals per chromosome + min_interval_size: Minimum interval length + max_interval_size: Maximum interval length + overlap_probability: Probability of overlap (0.0-1.0) + strand_distribution: Proportions of +/-/. strands + seed: Random seed for reproducibility + """ + + chromosome_count: int = 3 + intervals_per_chromosome: int = 100 + min_interval_size: int = 100 + max_interval_size: int = 1000 + overlap_probability: float = 0.3 + strand_distribution: dict = field( + default_factory=lambda: {"+": 0.45, "-": 0.45, ".": 0.1} + ) + seed: int = 42 + + def __post_init__(self): + """Validate configuration parameters.""" + if self.chromosome_count <= 0: + raise ValueError("chromosome_count must be > 0") + if self.intervals_per_chromosome <= 0: + raise ValueError("intervals_per_chromosome must be > 0") + if self.min_interval_size < 1: + raise ValueError("min_interval_size must be >= 1") + if self.max_interval_size < self.min_interval_size: + raise ValueError("max_interval_size must be >= min_interval_size") + if not (0.0 <= self.overlap_probability <= 1.0): + raise ValueError("overlap_probability must be in [0.0, 1.0]") + if abs(sum(self.strand_distribution.values()) - 1.0) > 1e-6: + raise ValueError("strand_distribution must sum to 1.0") + + +@dataclass +class BedtoolsVersion: + """Represents bedtools version information. + + Attributes: + major: Major version number + minor: Minor version number + patch: Patch version number + raw_version_string: Original version string from bedtools + """ + + major: int + minor: int + patch: int + raw_version_string: str + + def is_compatible(self) -> bool: + """Check if version meets minimum requirement (2.30.0). + + Returns: + True if version >= 2.30.0 + """ + return (self.major, self.minor, self.patch) >= (2, 30, 0) + + def __str__(self) -> str: + """Return version as string.""" + return f"{self.major}.{self.minor}.{self.patch}" + + @classmethod + def from_string(cls, version_str: str) -> "BedtoolsVersion": + """Parse version from bedtools --version output. 
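+
+        For example, "bedtools v2.30.0" parses to major=2, minor=30, patch=0.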
+ + Args: + version_str: Version string from bedtools (e.g., "bedtools v2.30.0") + + Returns: + BedtoolsVersion instance + + Raises: + ValueError: If version string cannot be parsed + """ + match = re.search(r"v?(\d+)\.(\d+)\.(\d+)", version_str) + if not match: + raise ValueError(f"Could not parse version from: {version_str}") + major, minor, patch = map(int, match.groups()) + return cls(major, minor, patch, version_str) diff --git a/tests/integration/bedtools/utils/interval_generator.py b/tests/integration/bedtools/utils/interval_generator.py new file mode 100644 index 0000000..05df214 --- /dev/null +++ b/tests/integration/bedtools/utils/interval_generator.py @@ -0,0 +1,425 @@ +"""Interval generator for creating simulated genomic datasets. + +This module provides the IntervalGenerator class for creating test datasets +with controlled properties (overlap density, strand distribution, etc.). +""" + +import random +from typing import List +from typing import Tuple + +from .data_models import GenomicInterval +from .data_models import IntervalGeneratorConfig +from .data_models import SimulatedDataset + + +class IntervalGenerator: + """Generate simulated genomic intervals for testing. + + Provides methods for generating intervals with various patterns: + - Overlapping intervals + - Adjacent intervals + - Separated intervals + - Multi-chromosome datasets + - Strand-specific datasets + """ + + def __init__(self, config: IntervalGeneratorConfig | None = None): + """Initialize interval generator. + + Args: + config: Generator configuration (uses defaults if None) + """ + self.config = config or IntervalGeneratorConfig() + self.rng = random.Random(self.config.seed) + + def _choose_strand(self) -> str: + """Choose strand based on configured distribution. + + Returns: + Strand ('+', '-', or '.') + """ + r = self.rng.random() + cumulative = 0.0 + for strand, prob in self.config.strand_distribution.items(): + cumulative += prob + if r <= cumulative: + return strand + return "." # Fallback + + def _generate_interval_size(self) -> int: + """Generate random interval size within configured range. + + Returns: + Interval size in base pairs + """ + return self.rng.randint( + self.config.min_interval_size, self.config.max_interval_size + ) + + def generate_basic( + self, chromosome: str, count: int, max_position: int = 1000000 + ) -> List[GenomicInterval]: + """Generate basic random intervals. + + Args: + chromosome: Chromosome name + count: Number of intervals to generate + max_position: Maximum chromosome position + + Returns: + List of genomic intervals + """ + intervals = [] + for i in range(count): + size = self._generate_interval_size() + start = self.rng.randint(0, max_position - size) + end = start + size + strand = self._choose_strand() + + intervals.append( + GenomicInterval( + chrom=chromosome, + start=start, + end=end, + name=f"interval_{i}", + score=self.rng.randint(0, 1000), + strand=strand, + ) + ) + + return intervals + + def generate_dataset( + self, + name: str, + scenario_type: str = "basic", + chromosome_count: int | None = None, + intervals_per_chrom: int | None = None, + ) -> SimulatedDataset: + """Generate a complete simulated dataset. + + Args: + name: Dataset identifier + scenario_type: Type of scenario ("basic", "overlapping", etc.) 
+ chromosome_count: Number of chromosomes (uses config default if None) + intervals_per_chrom: Intervals per chromosome (uses config default if None) + + Returns: + SimulatedDataset with generated intervals + """ + chrom_count = chromosome_count or self.config.chromosome_count + interval_count = intervals_per_chrom or self.config.intervals_per_chromosome + + all_intervals = [] + for i in range(chrom_count): + chrom_name = f"chr{i + 1}" + intervals = self.generate_basic(chrom_name, interval_count) + all_intervals.extend(intervals) + + return SimulatedDataset( + name=name, + intervals=all_intervals, + scenario_type=scenario_type, + metadata={ + "chromosome_count": chrom_count, + "intervals_per_chromosome": interval_count, + "seed": self.config.seed, + "total_intervals": len(all_intervals), + }, + ) + + def generate_overlapping_scenarios( + self, chromosome: str, count: int, overlap_size: int = 50 + ) -> List[GenomicInterval]: + """Generate overlapping intervals with controlled overlap. + + Args: + chromosome: Chromosome name + count: Number of intervals to generate + overlap_size: Size of overlap between adjacent intervals + + Returns: + List of overlapping genomic intervals + """ + intervals = [] + base_size = self.config.min_interval_size + current_start = 100 + + for i in range(count): + start = current_start + end = start + base_size + strand = self._choose_strand() + + intervals.append( + GenomicInterval( + chrom=chromosome, + start=start, + end=end, + name=f"overlap_{i}", + score=self.rng.randint(0, 1000), + strand=strand, + ) + ) + + # Next interval starts before current ends (creating overlap) + current_start = end - overlap_size + + return intervals + + def generate_adjacent_scenarios( + self, chromosome: str, count: int + ) -> List[GenomicInterval]: + """Generate adjacent intervals (touching but not overlapping). + + Args: + chromosome: Chromosome name + count: Number of intervals to generate + + Returns: + List of adjacent genomic intervals + """ + intervals = [] + base_size = self.config.min_interval_size + current_start = 100 + + for i in range(count): + start = current_start + end = start + base_size + strand = self._choose_strand() + + intervals.append( + GenomicInterval( + chrom=chromosome, + start=start, + end=end, + name=f"adjacent_{i}", + score=self.rng.randint(0, 1000), + strand=strand, + ) + ) + + # Next interval starts exactly where current ends + current_start = end + + return intervals + + def generate_separated_scenarios( + self, chromosome: str, count: int, gap_size: int = 100 + ) -> List[GenomicInterval]: + """Generate separated intervals with gaps between them. + + Args: + chromosome: Chromosome name + count: Number of intervals to generate + gap_size: Size of gap between intervals + + Returns: + List of separated genomic intervals + """ + intervals = [] + base_size = self.config.min_interval_size + current_start = 100 + + for i in range(count): + start = current_start + end = start + base_size + strand = self._choose_strand() + + intervals.append( + GenomicInterval( + chrom=chromosome, + start=start, + end=end, + name=f"separated_{i}", + score=self.rng.randint(0, 1000), + strand=strand, + ) + ) + + # Next interval starts after a gap + current_start = end + gap_size + + return intervals + + def generate_multi_chromosome_scenarios( + self, + chromosome_count: int, + intervals_per_chrom: int, + scenario_func: str = "basic", + ) -> List[GenomicInterval]: + """Generate intervals across multiple chromosomes. 
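+
+        Chromosomes are named chr1..chrN; each receives intervals from the
+        scenario generator selected by scenario_func.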
+ + Args: + chromosome_count: Number of chromosomes + intervals_per_chrom: Number of intervals per chromosome + scenario_func: Scenario type ("basic", "overlapping", "adjacent", + "separated") + + Returns: + List of genomic intervals across multiple chromosomes + """ + all_intervals = [] + + for i in range(chromosome_count): + chrom_name = f"chr{i + 1}" + + if scenario_func == "overlapping": + intervals = self.generate_overlapping_scenarios( + chrom_name, intervals_per_chrom + ) + elif scenario_func == "adjacent": + intervals = self.generate_adjacent_scenarios( + chrom_name, intervals_per_chrom + ) + elif scenario_func == "separated": + intervals = self.generate_separated_scenarios( + chrom_name, intervals_per_chrom + ) + else: # basic + intervals = self.generate_basic(chrom_name, intervals_per_chrom) + + all_intervals.extend(intervals) + + return all_intervals + + def generate_same_strand_pairs( + self, chromosome: str, pair_count: int, strand: str = "+" + ) -> Tuple[List[GenomicInterval], List[GenomicInterval]]: + """Generate two sets of intervals on the same strand. + + Args: + chromosome: Chromosome name + pair_count: Number of interval pairs to generate + strand: Strand to use for all intervals ('+' or '-') + + Returns: + Tuple of (intervals_a, intervals_b) on same strand + """ + intervals_a = [] + intervals_b = [] + base_size = self.config.min_interval_size + current_start = 100 + + for i in range(pair_count): + # Interval A + start_a = current_start + end_a = start_a + base_size + intervals_a.append( + GenomicInterval( + chrom=chromosome, + start=start_a, + end=end_a, + name=f"a{i}", + score=self.rng.randint(0, 1000), + strand=strand, + ) + ) + + # Interval B - overlaps A, same strand + start_b = start_a + (base_size // 2) + end_b = start_b + base_size + intervals_b.append( + GenomicInterval( + chrom=chromosome, + start=start_b, + end=end_b, + name=f"b{i}", + score=self.rng.randint(0, 1000), + strand=strand, + ) + ) + + # Move to next region + current_start = end_b + 100 + + return intervals_a, intervals_b + + def generate_opposite_strand_pairs( + self, chromosome: str, pair_count: int + ) -> Tuple[List[GenomicInterval], List[GenomicInterval]]: + """Generate two sets of intervals on opposite strands. + + Args: + chromosome: Chromosome name + pair_count: Number of interval pairs to generate + + Returns: + Tuple of (intervals_a, intervals_b) on opposite strands + """ + intervals_a = [] + intervals_b = [] + base_size = self.config.min_interval_size + current_start = 100 + + for i in range(pair_count): + # Interval A on + strand + start_a = current_start + end_a = start_a + base_size + intervals_a.append( + GenomicInterval( + chrom=chromosome, + start=start_a, + end=end_a, + name=f"a{i}", + score=self.rng.randint(0, 1000), + strand="+", + ) + ) + + # Interval B - overlaps A, opposite strand (-) + start_b = start_a + (base_size // 2) + end_b = start_b + base_size + intervals_b.append( + GenomicInterval( + chrom=chromosome, + start=start_b, + end=end_b, + name=f"b{i}", + score=self.rng.randint(0, 1000), + strand="-", + ) + ) + + # Move to next region + current_start = end_b + 100 + + return intervals_a, intervals_b + + def generate_mixed_strand_intervals( + self, chromosome: str, count: int + ) -> List[GenomicInterval]: + """Generate intervals with mixed strand assignments. + + Args: + chromosome: Chromosome name + count: Number of intervals to generate + + Returns: + List of intervals with randomly assigned strands (+, -, .) 
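+
+        Example (illustrative; the exact strands depend on the seeded RNG):
+
+            gen = IntervalGenerator(IntervalGeneratorConfig(seed=42))
+            ivs = gen.generate_mixed_strand_intervals("chr1", 3)
+            # three fixed-size intervals separated by 50 bp gaps, each with a
+            # strand drawn uniformly from {'+', '-', '.'}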
+ """ + intervals = [] + base_size = self.config.min_interval_size + strands = ["+", "-", "."] + current_start = 100 + + for i in range(count): + start = current_start + end = start + base_size + # Randomly choose strand from +, -, . + strand = self.rng.choice(strands) + + intervals.append( + GenomicInterval( + chrom=chromosome, + start=start, + end=end, + name=f"mixed_{i}", + score=self.rng.randint(0, 1000), + strand=strand, + ) + ) + + current_start = end + 50 # Small gap + + return intervals diff --git a/tests/test_cluster.py b/tests/test_cluster.py new file mode 100644 index 0000000..c359608 --- /dev/null +++ b/tests/test_cluster.py @@ -0,0 +1,441 @@ +"""Tests for CLUSTER and MERGE operations.""" + +import pytest + +from giql import GIQLEngine + + +@pytest.fixture +def cluster_test_data_csv(tmp_path): + """Create sample data for cluster testing.""" + csv_content = """ + id,chromosome,start_pos,end_pos,name + 1,chr1,100,200,f1 + 2,chr1,180,250,f2 + 3,chr1,250,500,f3 + 4,chr1,501,1000,f4 + 5,chr2,100,200,f5 + 6,chr2,300,400,f6 + """ + csv_path = tmp_path / "features.csv" + csv_path.write_text(csv_content.strip()) + return str(csv_path) + + +@pytest.fixture +def stranded_test_data_csv(tmp_path): + """Create stranded data for cluster testing.""" + csv_content = """ + id,chromosome,start_pos,end_pos,strand,name + 1,chr1,100,200,+,f1 + 2,chr1,180,250,+,f2 + 3,chr1,200,300,-,f3 + 4,chr1,250,350,-,f4 + 5,chr1,400,500,+,f5 + """ + csv_path = tmp_path / "stranded_features.csv" + csv_path.write_text(csv_content.strip()) + return str(csv_path) + + +@pytest.fixture +def duckdb_cluster_engine(cluster_test_data_csv): + """DuckDB engine with cluster test data loaded.""" + engine = GIQLEngine(target_dialect="duckdb", verbose=True) + engine.load_csv("features", cluster_test_data_csv) + engine.register_table_schema( + "features", + { + "id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "name": "VARCHAR", + }, + genomic_column="interval", + ) + yield engine + engine.close() + + +@pytest.fixture +def duckdb_stranded_engine(stranded_test_data_csv): + """DuckDB engine with stranded test data loaded.""" + engine = GIQLEngine(target_dialect="duckdb", verbose=True) + engine.load_csv("stranded_features", stranded_test_data_csv) + engine.register_table_schema( + "stranded_features", + { + "id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "BIGINT", + "end_pos": "BIGINT", + "strand": "VARCHAR", + "name": "VARCHAR", + }, + genomic_column="interval", + strand_col="strand", + ) + yield engine + engine.close() + + +class TestCluster: + """Tests for CLUSTER window function.""" + + def test_basic_cluster(self, duckdb_cluster_engine, to_df): + """Test basic CLUSTER operation.""" + result = to_df( + duckdb_cluster_engine.execute(""" + SELECT + id, + chromosome, + start_pos, + end_pos, + name, + CLUSTER(interval) AS cluster_id + FROM features + ORDER BY chromosome, start_pos + """) + ) + + # Expected clusters: + # chr1: features 1,2,3 are cluster 1 (overlapping/bookended) + # chr1: feature 4 is cluster 2 (gap at 501) + # chr2: feature 5 is cluster 1 + # chr2: feature 6 is cluster 2 (gap at 300) + + assert len(result) == 6 + + # Check chr1 clusters + chr1_results = result[result["chromosome"] == "chr1"] + assert chr1_results.iloc[0]["cluster_id"] == chr1_results.iloc[1]["cluster_id"] + assert chr1_results.iloc[1]["cluster_id"] == chr1_results.iloc[2]["cluster_id"] + assert chr1_results.iloc[2]["cluster_id"] != chr1_results.iloc[3]["cluster_id"] + + # Check chr2 clusters + 
chr2_results = result[result["chromosome"] == "chr2"] + assert chr2_results.iloc[0]["cluster_id"] != chr2_results.iloc[1]["cluster_id"] + + def test_cluster_with_distance(self, duckdb_cluster_engine, to_df): + """Test CLUSTER with distance parameter.""" + result = to_df( + duckdb_cluster_engine.execute(""" + SELECT + id, + chromosome, + start_pos, + end_pos, + name, + CLUSTER(interval, 100) AS cluster_id + FROM features + ORDER BY chromosome, start_pos + """) + ) + + # With distance=100, chr1 features 1,2,3,4 should all be in same cluster + # (gap of 1bp at position 501 is within 100bp tolerance) + chr1_results = result[result["chromosome"] == "chr1"] + cluster_ids = chr1_results["cluster_id"].tolist() + assert len(set(cluster_ids)) == 1 # All in same cluster + + def test_stranded_cluster(self, duckdb_stranded_engine, to_df): + """Test CLUSTER with stranded=true.""" + result = to_df( + duckdb_stranded_engine.execute(""" + SELECT + id, + chromosome, + start_pos, + end_pos, + strand, + name, + CLUSTER(interval, stranded=true) AS cluster_id + FROM stranded_features + ORDER BY chromosome, start_pos + """) + ) + + # Features should cluster only within the same strand: + # + strand: f1,f2 overlap -> cluster 1, f5 is separate -> cluster 2 + # - strand: f3,f4 overlap -> cluster 1 + # Note: cluster_id numbering restarts for each partition (strand) + + assert len(result) == 5 + + # Extract features + f1 = result[result["id"] == 1].iloc[0] + f2 = result[result["id"] == 2].iloc[0] + f3 = result[result["id"] == 3].iloc[0] + f4 = result[result["id"] == 4].iloc[0] + f5 = result[result["id"] == 5].iloc[0] + + # Check that f1 and f2 (both +, overlapping) have same cluster_id + assert f1["cluster_id"] == f2["cluster_id"] + assert f1["strand"] == "+" + assert f2["strand"] == "+" + + # Check that f3 and f4 (both -, overlapping) have same cluster_id + assert f3["cluster_id"] == f4["cluster_id"] + assert f3["strand"] == "-" + assert f4["strand"] == "-" + + # Check that f5 (+ strand, separated) has different cluster from f1/f2 + assert f5["cluster_id"] != f1["cluster_id"] + assert f5["strand"] == "+" + + # Verify stranded clustering works: compare with non-stranded + result_nonstranded = to_df( + duckdb_stranded_engine.execute(""" + SELECT + id, + CLUSTER(interval) AS cluster_id + FROM stranded_features + ORDER BY id + """) + ) + + # Without stranded, f1-f4 should all be in same cluster (overlapping) + ns_f1 = result_nonstranded[result_nonstranded["id"] == 1].iloc[0] + ns_f2 = result_nonstranded[result_nonstranded["id"] == 2].iloc[0] + ns_f3 = result_nonstranded[result_nonstranded["id"] == 3].iloc[0] + ns_f4 = result_nonstranded[result_nonstranded["id"] == 4].iloc[0] + + assert ns_f1["cluster_id"] == ns_f2["cluster_id"] + assert ns_f2["cluster_id"] == ns_f3["cluster_id"] + assert ns_f3["cluster_id"] == ns_f4["cluster_id"] + + def test_cluster_in_cte(self, duckdb_cluster_engine, to_df): + """Test CLUSTER operation inside a CTE.""" + result = to_df( + duckdb_cluster_engine.execute(""" + WITH clustered_features AS ( + SELECT + id, + chromosome, + start_pos, + end_pos, + name, + CLUSTER(interval) AS cluster_id + FROM features + ) + SELECT * + FROM clustered_features + WHERE cluster_id = 1 + ORDER BY chromosome, start_pos + """) + ) + + # Should return features in cluster 1 from each chromosome + assert len(result) > 0 + assert all("cluster_id" in row for _, row in result.iterrows()) + + def test_cluster_in_cte_with_aggregation(self, duckdb_cluster_engine, to_df): + """Test CLUSTER in CTE with aggregation in outer 
query.""" + result = to_df( + duckdb_cluster_engine.execute(""" + WITH clustered_features AS ( + SELECT + chromosome, + start_pos, + end_pos, + CLUSTER(interval) AS cluster_id + FROM features + ) + SELECT + chromosome, + cluster_id, + COUNT(*) as interval_count, + MIN(start_pos) as min_start, + MAX(end_pos) as max_end + FROM clustered_features + GROUP BY chromosome, cluster_id + ORDER BY chromosome, cluster_id + """) + ) + + # chr1 should have 2 clusters, chr2 should have 2 clusters + assert len(result) == 4 + + chr1_results = result[result["chromosome"] == "chr1"] + assert len(chr1_results) == 2 + # First cluster should have 3 intervals (f1, f2, f3) + assert chr1_results.iloc[0]["interval_count"] == 3 + # Second cluster should have 1 interval (f4) + assert chr1_results.iloc[1]["interval_count"] == 1 + + +class TestMerge: + """Tests for MERGE aggregate function.""" + + def test_basic_merge(self, duckdb_cluster_engine, to_df): + """Test basic MERGE operation.""" + result = to_df( + duckdb_cluster_engine.execute(""" + SELECT MERGE(interval) + FROM features + """) + ) + + # Expected merged intervals: + # chr1: features 1,2,3 merge into [100, 500] + # chr1: feature 4 stays as [501, 1000] + # chr2: feature 5 stays as [100, 200] + # chr2: feature 6 stays as [300, 400] + + assert len(result) == 4 + + # Check chr1 merged intervals + chr1_results = result[result["chromosome"] == "chr1"].sort_values("start_pos") + assert len(chr1_results) == 2 + assert chr1_results.iloc[0]["start_pos"] == 100 + assert chr1_results.iloc[0]["end_pos"] == 500 + assert chr1_results.iloc[1]["start_pos"] == 501 + assert chr1_results.iloc[1]["end_pos"] == 1000 + + # Check chr2 stays separate + chr2_results = result[result["chromosome"] == "chr2"].sort_values("start_pos") + assert len(chr2_results) == 2 + assert chr2_results.iloc[0]["start_pos"] == 100 + assert chr2_results.iloc[0]["end_pos"] == 200 + assert chr2_results.iloc[1]["start_pos"] == 300 + assert chr2_results.iloc[1]["end_pos"] == 400 + + def test_merge_with_distance(self, duckdb_cluster_engine, to_df): + """Test MERGE with distance parameter.""" + result = to_df( + duckdb_cluster_engine.execute(""" + SELECT MERGE(interval, 100) + FROM features + """) + ) + + # With distance=100, chr1 features 1-4 should merge into one interval + chr1_results = result[result["chromosome"] == "chr1"] + assert len(chr1_results) == 1 + assert chr1_results.iloc[0]["start_pos"] == 100 + assert chr1_results.iloc[0]["end_pos"] == 1000 + + def test_merge_with_aggregation(self, duckdb_cluster_engine, to_df): + """Test MERGE with additional aggregation columns.""" + result = to_df( + duckdb_cluster_engine.execute(""" + SELECT MERGE(interval), COUNT(*) as feature_count + FROM features + """) + ) + + # chr1 should have 2 merged intervals with counts + chr1_results = result[result["chromosome"] == "chr1"].sort_values("start_pos") + assert len(chr1_results) == 2 + assert chr1_results.iloc[0]["feature_count"] == 3 # f1, f2, f3 merged + assert chr1_results.iloc[1]["feature_count"] == 1 # f4 alone + + def test_stranded_merge(self, duckdb_stranded_engine, to_df): + """Test MERGE with stranded=true.""" + result = to_df( + duckdb_stranded_engine.execute(""" + SELECT MERGE(interval, stranded=true) + FROM stranded_features + """) + ) + + # + strand: f1,f2 merge -> [100,250], f5 stays -> [400,500] + # - strand: f3,f4 merge -> [200,350] + assert len(result) == 3 + + plus_strand = result[result["strand"] == "+"].sort_values("start_pos") + assert len(plus_strand) == 2 + assert 
plus_strand.iloc[0]["start_pos"] == 100 + assert plus_strand.iloc[0]["end_pos"] == 250 + assert plus_strand.iloc[1]["start_pos"] == 400 + assert plus_strand.iloc[1]["end_pos"] == 500 + + minus_strand = result[result["strand"] == "-"] + assert len(minus_strand) == 1 + assert minus_strand.iloc[0]["start_pos"] == 200 + assert minus_strand.iloc[0]["end_pos"] == 350 + + def test_merge_in_cte(self, duckdb_cluster_engine, to_df): + """Test MERGE operation inside a CTE.""" + result = to_df( + duckdb_cluster_engine.execute(""" + WITH merged_intervals AS ( + SELECT MERGE(interval) + FROM features + ) + SELECT * + FROM merged_intervals + ORDER BY chromosome, start_pos + """) + ) + + # Should have same results as basic merge + assert len(result) == 4 + + chr1_results = result[result["chromosome"] == "chr1"].sort_values("start_pos") + assert len(chr1_results) == 2 + assert chr1_results.iloc[0]["start_pos"] == 100 + assert chr1_results.iloc[0]["end_pos"] == 500 + + def test_merge_in_cte_with_aggregation_and_filter( + self, duckdb_cluster_engine, to_df + ): + """Test MERGE in CTE with aggregation and filtering in outer query.""" + result = to_df( + duckdb_cluster_engine.execute(""" + WITH merged_intervals AS ( + SELECT + MERGE(interval), + COUNT(*) as interval_count + FROM features + ) + SELECT * + FROM merged_intervals + WHERE interval_count > 1 + ORDER BY chromosome, start_pos + """) + ) + + # Only chr1's first merged interval has count > 1 (3 intervals merged) + assert len(result) == 1 + assert result.iloc[0]["chromosome"] == "chr1" + assert result.iloc[0]["start_pos"] == 100 + assert result.iloc[0]["end_pos"] == 500 + assert result.iloc[0]["interval_count"] == 3 + + def test_merge_in_cte_with_distance_and_aggregation( + self, duckdb_cluster_engine, to_df + ): + """Test MERGE with distance parameter in CTE with aggregation.""" + result = to_df( + duckdb_cluster_engine.execute(""" + WITH merged_intervals AS ( + SELECT + MERGE(interval, 100), + COUNT(*) as interval_count, + AVG(id) as avg_id + FROM features + ) + SELECT * + FROM merged_intervals + WHERE interval_count >= 2 + ORDER BY chromosome, start_pos + """) + ) + + # With distance=100, chr1 all 4 features merge, chr2 features also merge + # (gap between chr2 features is exactly 100bp) + assert len(result) == 2 + + # Check chr1 merged interval + chr1_result = result[result["chromosome"] == "chr1"].iloc[0] + assert chr1_result["interval_count"] == 4 + assert chr1_result["start_pos"] == 100 + assert chr1_result["end_pos"] == 1000 + + # Check chr2 merged interval + chr2_result = result[result["chromosome"] == "chr2"].iloc[0] + assert chr2_result["interval_count"] == 2 + assert chr2_result["start_pos"] == 100 + assert chr2_result["end_pos"] == 400 diff --git a/tests/test_distance_parsing.py b/tests/test_distance_parsing.py new file mode 100644 index 0000000..f4ca201 --- /dev/null +++ b/tests/test_distance_parsing.py @@ -0,0 +1,60 @@ +"""Parser tests for DISTANCE operator syntax. + +Tests verify that the GIQL parser correctly recognizes and parses +DISTANCE function calls with various argument patterns. 
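+
+Both argument shapes exercised below must parse into a GIQLDistance node:
+
+    DISTANCE(a.interval, b.interval)
+    DISTANCE(a.interval, 'chr1:100-200')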
+""" + +from sqlglot import parse_one + +from giql.dialect import GIQLDialect +from giql.expressions import GIQLDistance + + +class TestDistanceParsing: + """Tests for parsing DISTANCE function syntax.""" + + def test_parse_distance_with_column_to_column(self): + """ + GIVEN a GIQL query with DISTANCE(a.interval, b.interval) + WHEN parsing the query + THEN should create GIQLDistance AST node with correct arguments + """ + sql = "SELECT DISTANCE(a.interval, b.interval) FROM features_a a, features_b b" + + ast = parse_one(sql, dialect=GIQLDialect) + + # Find the DISTANCE expression in the SELECT clause + select_expr = ast.expressions[0] + assert isinstance(select_expr, GIQLDistance), ( + f"Expected GIQLDistance node, got {type(select_expr)}" + ) + + # Verify arguments are present + assert select_expr.this is not None, "Missing first argument (this)" + assert select_expr.args.get("expression") is not None, ( + "Missing second argument (expression)" + ) + + def test_parse_distance_with_literal_range(self): + """ + GIVEN a GIQL query with DISTANCE(a.interval, 'chr1:100-200') + WHEN parsing the query + THEN should create GIQLDistance node with column and literal range + """ + sql = "SELECT DISTANCE(a.interval, 'chr1:100-200') FROM features a" + + ast = parse_one(sql, dialect=GIQLDialect) + + # Find the DISTANCE expression + select_expr = ast.expressions[0] + assert isinstance(select_expr, GIQLDistance), ( + f"Expected GIQLDistance node, got {type(select_expr)}" + ) + + # Verify both arguments present + assert select_expr.this is not None, "Missing first argument" + assert select_expr.args.get("expression") is not None, "Missing second argument" + + # Second argument should be a literal string + second_arg = select_expr.args["expression"] + assert "chr1" in str(second_arg).lower(), "Expected chromosome in literal range" diff --git a/tests/test_distance_transpilation.py b/tests/test_distance_transpilation.py new file mode 100644 index 0000000..7b1d79e --- /dev/null +++ b/tests/test_distance_transpilation.py @@ -0,0 +1,72 @@ +"""Transpilation tests for DISTANCE operator SQL generation. + +Tests verify that DISTANCE() is correctly transpiled to SQL CASE expressions +across different SQL dialects (DuckDB, SQLite, PostgreSQL). 
+""" + +from sqlglot import parse_one + +from giql.dialect import GIQLDialect +from giql.generators import BaseGIQLGenerator +from giql.generators import GIQLDuckDBGenerator + + +class TestDistanceTranspilation: + """Tests for DISTANCE SQL generation across dialects.""" + + def test_distance_transpilation_duckdb(self): + """ + GIVEN a GIQL query with DISTANCE() + WHEN transpiling to DuckDB SQL + THEN should generate complete CASE expression with distance logic + """ + sql = """ + SELECT DISTANCE(a.interval, b.interval) as dist + FROM features_a a CROSS JOIN features_b b + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = GIQLDuckDBGenerator() + output = generator.generate(ast) + + expected = """SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" THEN 0 WHEN a."end_pos" <= b."start_pos" THEN (b."start_pos" - a."end_pos") ELSE (a."start_pos" - b."end_pos") END AS dist FROM features_a AS a CROSS JOIN features_b AS b""" + + assert output == expected, f"Expected:\n{expected}\n\nGot:\n{output}" + + def test_distance_transpilation_sqlite(self): + """ + GIVEN a GIQL query with DISTANCE() + WHEN transpiling to SQLite SQL + THEN should generate complete compatible SQL CASE expression + """ + sql = """ + SELECT DISTANCE(a.interval, b.interval) as dist + FROM features_a a, features_b b + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator() + output = generator.generate(ast) + + expected = """SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" THEN 0 WHEN a."end_pos" <= b."start_pos" THEN (b."start_pos" - a."end_pos") ELSE (a."start_pos" - b."end_pos") END AS dist FROM features_a AS a, features_b AS b""" + + assert output == expected, f"Expected:\n{expected}\n\nGot:\n{output}" + + def test_distance_transpilation_postgres(self): + """ + GIVEN a GIQL query with DISTANCE() + WHEN transpiling to PostgreSQL SQL + THEN should generate complete compatible SQL CASE expression + """ + sql = """ + SELECT DISTANCE(a.interval, b.interval) as dist + FROM features_a a CROSS JOIN features_b b + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator() + output = generator.generate(ast) + + expected = """SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" THEN 0 WHEN a."end_pos" <= b."start_pos" THEN (b."start_pos" - a."end_pos") ELSE (a."start_pos" - b."end_pos") END AS dist FROM features_a AS a CROSS JOIN features_b AS b""" + + assert output == expected, f"Expected:\n{expected}\n\nGot:\n{output}" diff --git a/tests/test_distance_udf.py b/tests/test_distance_udf.py new file mode 100644 index 0000000..3048c33 --- /dev/null +++ b/tests/test_distance_udf.py @@ -0,0 +1,394 @@ +"""Unit tests for DISTANCE operator SQL generation and behavior. + +Tests verify the distance calculation logic by checking the transpiled +SQL output and executed results. 
+""" + +import duckdb +import pytest +from sqlglot import parse_one + +from giql.dialect import GIQLDialect +from giql.generators import BaseGIQLGenerator + + +class TestDistanceCalculation: + """Unit tests for basic distance calculation logic.""" + + def test_overlapping_intervals_return_zero(self): + """ + GIVEN two overlapping genomic intervals + WHEN DISTANCE() is calculated between them + THEN the distance should be 0 + """ + # Create a test query with DISTANCE() + sql = """ + SELECT + DISTANCE(a.interval, b.interval) as distance + FROM + (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos) a + CROSS JOIN + (SELECT 'chr1' as chromosome, 150 as start_pos, 250 as end_pos) b + """ + + # Parse and generate SQL + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator() + output_sql = generator.generate(ast) + + # Verify SQL contains CASE expression logic for overlaps + assert "CASE" in output_sql + assert "WHEN" in output_sql + + # Execute with DuckDB to verify behavior + conn = duckdb.connect(":memory:") + result = conn.execute(output_sql).fetchone() + + # Overlapping intervals should return distance = 0 + assert result[0] == 0, ( + f"Expected distance 0 for overlapping intervals, got {result[0]}" + ) + + conn.close() + + def test_non_overlapping_intervals_return_positive_distance(self): + """ + GIVEN two non-overlapping genomic intervals with a gap + WHEN DISTANCE() is calculated between them + THEN the distance should be a positive integer (gap size) + """ + # Interval A: chr1:100-200 + # Interval B: chr1:300-400 + # Gap: 300 - 200 = 100 base pairs + sql = """ + SELECT + DISTANCE(a.interval, b.interval) as distance + FROM + (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos) a + CROSS JOIN + (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos) b + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator() + output_sql = generator.generate(ast) + + conn = duckdb.connect(":memory:") + result = conn.execute(output_sql).fetchone() + + # Gap distance should be 100 + assert result[0] == 100, f"Expected distance 100, got {result[0]}" + + conn.close() + + def test_different_chromosomes_return_null(self): + """ + GIVEN two intervals on different chromosomes + WHEN DISTANCE() is calculated between them + THEN the distance should be NULL + """ + sql = """ + SELECT + DISTANCE(a.interval, b.interval) as distance + FROM + (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos) a + CROSS JOIN + (SELECT 'chr2' as chromosome, 150 as start_pos, 250 as end_pos) b + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator() + output_sql = generator.generate(ast) + + conn = duckdb.connect(":memory:") + result = conn.execute(output_sql).fetchone() + + # Different chromosomes should return NULL + assert result[0] is None, ( + f"Expected NULL for different chromosomes, got {result[0]}" + ) + + conn.close() + + def test_adjacent_bookended_intervals_return_zero(self): + """ + GIVEN two adjacent intervals where end_a == start_b (bookended) + WHEN DISTANCE() is calculated between them + THEN the distance should be 0 (following bedtools convention) + """ + # Interval A: chr1:100-200 + # Interval B: chr1:200-300 (starts exactly where A ends) + sql = """ + SELECT + DISTANCE(a.interval, b.interval) as distance + FROM + (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos) a + CROSS JOIN + (SELECT 'chr1' as chromosome, 200 as start_pos, 300 as end_pos) b + """ + + ast = parse_one(sql, 
dialect=GIQLDialect) + generator = BaseGIQLGenerator() + output_sql = generator.generate(ast) + + conn = duckdb.connect(":memory:") + result = conn.execute(output_sql).fetchone() + + # Bookended intervals should return distance = 0 + assert result[0] == 0, ( + f"Expected distance 0 for bookended intervals, got {result[0]}" + ) + + conn.close() + + def test_zero_width_intervals_point_features(self): + """ + GIVEN a zero-width interval (point feature) and a regular interval + WHEN DISTANCE() is calculated + THEN the distance should be calculated correctly + """ + # Point feature at chr1:150 (start=150, end=150) + # Interval at chr1:300-400 + # Distance: 300 - 150 = 150 + sql = """ + SELECT + DISTANCE(a.interval, b.interval) as distance + FROM + (SELECT 'chr1' as chromosome, 150 as start_pos, 150 as end_pos) a + CROSS JOIN + (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos) b + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator() + output_sql = generator.generate(ast) + + conn = duckdb.connect(":memory:") + result = conn.execute(output_sql).fetchone() + + # Distance should be 150 + assert result[0] == 150, f"Expected distance 150, got {result[0]}" + + conn.close() + + +class TestStrandedDistance: + """Tests for stranded distance calculation.""" + + def test_stranded_same_strand_plus(self): + """ + GIVEN two intervals on the same chromosome and same '+' strand + WHEN DISTANCE() is calculated with stranded=true + THEN the distance should be calculated normally (positive value) + """ + sql = """ + SELECT + DISTANCE(a.interval, b.interval, stranded=true) as distance + FROM + (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '+' as strand) a + CROSS JOIN + (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '+' as strand) b + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator() + output_sql = generator.generate(ast) + + conn = duckdb.connect(":memory:") + result = conn.execute(output_sql).fetchone() + + # Gap distance should be 100 (positive, since strand is '+') + assert result[0] == 100, f"Expected distance 100, got {result[0]}" + + conn.close() + + def test_stranded_same_strand_minus(self): + """ + GIVEN two intervals on the same chromosome and same '-' strand + WHEN DISTANCE() is calculated with stranded=true + THEN the distance should be negative (multiplied by -1) + """ + sql = """ + SELECT + DISTANCE(a.interval, b.interval, stranded=true) as distance + FROM + (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '-' as strand) a + CROSS JOIN + (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '-' as strand) b + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator() + output_sql = generator.generate(ast) + + conn = duckdb.connect(":memory:") + result = conn.execute(output_sql).fetchone() + + # Gap distance should be -100 (negative, since first interval strand is '-') + assert result[0] == -100, f"Expected distance -100, got {result[0]}" + + conn.close() + + def test_stranded_different_strands_calculates_distance(self): + """ + GIVEN two intervals on different strands ('+' and '-') + WHEN DISTANCE() is calculated with stranded=true + THEN the distance should be calculated normally (positive, since first interval is '+') + """ + sql = """ + SELECT + DISTANCE(a.interval, b.interval, stranded=true) as distance + FROM + (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '+' as strand) a + CROSS JOIN + (SELECT 'chr1' as chromosome, 
300 as start_pos, 400 as end_pos, '-' as strand) b + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator() + output_sql = generator.generate(ast) + + conn = duckdb.connect(":memory:") + result = conn.execute(output_sql).fetchone() + + # Different strands should still calculate distance, sign based on first interval + assert result[0] == 100, f"Expected distance 100, got {result[0]}" + + conn.close() + + def test_stranded_different_strands_minus_first(self): + """ + GIVEN two intervals on different strands ('-' first, then '+') + WHEN DISTANCE() is calculated with stranded=true + THEN the distance should be negative (based on first interval's strand) + """ + sql = """ + SELECT + DISTANCE(a.interval, b.interval, stranded=true) as distance + FROM + (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '-' as strand) a + CROSS JOIN + (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '+' as strand) b + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator() + output_sql = generator.generate(ast) + + conn = duckdb.connect(":memory:") + result = conn.execute(output_sql).fetchone() + + # Distance should be negative since first interval is '-' + assert result[0] == -100, f"Expected distance -100, got {result[0]}" + + conn.close() + + def test_stranded_dot_strand_returns_null(self): + """ + GIVEN intervals with '.' strand (unspecified) + WHEN DISTANCE() is calculated with stranded=true + THEN the distance should be NULL + """ + sql = """ + SELECT + DISTANCE(a.interval, b.interval, stranded=true) as distance + FROM + (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '.' as strand) a + CROSS JOIN + (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '.' as strand) b + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator() + output_sql = generator.generate(ast) + + conn = duckdb.connect(":memory:") + result = conn.execute(output_sql).fetchone() + + # '.' strand should return NULL + assert result[0] is None, f"Expected NULL for '.' strand, got {result[0]}" + + conn.close() + + def test_stranded_question_mark_strand_returns_null(self): + """ + GIVEN intervals with '?' strand (unknown) + WHEN DISTANCE() is calculated with stranded=true + THEN the distance should be NULL + """ + sql = """ + SELECT + DISTANCE(a.interval, b.interval, stranded=true) as distance + FROM + (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '?' as strand) a + CROSS JOIN + (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '+' as strand) b + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator() + output_sql = generator.generate(ast) + + conn = duckdb.connect(":memory:") + result = conn.execute(output_sql).fetchone() + + # '?' strand should return NULL + assert result[0] is None, f"Expected NULL for '?' 
strand, got {result[0]}" + + conn.close() + + def test_stranded_null_strand_returns_null(self): + """ + GIVEN intervals with NULL strand + WHEN DISTANCE() is calculated with stranded=true + THEN the distance should be NULL + """ + sql = """ + SELECT + DISTANCE(a.interval, b.interval, stranded=true) as distance + FROM + (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, NULL as strand) a + CROSS JOIN + (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '+' as strand) b + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator() + output_sql = generator.generate(ast) + + conn = duckdb.connect(":memory:") + result = conn.execute(output_sql).fetchone() + + # NULL strand should return NULL + assert result[0] is None, f"Expected NULL for NULL strand, got {result[0]}" + + conn.close() + + def test_stranded_overlapping_intervals_minus_strand(self): + """ + GIVEN two overlapping intervals on '-' strand + WHEN DISTANCE() is calculated with stranded=true + THEN the distance should be 0 (overlaps have distance 0 regardless of strand) + """ + sql = """ + SELECT + DISTANCE(a.interval, b.interval, stranded=true) as distance + FROM + (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '-' as strand) a + CROSS JOIN + (SELECT 'chr1' as chromosome, 150 as start_pos, 250 as end_pos, '-' as strand) b + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator() + output_sql = generator.generate(ast) + + conn = duckdb.connect(":memory:") + result = conn.execute(output_sql).fetchone() + + # Overlapping intervals should return 0 + assert result[0] == 0, ( + f"Expected distance 0 for overlapping intervals, got {result[0]}" + ) + + conn.close() diff --git a/tests/test_engine.py b/tests/test_engine.py new file mode 100644 index 0000000..2cff3a1 --- /dev/null +++ b/tests/test_engine.py @@ -0,0 +1,480 @@ +import tempfile + +from hypothesis import given +from hypothesis import settings +from hypothesis import strategies as st + +from giql import GIQLEngine + + +class TestGIQLEngine: + def test_engine_initialization_duckdb(self): + """ + GIVEN GIQLEngine with duckdb dialect + WHEN initializing engine + THEN should create connection successfully + """ + engine = GIQLEngine(target_dialect="duckdb") + assert engine.target_dialect == "duckdb" + assert engine.conn is not None + engine.close() + + def test_engine_initialization_sqlite(self): + """ + GIVEN GIQLEngine with sqlite dialect + WHEN initializing engine + THEN should create connection successfully + """ + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp: + engine = GIQLEngine(target_dialect="sqlite", db_path=tmp.name) + assert engine.target_dialect == "sqlite" + assert engine.conn is not None + engine.close() + + def test_engine_context_manager(self): + """ + GIVEN GIQLEngine used as context manager + WHEN exiting context + THEN should close connection automatically + """ + with GIQLEngine() as engine: + assert engine.conn is not None + + def test_load_csv_and_query_duckdb(self, tmp_path, to_df): + """ + GIVEN CSV data loaded into DuckDB + WHEN executing GIQL query + THEN should return correct results + """ + # Create sample CSV + csv_content = """id,chromosome,start_pos,end_pos,ref,alt +1,chr1,1500,1600,A,T +2,chr1,10500,10600,G,C +3,chr2,500,600,C,G +""" + csv_path = tmp_path / "variants.csv" + csv_path.write_text(csv_content) + + with GIQLEngine(target_dialect="duckdb") as engine: + engine.load_csv("variants", str(csv_path)) + + # Query using INTERSECTS + 
cursor = engine.execute( + "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" + ) + result = to_df(cursor) + + assert len(result) == 1 + assert result.iloc[0]["id"] == 1 + + def test_load_csv_and_query_sqlite(self, tmp_path, to_df): + """ + GIVEN CSV data loaded into SQLite + WHEN executing GIQL query + THEN should return correct results + """ + # Create sample CSV + csv_content = """id,chromosome,start_pos,end_pos,ref,alt +1,chr1,1500,1600,A,T +2,chr1,10500,10600,G,C +3,chr2,500,600,C,G +""" + csv_path = tmp_path / "variants.csv" + csv_path.write_text(csv_content) + + with GIQLEngine(target_dialect="sqlite") as engine: + engine.load_csv("variants", str(csv_path)) + + # Query using INTERSECTS + result = to_df( + engine.execute( + "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" + ) + ) + + assert len(result) == 1 + assert result.iloc[0]["id"] == 1 + + def test_intersects_any_query(self, tmp_path, to_df): + """ + GIVEN variants data + WHEN querying with INTERSECTS ANY + THEN should return variants overlapping any range + """ + csv_content = """id,chromosome,start_pos,end_pos +1,chr1,1500,1600 +2,chr1,10500,10600 +3,chr2,500,600 +""" + csv_path = tmp_path / "variants.csv" + csv_path.write_text(csv_content) + + with GIQLEngine(target_dialect="duckdb") as engine: + engine.load_csv("variants", str(csv_path)) + + result = to_df( + engine.execute( + "SELECT * FROM variants " + "WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:400-700')" + ) + ) + + assert len(result) == 2 + assert set(result["id"]) == {1, 3} + + def test_contains_query(self, tmp_path, to_df): + """ + GIVEN variants data + WHEN querying with CONTAINS + THEN should return variants containing the point + """ + csv_content = """id,chromosome,start_pos,end_pos +1,chr1,1500,1600 +2,chr1,10500,10600 +""" + csv_path = tmp_path / "variants.csv" + csv_path.write_text(csv_content) + + with GIQLEngine(target_dialect="duckdb") as engine: + engine.load_csv("variants", str(csv_path)) + + result = to_df( + engine.execute( + "SELECT * FROM variants WHERE interval CONTAINS 'chr1:1550'" + ) + ) + + assert len(result) == 1 + assert result.iloc[0]["id"] == 1 + + def test_within_query(self, tmp_path, to_df): + """ + GIVEN variants data + WHEN querying with WITHIN + THEN should return variants within the range + """ + csv_content = """id,chromosome,start_pos,end_pos +1,chr1,1500,1600 +2,chr1,10500,10600 +3,chr1,15000,15100 +""" + csv_path = tmp_path / "variants.csv" + csv_path.write_text(csv_content) + + with GIQLEngine(target_dialect="duckdb") as engine: + engine.load_csv("variants", str(csv_path)) + + result = to_df( + engine.execute( + "SELECT * FROM variants WHERE interval WITHIN 'chr1:1000-11000'" + ) + ) + + assert len(result) == 2 + assert set(result["id"]) == {1, 2} + + def test_verbose_mode(self, tmp_path, to_df): + """ + GIVEN engine with verbose mode + WHEN executing query + THEN should print transpiled SQL + """ + csv_content = """id,chromosome,start_pos,end_pos +1,chr1,1500,1600 +""" + csv_path = tmp_path / "variants.csv" + csv_path.write_text(csv_content) + + with GIQLEngine(target_dialect="duckdb", verbose=True) as engine: + engine.load_csv("variants", str(csv_path)) + result = to_df( + engine.execute( + "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" + ) + ) + assert len(result) == 1 + + @given( + chrom_col=st.sampled_from(["chromosome", "chr", "chrom", "contig", "seqname"]), + start_col=st.sampled_from(["start_pos", "start", "begin", "pos", "chromStart"]), + 
end_col=st.sampled_from(["end_pos", "end", "stop", "chromEnd"]), + strand_col=st.sampled_from(["strand", "str", "orientation", "direction"]), + ) + def test_custom_genomic_columns( + self, chrom_col, start_col, end_col, strand_col, to_df + ): + """ + GIVEN CSV data with custom genomic column names + WHEN registering schema with custom column mappings + THEN queries should work correctly with any valid column names + """ + # Create temporary directory and CSV with custom column names + with tempfile.TemporaryDirectory() as tmp_dir: + csv_content = f"""id,{chrom_col},{start_col},{end_col},{strand_col},name +1,chr1,1500,1600,+,variant1 +2,chr1,10500,10600,-,variant2 +3,chr2,500,600,+,variant3 +4,chr1,1400,1700,+,variant4 +""" + csv_path = f"{tmp_dir}/custom_variants.csv" + with open(csv_path, "w") as f: + f.write(csv_content) + + with GIQLEngine(target_dialect="duckdb", verbose=False) as engine: + engine.load_csv("variants", csv_path) + + # Register schema with custom column names + engine.register_table_schema( + "variants", + { + "id": "INTEGER", + chrom_col: "VARCHAR", + start_col: "BIGINT", + end_col: "BIGINT", + strand_col: "VARCHAR", + "name": "VARCHAR", + }, + genomic_column="interval", + chrom_col=chrom_col, + start_col=start_col, + end_col=end_col, + strand_col=strand_col, + ) + + # Test INTERSECTS query + result = to_df( + engine.execute( + "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" + ) + ) + assert len(result) == 2 + assert set(result["id"]) == {1, 4} + + # Test CLUSTER query (uses genomic columns internally) + result = to_df( + engine.execute( + "SELECT *, CLUSTER(interval) AS cluster_id FROM variants ORDER BY id" + ) + ) + assert len(result) == 4 + # Variants 1 and 4 should cluster together (overlapping on chr1) + assert result.iloc[0]["cluster_id"] == result.iloc[3]["cluster_id"] + # Variant 2 should be in different cluster (no overlap) + assert result.iloc[1]["cluster_id"] != result.iloc[0]["cluster_id"] + + # Test stranded CLUSTER query + result = to_df( + engine.execute("""SELECT *, CLUSTER(interval, stranded=TRUE) AS cluster_id + FROM variants ORDER BY id""") + ) + assert len(result) == 4 + # With stranded=TRUE, variants 1 and 4 should cluster together (both + and overlapping) + assert result.iloc[0]["cluster_id"] == result.iloc[3]["cluster_id"] + # Note: cluster_ids are independent per (chromosome, strand) partition + # So variants on different strands CAN have the same cluster_id number + assert "cluster_id" in result.columns + + # Test MERGE query + result = to_df(engine.execute("SELECT MERGE(interval) FROM variants")) + # Should merge overlapping intervals + assert len(result) >= 1 + + @given( + # Table 1 (variants) column names + v_chrom_col=st.sampled_from(["chromosome", "chr", "chrom"]), + v_start_col=st.sampled_from(["start_pos", "start", "begin"]), + v_end_col=st.sampled_from(["end_pos", "end", "stop"]), + # Table 2 (features) column names (use different names to ensure they're distinct) + f_chrom_col=st.sampled_from(["seqname", "contig", "chr_name"]), + f_start_col=st.sampled_from(["pos", "chromStart", "feature_start"]), + f_end_col=st.sampled_from(["chromEnd", "feature_end", "terminus"]), + ) + @settings(deadline=None) + def test_join_with_different_schemas( + self, + v_chrom_col, + v_start_col, + v_end_col, + f_chrom_col, + f_start_col, + f_end_col, + to_df, + ): + """ + GIVEN two tables with different custom genomic column schemas + WHEN joining them using INTERSECTS + THEN queries should correctly use each table's custom column names + 
""" + with tempfile.TemporaryDirectory() as tmp_dir: + # Create variants table CSV + variants_csv = f"""id,{v_chrom_col},{v_start_col},{v_end_col},name +1,chr1,1500,1600,var1 +2,chr1,10500,10600,var2 +3,chr2,500,600,var3 +""" + variants_path = f"{tmp_dir}/variants.csv" + with open(variants_path, "w") as f: + f.write(variants_csv) + + # Create features table CSV with DIFFERENT column names + features_csv = f"""id,{f_chrom_col},{f_start_col},{f_end_col},type +1,chr1,1000,2000,exon +2,chr1,10000,11000,intron +3,chr2,400,700,promoter +""" + features_path = f"{tmp_dir}/features.csv" + with open(features_path, "w") as f: + f.write(features_csv) + + with GIQLEngine(target_dialect="duckdb", verbose=False) as engine: + # Load both tables + engine.load_csv("variants", variants_path) + engine.load_csv("features", features_path) + + # Register schemas with different column names + engine.register_table_schema( + "variants", + { + "id": "INTEGER", + v_chrom_col: "VARCHAR", + v_start_col: "BIGINT", + v_end_col: "BIGINT", + "name": "VARCHAR", + }, + genomic_column="interval", + chrom_col=v_chrom_col, + start_col=v_start_col, + end_col=v_end_col, + ) + + engine.register_table_schema( + "features", + { + "id": "INTEGER", + f_chrom_col: "VARCHAR", + f_start_col: "BIGINT", + f_end_col: "BIGINT", + "type": "VARCHAR", + }, + genomic_column="region", + chrom_col=f_chrom_col, + start_col=f_start_col, + end_col=f_end_col, + ) + + # Test JOIN with INTERSECTS on both tables + result = to_df( + engine.execute(""" + SELECT v.name, f.type + FROM variants v + JOIN features f ON v.interval INTERSECTS f.region + ORDER BY v.id + """) + ) + + # Variant 1 (chr1:1500-1600) intersects Feature 1 (chr1:1000-2000) + # Variant 2 (chr1:10500-10600) intersects Feature 2 (chr1:10000-11000) + # Variant 3 (chr2:500-600) intersects Feature 3 (chr2:400-700) + assert len(result) == 3 + assert list(result["name"]) == ["var1", "var2", "var3"] + assert list(result["type"]) == ["exon", "intron", "promoter"] + + # Test LEFT JOIN to verify schema resolution works + result = to_df( + engine.execute(""" + SELECT v.id, v.name, f.type + FROM variants v + LEFT JOIN features f ON v.interval INTERSECTS f.region + WHERE v.id = 1 + """) + ) + assert len(result) == 1 + assert result.iloc[0]["name"] == "var1" + assert result.iloc[0]["type"] == "exon" + + # Test WHERE clause with INTERSECTS on specific table + result = to_df( + engine.execute(""" + SELECT v.id, v.name + FROM variants v, features f + WHERE v.interval INTERSECTS f.region + AND v.interval INTERSECTS 'chr1:1000-2000' + """) + ) + # Only variant 1 intersects both feature and the specified range + assert len(result) == 1 + assert result.iloc[0]["name"] == "var1" + + def test_transpile_returns_sql_string(self): + """ + GIVEN GIQLEngine with a GIQL query + WHEN calling transpile() + THEN should return SQL string without executing it + """ + with GIQLEngine(target_dialect="duckdb") as engine: + sql = engine.transpile( + "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" + ) + + assert isinstance(sql, str) + assert len(sql) > 0 + assert "SELECT" in sql.upper() + # Should contain genomic comparison logic + assert "chromosome" in sql or "start_pos" in sql or "end_pos" in sql + + def test_transpile_different_dialects(self): + """ + GIVEN GIQLEngine with different SQL dialects + WHEN calling transpile() + THEN should return SQL appropriate for each dialect + """ + query = "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" + + for dialect in ["duckdb", "sqlite"]: + with 
GIQLEngine(target_dialect=dialect) as engine: + sql = engine.transpile(query) + assert isinstance(sql, str) + assert len(sql) > 0 + assert "SELECT" in sql.upper() + + def test_transpile_verbose_mode(self, tmp_path, capsys): + """ + GIVEN GIQLEngine with verbose mode enabled + WHEN calling transpile() + THEN should print transpilation details + """ + with GIQLEngine(target_dialect="duckdb", verbose=True) as engine: + sql = engine.transpile( + "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" + ) + + captured = capsys.readouterr() + assert "Target Dialect: duckdb" in captured.out + assert "Original GIQL:" in captured.out + assert "Transpiled SQL:" in captured.out + assert isinstance(sql, str) + + def test_execute_uses_transpile(self, tmp_path, to_df): + """ + GIVEN GIQLEngine after refactoring + WHEN calling execute() + THEN should use transpile() internally and execute correctly + """ + csv_content = """id,chromosome,start_pos,end_pos +1,chr1,1500,1600 +2,chr1,10500,10600 +""" + csv_path = tmp_path / "variants.csv" + csv_path.write_text(csv_content) + + with GIQLEngine(target_dialect="duckdb") as engine: + engine.load_csv("variants", str(csv_path)) + + # execute() should internally call transpile() + cursor = engine.execute( + "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" + ) + result = to_df(cursor) + + assert len(result) == 1 + assert result.iloc[0]["id"] == 1 diff --git a/tests/test_generator.py b/tests/test_generator.py new file mode 100644 index 0000000..71620d7 --- /dev/null +++ b/tests/test_generator.py @@ -0,0 +1,165 @@ +from sqlglot import parse_one + +from giql.dialect import GIQLDialect +from giql.generators import BaseGIQLGenerator +from giql.generators import GIQLDuckDBGenerator + + +class TestBaseGenerator: + def test_generate_simple_intersects(self): + """ + GIVEN a SQL query with INTERSECTS operator + WHEN generating SQL code + THEN should produce standard SQL with range conditions + """ + sql = "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" + ast = parse_one(sql, dialect=GIQLDialect) + + generator = BaseGIQLGenerator() + output = generator.generate(ast) + + # Should expand to chromosome/position checks + assert "\"chromosome\" = 'chr1'" in output + assert '"start_pos" < 2000' in output + assert '"end_pos" > 1000' in output + + def test_generate_contains(self): + """ + GIVEN a SQL query with CONTAINS operator + WHEN generating SQL code + THEN should produce containment conditions + """ + sql = "SELECT * FROM variants WHERE interval CONTAINS 'chr1:1500'" + ast = parse_one(sql, dialect=GIQLDialect) + + generator = BaseGIQLGenerator() + output = generator.generate(ast) + + # Point query: start <= point < end + assert "\"chromosome\" = 'chr1'" in output + assert '"start_pos" <= 1500' in output + assert '"end_pos" > 1500' in output + + def test_generate_within(self): + """ + GIVEN a SQL query with WITHIN operator + WHEN generating SQL code + THEN should produce within conditions + """ + sql = "SELECT * FROM variants WHERE interval WITHIN 'chr1:1000-5000'" + ast = parse_one(sql, dialect=GIQLDialect) + + generator = BaseGIQLGenerator() + output = generator.generate(ast) + + # Left within right: start1 >= start2 AND end1 <= end2 + assert "\"chromosome\" = 'chr1'" in output + assert '"start_pos" >= 1000' in output + assert '"end_pos" <= 5000' in output + + def test_generate_intersects_any(self): + """ + GIVEN a SQL query with INTERSECTS ANY operator + WHEN generating SQL code + THEN should produce OR conditions + """ + sql 
= ( + "SELECT * FROM v WHERE interval INTERSECTS ANY(" + "'chr1:1000-2000', 'chr1:5000-6000')" + ) + ast = parse_one(sql, dialect=GIQLDialect) + + generator = BaseGIQLGenerator() + output = generator.generate(ast) + + # Should have two conditions combined with OR + assert " OR " in output + assert output.count("\"chromosome\" = 'chr1'") == 2 + + def test_generate_intersects_all(self): + """ + GIVEN a SQL query with INTERSECTS ALL operator + WHEN generating SQL code + THEN should produce AND conditions + """ + sql = ( + "SELECT * FROM v WHERE interval INTERSECTS ALL(" + "'chr1:1000-2000', 'chr1:1500-1800')" + ) + ast = parse_one(sql, dialect=GIQLDialect) + + generator = BaseGIQLGenerator() + output = generator.generate(ast) + + # Should have two conditions combined with AND + assert " AND " in output + assert output.count("\"chromosome\" = 'chr1'") == 2 + + def test_generate_with_table_alias(self): + """ + GIVEN a SQL query with table alias + WHEN generating SQL code + THEN should properly qualify column names + """ + sql = "SELECT * FROM variants v WHERE v.interval INTERSECTS 'chr1:1000-2000'" + ast = parse_one(sql, dialect=GIQLDialect) + + generator = BaseGIQLGenerator() + output = generator.generate(ast) + + # Should use table alias in generated conditions + assert 'v."chromosome"' in output + assert 'v."start_pos"' in output + assert 'v."end_pos"' in output + + def test_contains_range_query(self): + """ + GIVEN a SQL query with CONTAINS on a range (not a point) + WHEN generating SQL code + THEN should use range containment logic + """ + sql = "SELECT * FROM variants WHERE interval CONTAINS 'chr1:1500-2000'" + ast = parse_one(sql, dialect=GIQLDialect) + + generator = BaseGIQLGenerator() + output = generator.generate(ast) + + # Range containment: start1 <= start2 AND end1 >= end2 + assert "\"chromosome\" = 'chr1'" in output + assert '"start_pos" <= 1500' in output + assert '"end_pos" >= 2000' in output + + def test_invalid_range_string(self): + """ + GIVEN a SQL query with invalid range format + WHEN generating SQL code + THEN should raise ValueError + """ + sql = "SELECT * FROM variants WHERE interval INTERSECTS 'invalid'" + ast = parse_one(sql, dialect=GIQLDialect) + + generator = BaseGIQLGenerator() + try: + _ = generator.generate(ast) + assert False, "Should have raised ValueError" + except ValueError as e: + assert "Could not parse genomic range" in str(e) + + +class TestDuckDBGenerator: + def test_duckdb_generator_basic(self): + """ + GIVEN a SQL query with INTERSECTS operator + WHEN using DuckDB generator + THEN should produce valid DuckDB SQL + """ + sql = "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" + ast = parse_one(sql, dialect=GIQLDialect) + + generator = GIQLDuckDBGenerator() + output = generator.generate(ast) + + # Should still have the basic range conditions + assert "\"chromosome\" = 'chr1'" in output + assert '"start_pos" < 2000' in output + assert '"end_pos" > 1000' in output diff --git a/tests/test_nearest_edge_cases.py b/tests/test_nearest_edge_cases.py new file mode 100644 index 0000000..31556da --- /dev/null +++ b/tests/test_nearest_edge_cases.py @@ -0,0 +1,633 @@ +"""Edge case tests for NEAREST operator. + +Tests verify correct handling of boundary conditions, error cases, +and unusual inputs for the NEAREST operator. 
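+
+The lateral-join form exercised throughout looks like this (k varies per test):
+
+    SELECT peaks.peak_id, nearest.gene_name, nearest.distance
+    FROM peaks
+    CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest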
+""" + +import pytest +from hypothesis import assume +from hypothesis import given +from hypothesis import strategies as st + +from giql import GIQLEngine + + +@pytest.fixture +def duckdb_engine_with_edge_case_data(): + """Create DuckDB engine with data designed for edge case testing.""" + engine = GIQLEngine(target_dialect="duckdb") + + # Create peaks table + engine.conn.execute(""" + CREATE TABLE peaks ( + peak_id INTEGER, + chromosome VARCHAR, + start_pos INTEGER, + end_pos INTEGER + ) + """) + + # Create genes table + engine.conn.execute(""" + CREATE TABLE genes ( + gene_id INTEGER, + gene_name VARCHAR, + chromosome VARCHAR, + start_pos INTEGER, + end_pos INTEGER + ) + """) + + # Insert test data + # Peak 1: chr1:1000-1100 + # Peak 2: chr2:5000-5100 (different chromosome, no genes) + # Peak 3: chr1:10000-10100 + engine.conn.execute(""" + INSERT INTO peaks VALUES + (1, 'chr1', 1000, 1100), + (2, 'chr2', 5000, 5100), + (3, 'chr1', 10000, 10100) + """) + + # Genes with specific distance relationships + # GENE_A and GENE_B are both 500bp from Peak 1 (tie scenario) + # GENE_C overlaps Peak 1 (distance=0) + # GENE_D, GENE_E, GENE_F on chr1 but far from Peak 3 + engine.conn.execute(""" + INSERT INTO genes VALUES + (1, 'GENE_A', 'chr1', 1600, 1700), + (2, 'GENE_B', 'chr1', 400, 500), + (3, 'GENE_C', 'chr1', 1050, 1150), + (4, 'GENE_D', 'chr1', 10500, 10600), + (5, 'GENE_E', 'chr1', 11000, 11100), + (6, 'GENE_F', 'chr1', 12000, 12100) + """) + + # Register schema + engine.register_table_schema( + "peaks", + { + "peak_id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "INTEGER", + "end_pos": "INTEGER", + }, + genomic_column="interval", + ) + engine.register_table_schema( + "genes", + { + "gene_id": "INTEGER", + "gene_name": "VARCHAR", + "chromosome": "VARCHAR", + "start_pos": "INTEGER", + "end_pos": "INTEGER", + }, + genomic_column="interval", + ) + + return engine + + +class TestNearestEdgeCases: + """Edge case tests for NEAREST operator.""" + + def test_k_equals_zero(self, duckdb_engine_with_edge_case_data): + """ + GIVEN a NEAREST query with k=0 + WHEN executing the query + THEN should return no results (LIMIT 0) + """ + engine = duckdb_engine_with_edge_case_data + + cursor = engine.execute(""" + SELECT + peaks.peak_id, + nearest.gene_name + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=0) AS nearest + WHERE peaks.peak_id = 1 + """) + + rows = cursor.fetchall() + assert len(rows) == 0, "k=0 should return no results" + + def test_ties_multiple_features_same_distance( + self, duckdb_engine_with_edge_case_data + ): + """ + GIVEN multiple genes at the same distance from a peak + WHEN querying for k=1 nearest + THEN should return at least 1 result (behavior may vary for ties) + """ + engine = duckdb_engine_with_edge_case_data + + cursor = engine.execute(""" + SELECT + peaks.peak_id, + nearest.gene_name, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest + WHERE peaks.peak_id = 1 + ORDER BY nearest.distance, nearest.gene_name + """) + + rows = cursor.fetchall() + + # Should have at least 1 result + assert len(rows) >= 1, "Should return at least one result for k=1" + + # All results should be at the same distance (ties) + # Note: GENE_A and GENE_B are both 500bp away, GENE_C overlaps (0bp) + # So the closest should be GENE_C at distance 0 + assert rows[0][1] == "GENE_C", ( + f"Closest gene should be GENE_C (overlapping), got {rows[0][1]}" + ) + assert rows[0][2] == 0, f"Distance should be 0 (overlap), got 
{rows[0][2]}" + + def test_empty_result_set_different_chromosome( + self, duckdb_engine_with_edge_case_data + ): + """ + GIVEN a peak on a chromosome with no genes + WHEN querying for nearest genes + THEN should return empty result set + """ + engine = duckdb_engine_with_edge_case_data + + cursor = engine.execute(""" + SELECT + peaks.peak_id, + nearest.gene_name, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=10) AS nearest + WHERE peaks.peak_id = 2 + """) + + rows = cursor.fetchall() + + # Peak 2 is on chr2, but all genes are on chr1 + # Should return empty result set + assert len(rows) == 0, ( + "Should return empty result for peak on chromosome with no genes" + ) + + def test_overlapping_features_distance_zero(self, duckdb_engine_with_edge_case_data): + """ + GIVEN a gene that overlaps a peak + WHEN querying for nearest genes + THEN should return distance=0 for overlapping gene + """ + engine = duckdb_engine_with_edge_case_data + + cursor = engine.execute(""" + SELECT + peaks.peak_id, + nearest.gene_name, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=5) AS nearest + WHERE peaks.peak_id = 1 + ORDER BY nearest.distance + """) + + rows = cursor.fetchall() + + # GENE_C (chr1:1050-1150) overlaps Peak 1 (chr1:1000-1100) + assert len(rows) > 0, "Should find genes" + + # First result should be the overlapping gene with distance 0 + assert rows[0][1] == "GENE_C", ( + f"First result should be GENE_C (overlapping), got {rows[0][1]}" + ) + assert rows[0][2] == 0, ( + f"Distance should be 0 for overlapping gene, got {rows[0][2]}" + ) + + def test_missing_reference_in_standalone_mode( + self, duckdb_engine_with_edge_case_data + ): + """ + GIVEN a standalone NEAREST query without reference parameter + WHEN parsing/executing the query + THEN should raise an error (reference is required in standalone mode) + """ + engine = duckdb_engine_with_edge_case_data + + # Standalone mode (FROM NEAREST(...)) without reference parameter + # This should fail because we can't determine the reference point + with pytest.raises(Exception) as exc_info: + engine.execute(""" + SELECT * + FROM NEAREST(genes, k=3) + """) + + # Should get an error about missing reference + # The exact error message may vary, but it should mention reference + error_msg = str(exc_info.value).lower() + # Could be a ValueError, AttributeError, or SQL error depending on where it fails + # Just verify it fails - the specific error type will be improved in T065 + + def test_missing_target_table_in_schema(self, duckdb_engine_with_edge_case_data): + """ + GIVEN a NEAREST query referencing a non-existent table + WHEN executing the query + THEN should raise an error about missing table + """ + engine = duckdb_engine_with_edge_case_data + + # Query references 'nonexistent_table' which doesn't exist + with pytest.raises(Exception) as exc_info: + engine.execute(""" + SELECT * + FROM peaks + CROSS JOIN LATERAL NEAREST(nonexistent_table, reference=peaks.interval, k=3) AS nearest + """) + + # Should get an error about the missing table + error_msg = str(exc_info.value).lower() + # DuckDB should raise an error about the table not existing + + def test_invalid_literal_range_format(self, duckdb_engine_with_edge_case_data): + """ + GIVEN a NEAREST query with invalid literal range format + WHEN parsing/executing the query + THEN should raise an error about invalid range format + """ + engine = duckdb_engine_with_edge_case_data + + # Invalid range formats + # Note: 
"chr1:1000" is valid (point format), so not included + invalid_ranges = [ + "chr1:not-a-number", # Non-numeric coordinates + "invalid-format", # No colon separator + "chr1:2000-1000", # End before start (start >= end) + ] + + for invalid_range in invalid_ranges: + with pytest.raises(ValueError) as exc_info: + engine.execute(f""" + SELECT * + FROM NEAREST(genes, reference='{invalid_range}', k=3) + """) + + # Should get a ValueError about invalid range format + error_msg = str(exc_info.value).lower() + assert "invalid" in error_msg or "must be less" in error_msg, ( + f"Error message should mention invalid format or start/end issue: {exc_info.value}" + ) + + def test_nearest_with_additional_where_clause( + self, duckdb_engine_with_edge_case_data + ): + """ + GIVEN a NEAREST query with additional WHERE clause filtering + WHEN executing the query + THEN should apply both NEAREST and WHERE filters + """ + engine = duckdb_engine_with_edge_case_data + + cursor = engine.execute(""" + SELECT + peaks.peak_id, + nearest.gene_name, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=10) AS nearest + WHERE peaks.peak_id = 1 AND nearest.distance < 600 + ORDER BY nearest.distance + """) + + rows = cursor.fetchall() + + # Should find genes within 600bp of Peak 1 + # GENE_C overlaps (0bp) and GENE_A/GENE_B are 500bp away + assert len(rows) >= 1, "Should find genes within 600bp" + + # All returned genes should have distance < 600 + for row in rows: + assert row[2] < 600, f"All distances should be < 600bp, got {row[2]}" + + def test_nearest_with_cte(self, duckdb_engine_with_edge_case_data): + """ + GIVEN a NEAREST query using a CTE for multiple query points + WHEN executing the query + THEN should correctly handle NEAREST within CTE + """ + engine = duckdb_engine_with_edge_case_data + + cursor = engine.execute(""" + WITH selected_peaks AS ( + SELECT * FROM peaks WHERE peak_id IN (1, 3) + ) + SELECT + selected_peaks.peak_id, + nearest.gene_name, + nearest.distance + FROM selected_peaks + CROSS JOIN LATERAL NEAREST(genes, reference=selected_peaks.interval, k=2) AS nearest + ORDER BY selected_peaks.peak_id, nearest.distance + """) + + rows = cursor.fetchall() + + # Should find 2 nearest genes for each of 2 peaks = up to 4 results + assert len(rows) > 0, "Should find genes for peaks in CTE" + + # Check that we have results for both peaks + peak_ids = set(row[0] for row in rows) + assert 1 in peak_ids, "Should have results for peak 1" + assert 3 in peak_ids, "Should have results for peak 3" + + def test_k_greater_than_total_features_all_chromosomes( + self, duckdb_engine_with_edge_case_data + ): + """ + GIVEN k greater than total number of features on the same chromosome + WHEN querying for nearest genes + THEN should return all available features on that chromosome + """ + engine = duckdb_engine_with_edge_case_data + + cursor = engine.execute(""" + SELECT + peaks.peak_id, + nearest.gene_name + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1000) AS nearest + WHERE peaks.peak_id = 1 + """) + + rows = cursor.fetchall() + + # Peak 1 is on chr1, and there are 6 genes on chr1 + # Should return all 6 genes, not 1000 + assert len(rows) == 6, f"Should return all 6 genes on chr1, got {len(rows)}" + + def test_ties_with_k_greater_than_one(self, duckdb_engine_with_edge_case_data): + """ + GIVEN multiple features at the same distance (ties) + WHEN querying with k that includes tied features + THEN should handle ties consistently + """ + engine = 
duckdb_engine_with_edge_case_data + + cursor = engine.execute(""" + SELECT + peaks.peak_id, + nearest.gene_name, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest + WHERE peaks.peak_id = 1 + ORDER BY nearest.distance, nearest.gene_name + """) + + rows = cursor.fetchall() + + # Peak 1 has: + # - GENE_C at 0bp (overlap) + # - GENE_A and GENE_B both at 500bp (tie) + # With k=3, should get all 3 + + assert len(rows) == 3, f"Should return 3 nearest genes, got {len(rows)}" + + # First should be GENE_C (distance 0) + assert rows[0][1] == "GENE_C" + assert rows[0][2] == 0 + + # Next two should be GENE_A and GENE_B (distance 500, order may vary) + gene_names_at_500 = [rows[1][1], rows[2][1]] + assert set(gene_names_at_500) == {"GENE_A", "GENE_B"}, ( + f"Should have GENE_A and GENE_B at 500bp" + ) + assert rows[1][2] == 500 + assert rows[2][2] == 500 + + +class TestNearestPropertyBased: + """Property-based tests for NEAREST operator using Hypothesis.""" + + @given( + start1=st.integers(min_value=0, max_value=100000), + length1=st.integers(min_value=1, max_value=1000), + start2=st.integers(min_value=0, max_value=100000), + length2=st.integers(min_value=1, max_value=1000), + ) + def test_distance_non_negative_for_non_overlapping( + self, start1, length1, start2, length2 + ): + """ + PROPERTY: Distance between non-overlapping intervals is always non-negative + GIVEN two non-overlapping genomic intervals + WHEN calculating distance using NEAREST + THEN distance should be >= 0 + """ + end1 = start1 + length1 + end2 = start2 + length2 + + # Skip if intervals overlap + assume(not (start1 < end2 and end1 > start2)) + + engine = GIQLEngine(target_dialect="duckdb") + + # Create tables + engine.conn.execute(""" + CREATE TABLE ref (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER) + """) + engine.conn.execute(""" + CREATE TABLE target (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER) + """) + + # Insert test data + engine.conn.execute(f""" + INSERT INTO ref VALUES (1, 'chr1', {start1}, {end1}) + """) + engine.conn.execute(f""" + INSERT INTO target VALUES (1, 'chr1', {start2}, {end2}) + """) + + # Register schema + engine.register_table_schema( + "ref", + { + "id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "INTEGER", + "end_pos": "INTEGER", + }, + genomic_column="interval", + ) + engine.register_table_schema( + "target", + { + "id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "INTEGER", + "end_pos": "INTEGER", + }, + genomic_column="interval", + ) + + # Query for nearest + cursor = engine.execute(""" + SELECT nearest.distance + FROM ref + CROSS JOIN LATERAL NEAREST(target, reference=ref.interval, k=1) AS nearest + """) + + rows = cursor.fetchall() + if len(rows) > 0: + distance = rows[0][0] + assert distance >= 0, f"Distance should be non-negative, got {distance}" + + @given( + start1=st.integers(min_value=0, max_value=100000), + length1=st.integers(min_value=1, max_value=1000), + overlap_start=st.integers(min_value=1, max_value=500), + ) + def test_overlapping_intervals_have_zero_distance( + self, start1, length1, overlap_start + ): + """ + PROPERTY: Overlapping intervals have distance 0 + GIVEN two genomic intervals that overlap + WHEN calculating distance using NEAREST + THEN distance should be 0 + """ + end1 = start1 + length1 + # Create overlapping interval + start2 = start1 + overlap_start + end2 = start2 + length1 + + # Ensure they actually overlap + assume(start1 < end2 and end1 > start2) 
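+        # With start2 = start1 + overlap_start and end2 = start2 + length1,
+        # the intervals overlap exactly when overlap_start < length1, so the
+        # assume() above only discards draws where overlap_start >= length1.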
+ + engine = GIQLEngine(target_dialect="duckdb") + + # Create tables + engine.conn.execute(""" + CREATE TABLE ref (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER) + """) + engine.conn.execute(""" + CREATE TABLE target (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER) + """) + + # Insert test data + engine.conn.execute(f""" + INSERT INTO ref VALUES (1, 'chr1', {start1}, {end1}) + """) + engine.conn.execute(f""" + INSERT INTO target VALUES (1, 'chr1', {start2}, {end2}) + """) + + # Register schema + engine.register_table_schema( + "ref", + { + "id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "INTEGER", + "end_pos": "INTEGER", + }, + genomic_column="interval", + ) + engine.register_table_schema( + "target", + { + "id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "INTEGER", + "end_pos": "INTEGER", + }, + genomic_column="interval", + ) + + # Query for nearest + cursor = engine.execute(""" + SELECT nearest.distance + FROM ref + CROSS JOIN LATERAL NEAREST(target, reference=ref.interval, k=1) AS nearest + """) + + rows = cursor.fetchall() + assert len(rows) > 0, "Should find overlapping interval" + distance = rows[0][0] + assert distance == 0, ( + f"Overlapping intervals should have distance 0, got {distance}" + ) + + @given( + k=st.integers(min_value=1, max_value=10), + n_features=st.integers(min_value=0, max_value=15), + ) + def test_k_parameter_returns_at_most_k_results(self, k, n_features): + """ + PROPERTY: k parameter limits results to at most k features + GIVEN k parameter and n available features + WHEN querying for k nearest + THEN should return min(k, n) results + """ + engine = GIQLEngine(target_dialect="duckdb") + + # Create tables + engine.conn.execute(""" + CREATE TABLE ref (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER) + """) + engine.conn.execute(""" + CREATE TABLE target (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER) + """) + + # Insert reference point + engine.conn.execute(""" + INSERT INTO ref VALUES (1, 'chr1', 1000, 1100) + """) + + # Insert n_features target features + for i in range(n_features): + # Spread features out to avoid ties + start = 2000 + (i * 500) + end = start + 100 + engine.conn.execute(f""" + INSERT INTO target VALUES ({i}, 'chr1', {start}, {end}) + """) + + # Register schema + engine.register_table_schema( + "ref", + { + "id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "INTEGER", + "end_pos": "INTEGER", + }, + genomic_column="interval", + ) + engine.register_table_schema( + "target", + { + "id": "INTEGER", + "chromosome": "VARCHAR", + "start_pos": "INTEGER", + "end_pos": "INTEGER", + }, + genomic_column="interval", + ) + + # Query for k nearest + cursor = engine.execute(f""" + SELECT COUNT(*) + FROM ref + CROSS JOIN LATERAL NEAREST(target, reference=ref.interval, k={k}) AS nearest + """) + + rows = cursor.fetchall() + count = rows[0][0] + + # Should return at most k results, but not more than available features + expected_count = min(k, n_features) + assert count == expected_count, ( + f"Expected {expected_count} results (min({k}, {n_features})), got {count}" + ) diff --git a/tests/test_nearest_parsing.py b/tests/test_nearest_parsing.py new file mode 100644 index 0000000..38a5039 --- /dev/null +++ b/tests/test_nearest_parsing.py @@ -0,0 +1,198 @@ +"""Parser tests for NEAREST operator syntax. + +Tests verify that the GIQL parser correctly recognizes and parses +NEAREST function calls with various argument patterns. 
+""" + +from sqlglot import parse_one + +from giql.dialect import GIQLDialect +from giql.expressions import GIQLNearest + + +class TestNearestParsing: + """Tests for parsing NEAREST function syntax.""" + + def test_parse_nearest_basic_syntax(self): + """ + GIVEN a GIQL query with NEAREST(genes, k=3) + WHEN parsing the query + THEN should create GIQLNearest AST node with correct arguments + """ + sql = "SELECT * FROM peaks CROSS JOIN LATERAL NEAREST(genes, k=3)" + + ast = parse_one(sql, dialect=GIQLDialect) + + # Find the NEAREST expression in the JOIN clause + # Navigate: Select -> joins[0] -> this (which should be NEAREST) + joins = ast.args.get("joins") + assert joins is not None, "Expected JOIN clause" + + join = joins[0] + lateral_expr = join.this + + # For LATERAL, the function is nested inside + if hasattr(lateral_expr, "this"): + nearest_expr = lateral_expr.this + else: + nearest_expr = lateral_expr + + assert isinstance(nearest_expr, GIQLNearest), ( + f"Expected GIQLNearest node, got {type(nearest_expr)}" + ) + + # Verify target table argument + assert nearest_expr.this is not None, "Missing target table argument (this)" + + # Verify k parameter + k_param = nearest_expr.args.get("k") + assert k_param is not None, "Missing k parameter" + + def test_parse_nearest_with_literal_reference(self): + """ + GIVEN a GIQL query with NEAREST(genes, reference='chr1:1000-2000', k=3) + WHEN parsing the query + THEN should create GIQLNearest node with literal reference parameter + """ + sql = "SELECT * FROM NEAREST(genes, reference='chr1:1000-2000', k=3)" + + ast = parse_one(sql, dialect=GIQLDialect) + + # Navigate to NEAREST function in FROM clause + # When a function is in FROM, sqlglot wraps it in a Table expression + from_clause = ast.args.get("from_") + table_expr = from_clause.this + + # The NEAREST expression should be nested in the Table's 'this' + nearest_expr = table_expr.this if hasattr(table_expr, "this") else table_expr + + assert isinstance(nearest_expr, GIQLNearest), ( + f"Expected GIQLNearest node, got {type(nearest_expr)}" + ) + + # Verify reference parameter exists + reference = nearest_expr.args.get("reference") + assert reference is not None, "Missing reference parameter" + + # Verify k parameter + k_param = nearest_expr.args.get("k") + assert k_param is not None, "Missing k parameter" + + def test_parse_nearest_with_max_distance(self): + """ + GIVEN a GIQL query with NEAREST(genes, k=5, max_distance=100000) + WHEN parsing the query + THEN should parse max_distance parameter correctly + """ + sql = "SELECT * FROM peaks CROSS JOIN LATERAL NEAREST(genes, k=5, max_distance=100000)" + + ast = parse_one(sql, dialect=GIQLDialect) + + # Navigate to NEAREST + joins = ast.args.get("joins") + join = joins[0] + lateral_expr = join.this + + if hasattr(lateral_expr, "this"): + nearest_expr = lateral_expr.this + else: + nearest_expr = lateral_expr + + assert isinstance(nearest_expr, GIQLNearest), f"Expected GIQLNearest node" + + # Verify max_distance parameter + max_distance = nearest_expr.args.get("max_distance") + assert max_distance is not None, "Missing max_distance parameter" + + def test_parse_nearest_with_stranded(self): + """ + GIVEN a GIQL query with NEAREST(genes, k=3, stranded=true) + WHEN parsing the query + THEN should parse stranded parameter correctly + """ + sql = "SELECT * FROM peaks CROSS JOIN LATERAL NEAREST(genes, k=3, stranded=true)" + + ast = parse_one(sql, dialect=GIQLDialect) + + # Navigate to NEAREST + joins = ast.args.get("joins") + join = joins[0] + lateral_expr = 
join.this + + if hasattr(lateral_expr, "this"): + nearest_expr = lateral_expr.this + else: + nearest_expr = lateral_expr + + assert isinstance(nearest_expr, GIQLNearest), f"Expected GIQLNearest node" + + # Verify stranded parameter + stranded = nearest_expr.args.get("stranded") + assert stranded is not None, "Missing stranded parameter" + + def test_parse_nearest_all_parameters(self): + """ + GIVEN a GIQL query with all NEAREST parameters + WHEN parsing the query + THEN should parse all parameters correctly + """ + sql = """ + SELECT * FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=50000, + stranded=true + ) + """ + + ast = parse_one(sql, dialect=GIQLDialect) + + # Navigate to NEAREST + joins = ast.args.get("joins") + join = joins[0] + lateral_expr = join.this + + if hasattr(lateral_expr, "this"): + nearest_expr = lateral_expr.this + else: + nearest_expr = lateral_expr + + assert isinstance(nearest_expr, GIQLNearest), f"Expected GIQLNearest node" + + # Verify all parameters exist + assert nearest_expr.this is not None, "Missing target table" + assert nearest_expr.args.get("reference") is not None, "Missing reference" + assert nearest_expr.args.get("k") is not None, "Missing k" + assert nearest_expr.args.get("max_distance") is not None, "Missing max_distance" + assert nearest_expr.args.get("stranded") is not None, "Missing stranded" + + def test_parse_nearest_with_strand_notation(self): + """ + GIVEN a GIQL query with literal reference using strand notation + WHEN parsing 'chr1:1000-2000:+' format + THEN should parse the strand-annotated range correctly + """ + sql = "SELECT * FROM NEAREST(genes, reference='chr1:1000-2000:+', k=3)" + + ast = parse_one(sql, dialect=GIQLDialect) + + # Navigate to NEAREST function in FROM clause + from_clause = ast.args.get("from_") + table_expr = from_clause.this + + # The NEAREST expression should be nested in the Table's 'this' + nearest_expr = table_expr.this if hasattr(table_expr, "this") else table_expr + + assert isinstance(nearest_expr, GIQLNearest), ( + f"Expected GIQLNearest node, got {type(nearest_expr)}" + ) + + # Verify reference parameter exists and contains strand notation + reference = nearest_expr.args.get("reference") + assert reference is not None, "Missing reference parameter" + + # Verify k parameter + k_param = nearest_expr.args.get("k") + assert k_param is not None, "Missing k parameter" diff --git a/tests/test_nearest_transpilation.py b/tests/test_nearest_transpilation.py new file mode 100644 index 0000000..0ff8abb --- /dev/null +++ b/tests/test_nearest_transpilation.py @@ -0,0 +1,267 @@ +"""Transpilation tests for NEAREST operator SQL generation. + +Tests verify that NEAREST() is correctly transpiled to dialect-specific SQL +(LATERAL joins for PostgreSQL/DuckDB, window functions for SQLite). 
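+
+(For the MVP the SQLite tests below also assert LATERAL syntax; the
+window-function lowering is deferred, as noted in the individual tests.)
+
+The DuckDB assertions roughly expect the following shape -- a sketch
+inferred from the test expectations, not the literal generated SQL, with
+the CASE-based distance expression elided:
+
+    SELECT * FROM peaks
+    CROSS JOIN LATERAL (
+        SELECT genes.*, <CASE ... END> AS distance
+        FROM genes
+        WHERE genes.chromosome = peaks.chromosome
+        ORDER BY distance
+        LIMIT 3
+    ) AS nearest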
+""" + +import pytest +from sqlglot import parse_one + +from giql.dialect import GIQLDialect +from giql.generators import BaseGIQLGenerator +from giql.generators import GIQLDuckDBGenerator +from giql.schema import ColumnInfo +from giql.schema import SchemaInfo +from giql.schema import TableSchema + + +@pytest.fixture +def schema_with_peaks_and_genes(): + """Schema info with peaks and genes tables.""" + schema = SchemaInfo() + + # Register peaks table + peaks_table = TableSchema(name="peaks", columns={}) + peaks_table.columns["peak_id"] = ColumnInfo(name="peak_id", type="INTEGER") + peaks_table.columns["interval"] = ColumnInfo( + name="interval", + type="VARCHAR", + is_genomic=True, + chrom_col="chromosome", + start_col="start_pos", + end_col="end_pos", + strand_col="strand", + ) + schema.tables["peaks"] = peaks_table + + # Register genes table + genes_table = TableSchema(name="genes", columns={}) + genes_table.columns["gene_id"] = ColumnInfo(name="gene_id", type="INTEGER") + genes_table.columns["name"] = ColumnInfo(name="name", type="VARCHAR") + genes_table.columns["interval"] = ColumnInfo( + name="interval", + type="VARCHAR", + is_genomic=True, + chrom_col="chromosome", + start_col="start_pos", + end_col="end_pos", + strand_col="strand", + ) + schema.tables["genes"] = genes_table + + return schema + + +class TestNearestTranspilationDuckDB: + """Tests for NEAREST transpilation to DuckDB SQL (LATERAL joins).""" + + def test_nearest_basic_k3_duckdb(self, schema_with_peaks_and_genes): + """ + GIVEN a GIQL query with NEAREST(genes, k=3) + WHEN transpiling to DuckDB SQL + THEN should generate LATERAL join with DISTANCE and LIMIT 3 + """ + sql = """ + SELECT * + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = GIQLDuckDBGenerator(schema_info=schema_with_peaks_and_genes) + output = generator.generate(ast) + + # Expectations: + # - LATERAL subquery + # - DISTANCE(...) 
AS distance in SELECT + # - WHERE peaks.chromosome = genes.chromosome (pre-filter) + # - ORDER BY distance + # - LIMIT 3 + assert "LATERAL" in output.upper() + assert "CASE" in output or "DISTANCE" in output # Distance calculation + assert " AS distance" in output or " as distance" in output.lower() + assert "LIMIT 3" in output + assert "ORDER BY" in output + + def test_nearest_with_max_distance_duckdb(self, schema_with_peaks_and_genes): + """ + GIVEN a GIQL query with NEAREST(genes, k=5, max_distance=100000) + WHEN transpiling to DuckDB SQL + THEN should generate LATERAL join with distance filter + """ + sql = """ + SELECT * + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=5, max_distance=100000) + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = GIQLDuckDBGenerator(schema_info=schema_with_peaks_and_genes) + output = generator.generate(ast) + + # Expectations: + # - LATERAL subquery + # - Distance filter: <= 100000 + # - LIMIT 5 + assert "LATERAL" in output.upper() + assert "100000" in output + assert "LIMIT 5" in output + + def test_nearest_standalone_literal_duckdb(self, schema_with_peaks_and_genes): + """ + GIVEN a GIQL query with literal reference NEAREST(genes, reference='chr1:1000-2000', k=3) + WHEN transpiling to DuckDB SQL + THEN should generate standalone query without LATERAL + """ + sql = """ + SELECT * + FROM NEAREST(genes, reference='chr1:1000-2000', k=3) + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = GIQLDuckDBGenerator(schema_info=schema_with_peaks_and_genes) + output = generator.generate(ast) + + # Expectations: + # - No LATERAL (standalone mode) + # - Distance calculation with literal 'chr1', 1000, 2000 + # - ORDER BY distance + # - LIMIT 3 + assert "LATERAL" not in output.upper() + assert "chr1" in output.lower() + assert "LIMIT 3" in output + + def test_nearest_with_stranded_duckdb(self, schema_with_peaks_and_genes): + """ + GIVEN a GIQL query with NEAREST(genes, k=3, stranded=true) + WHEN transpiling to DuckDB SQL + THEN should generate SQL with strand filtering + """ + sql = """ + SELECT * + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3, stranded=true) + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = GIQLDuckDBGenerator(schema_info=schema_with_peaks_and_genes) + output = generator.generate(ast) + + # Expectations: + # - LATERAL subquery + # - Strand filtering in WHERE clause + # - LIMIT 3 + assert "LATERAL" in output.upper() + assert "strand" in output.lower() + assert "LIMIT 3" in output + + +# PostgreSQL uses same generator as base for now +# class TestNearestTranspilationPostgreSQL: +# """Tests for NEAREST transpilation to PostgreSQL SQL (LATERAL joins).""" +# (Skipped - uses BaseGIQLGenerator for now) + + +class TestNearestTranspilationSQLite: + """Tests for NEAREST transpilation to SQLite SQL (using LATERAL for MVP).""" + + def test_nearest_basic_k3_sqlite(self, schema_with_peaks_and_genes): + """ + GIVEN a GIQL query with NEAREST(genes, k=3) + WHEN transpiling to SQLite SQL + THEN should generate LATERAL subquery with ORDER BY and LIMIT + (Note: Using LATERAL for MVP - window function optimization to be added later) + """ + sql = """ + SELECT * + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes) + output = generator.generate(ast) + + # MVP expectations (LATERAL syntax): + # - LATERAL subquery + # - 
Distance calculation (CASE WHEN) + # - ORDER BY distance + # - LIMIT 3 + assert "LATERAL" in output.upper() + assert "CASE" in output.upper() + assert " AS distance" in output or " AS DISTANCE" in output + assert "ORDER BY" in output.upper() + assert "LIMIT 3" in output + + def test_nearest_with_max_distance_sqlite(self, schema_with_peaks_and_genes): + """ + GIVEN a GIQL query with NEAREST(genes, k=5, max_distance=100000) + WHEN transpiling to SQLite SQL + THEN should generate LATERAL with distance filter + (Note: Using LATERAL for MVP - window function optimization to be added later) + """ + sql = """ + SELECT * + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=5, max_distance=100000) + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes) + output = generator.generate(ast) + + # MVP expectations (LATERAL syntax): + # - LATERAL subquery + # - Distance filter: <= 100000 + # - LIMIT 5 + assert "LATERAL" in output.upper() + assert "100000" in output + assert "LIMIT 5" in output + + def test_nearest_standalone_literal_sqlite(self, schema_with_peaks_and_genes): + """ + GIVEN a GIQL query with literal reference NEAREST(genes, reference='chr1:1000-2000', k=3) + WHEN transpiling to SQLite SQL + THEN should generate standalone query without window functions + """ + sql = """ + SELECT * + FROM NEAREST(genes, reference='chr1:1000-2000', k=3) + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes) + output = generator.generate(ast) + + # Expectations: + # - No CTE needed (standalone mode) + # - Distance calculation with literal 'chr1', 1000, 2000 + # - ORDER BY distance + # - LIMIT 3 + assert "chr1" in output.lower() + assert "ORDER BY" in output.upper() + assert "LIMIT 3" in output + + def test_nearest_with_stranded_sqlite(self, schema_with_peaks_and_genes): + """ + GIVEN a GIQL query with NEAREST(genes, k=3, stranded=true) + WHEN transpiling to SQLite SQL + THEN should generate SQL with strand filtering + """ + sql = """ + SELECT * + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3, stranded=true) + """ + + ast = parse_one(sql, dialect=GIQLDialect) + generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes) + output = generator.generate(ast) + + # Expectations: + # - LATERAL subquery + # - Strand filtering in WHERE clause + # - LIMIT 3 + assert "LATERAL" in output.upper() + assert "strand" in output.lower() + assert "LIMIT 3" in output diff --git a/tests/test_parser.py b/tests/test_parser.py new file mode 100644 index 0000000..1536559 --- /dev/null +++ b/tests/test_parser.py @@ -0,0 +1,124 @@ +from sqlglot import parse_one + +from giql.dialect import GIQLDialect +from giql.expressions import Contains +from giql.expressions import Intersects +from giql.expressions import SpatialSetPredicate +from giql.expressions import Within + + +class TestParser: + def test_parse_simple_intersects(self): + """ + GIVEN a SQL query with INTERSECTS operator + WHEN parsing the query + THEN should create an Intersects AST node + """ + sql = "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" + ast = parse_one(sql, dialect=GIQLDialect) + + # Find the INTERSECTS node + intersects_node = None + for node in ast.walk(): + if isinstance(node, Intersects): + intersects_node = node + break + + assert intersects_node is not None + + def test_parse_contains(self): + """ + GIVEN a SQL query with CONTAINS 
operator + WHEN parsing the query + THEN should create a Contains AST node + """ + sql = "SELECT * FROM variants WHERE interval CONTAINS 'chr1:1500'" + ast = parse_one(sql, dialect=GIQLDialect) + + contains_node = None + for node in ast.walk(): + if isinstance(node, Contains): + contains_node = node + break + + assert contains_node is not None + + def test_parse_within(self): + """ + GIVEN a SQL query with WITHIN operator + WHEN parsing the query + THEN should create a Within AST node + """ + sql = "SELECT * FROM variants WHERE interval WITHIN 'chr1:1000-5000'" + ast = parse_one(sql, dialect=GIQLDialect) + + within_node = None + for node in ast.walk(): + if isinstance(node, Within): + within_node = node + break + + assert within_node is not None + + def test_parse_intersects_any(self): + """ + GIVEN a SQL query with INTERSECTS ANY operator + WHEN parsing the query + THEN should create a SpatialSetPredicate with ANY quantifier + """ + sql = ( + "SELECT * FROM v " + "WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000')" + ) + ast = parse_one(sql, dialect=GIQLDialect) + + spatial_set = None + for node in ast.walk(): + if isinstance(node, SpatialSetPredicate): + spatial_set = node + break + + assert spatial_set is not None + assert spatial_set.args["operator"] == "INTERSECTS" + assert spatial_set.args["quantifier"] == "ANY" + + def test_parse_intersects_all(self): + """ + GIVEN a SQL query with INTERSECTS ALL operator + WHEN parsing the query + THEN should create a SpatialSetPredicate with ALL quantifier + """ + sql = ( + "SELECT * FROM v " + "WHERE interval INTERSECTS ALL('chr1:1000-2000', 'chr1:1500-1800')" + ) + ast = parse_one(sql, dialect=GIQLDialect) + + spatial_set = None + for node in ast.walk(): + if isinstance(node, SpatialSetPredicate): + spatial_set = node + break + + assert spatial_set is not None + assert spatial_set.args["operator"] == "INTERSECTS" + assert spatial_set.args["quantifier"] == "ALL" + + def test_parse_contains_any(self): + """ + GIVEN a SQL query with CONTAINS ANY operator + WHEN parsing the query + THEN should create a SpatialSetPredicate with CONTAINS operator + """ + sql = "SELECT * FROM v WHERE interval CONTAINS ANY('chr1:1500', 'chr1:1600')" + ast = parse_one(sql, dialect=GIQLDialect) + + spatial_set = None + for node in ast.walk(): + if isinstance(node, SpatialSetPredicate): + spatial_set = node + break + + assert spatial_set is not None + assert spatial_set.args["operator"] == "CONTAINS" + assert spatial_set.args["quantifier"] == "ANY" diff --git a/tests/test_range_parser.py b/tests/test_range_parser.py new file mode 100644 index 0000000..06e0835 --- /dev/null +++ b/tests/test_range_parser.py @@ -0,0 +1,109 @@ +import pytest + +from giql.range_parser import IntervalType +from giql.range_parser import ParsedRange +from giql.range_parser import RangeParser + + +class TestRangeParser: + def test_parse_simple_range(self): + """ + GIVEN a simple range string + WHEN parsing the range + THEN should return a ParsedRange with correct values + """ + result = RangeParser.parse("chr1:1000-2000") + assert result.chromosome == "chr1" + assert result.start == 1000 + assert result.end == 2000 + assert result.interval_type == IntervalType.HALF_OPEN + assert result.strand is None + + def test_parse_explicit_half_open(self): + """ + GIVEN an explicit half-open range string + WHEN parsing the range + THEN should return a ParsedRange with HALF_OPEN interval type + """ + result = RangeParser.parse("chr1:[1000,2000)") + assert result.interval_type == 
IntervalType.HALF_OPEN + assert result.end == 2000 + + def test_parse_explicit_closed(self): + """ + GIVEN an explicit closed range string + WHEN parsing the range + THEN should return a ParsedRange with CLOSED interval type + """ + result = RangeParser.parse("chr1:[1001,2000]") + assert result.interval_type == IntervalType.CLOSED + assert result.end == 2000 + + def test_parse_with_strand(self): + """ + GIVEN range strings with strand information + WHEN parsing the ranges + THEN should correctly parse the strand + """ + result = RangeParser.parse("chr1:1000-2000:+") + assert result.strand == "+" + + result = RangeParser.parse("chr1:1000-2000:-") + assert result.strand == "-" + + def test_parse_point(self): + """ + GIVEN a point range string + WHEN parsing the range + THEN should return a ParsedRange representing a single position + """ + result = RangeParser.parse("chr1:1500") + assert result.start == 1500 + assert result.end == 1501 + assert result.interval_type == IntervalType.HALF_OPEN + + def test_to_zero_based_half_open(self): + """ + GIVEN a closed interval ParsedRange + WHEN converting to zero-based half-open + THEN should increment the end position + """ + closed = ParsedRange("chr1", 1001, 2000, IntervalType.CLOSED) + converted = closed.to_zero_based_half_open() + assert converted.end == 2001 + assert converted.interval_type == IntervalType.HALF_OPEN + + def test_range_length(self): + """ + GIVEN ParsedRange objects with different interval types + WHEN calculating length + THEN should return correct length for each type + """ + half_open = ParsedRange("chr1", 1000, 2000, IntervalType.HALF_OPEN) + assert half_open.length() == 1000 + + closed = ParsedRange("chr1", 1000, 2000, IntervalType.CLOSED) + assert closed.length() == 1001 + + def test_invalid_range(self): + """ + GIVEN invalid range strings + WHEN parsing the ranges + THEN should raise ValueError + """ + with pytest.raises(ValueError): + RangeParser.parse("invalid") + + with pytest.raises(ValueError): + RangeParser.parse("chr1:2000-1000") + + def test_chromosome_formats(self): + """ + GIVEN various chromosome naming conventions + WHEN parsing the ranges + THEN should correctly parse all formats + """ + assert RangeParser.parse("chr1:100-200").chromosome == "chr1" + assert RangeParser.parse("1:100-200").chromosome == "1" + assert RangeParser.parse("chrX:100-200").chromosome == "chrX" + assert RangeParser.parse("chrM:100-200").chromosome == "chrM" From ce61dc1b79b38ffef282f844fdf9fc3bab19f401 Mon Sep 17 00:00:00 2001 From: Conrad Date: Mon, 8 Dec 2025 14:54:31 -0500 Subject: [PATCH 02/12] Treat undefined release cycle as stable --- build-hooks/_version.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/build-hooks/_version.py b/build-hooks/_version.py index 7d0e90a..a1380c5 100644 --- a/build-hooks/_version.py +++ b/build-hooks/_version.py @@ -314,6 +314,8 @@ def __int__(self) -> int: @classmethod def _missing_(cls, key: int | str) -> Optional[ReleaseCycle]: + if key == -1: + key = 3 return cls.__ReleaseCycle_mapping__.get(key) def render(self) -> str: From 2cde79d0a7624d6718a62ae7f1bc39138eb03a79 Mon Sep 17 00:00:00 2001 From: Conrad Date: Thu, 29 Jan 2026 11:50:19 -0500 Subject: [PATCH 03/12] Remove execution engine and refactor to transpile-only API - Add new public API: `transpile()` function and `Table` dataclass - Replace SchemaInfo/TableSchema/ColumnInfo with simpler Table/Tables types - Remove GIQLEngine, CLI, and dialect-specific generators - Delete integration tests that depended on execution engine - Update all 
generators and transformers to use new Table types --- pyproject.toml | 31 +- src/giql/__init__.py | 15 +- src/giql/cli.py | 683 ------------------ src/giql/engine.py | 371 ---------- src/giql/generators/__init__.py | 8 +- src/giql/generators/base.py | 147 ++-- src/giql/generators/duckdb.py | 7 - src/giql/generators/sqlite.py | 24 - src/giql/protocols.py | 81 --- src/giql/schema.py | 83 --- src/giql/table.py | 131 ++++ src/giql/transformer.py | 47 +- src/giql/transpile.py | 129 ++++ tests/conftest.py | 164 +---- tests/generators/test_base.py | 406 +++-------- tests/integration/bedtools/__init__.py | 5 - tests/integration/bedtools/conftest.py | 46 -- tests/integration/bedtools/test_intersect.py | 313 -------- tests/integration/bedtools/test_merge.py | 224 ------ tests/integration/bedtools/test_nearest.py | 468 ------------ .../integration/bedtools/test_strand_aware.py | 471 ------------ tests/integration/bedtools/utils/__init__.py | 1 - .../integration/bedtools/utils/bed_export.py | 40 - .../bedtools/utils/bedtools_wrapper.py | 303 -------- .../integration/bedtools/utils/comparison.py | 134 ---- .../integration/bedtools/utils/data_models.py | 259 ------- .../bedtools/utils/interval_generator.py | 425 ----------- tests/test_cluster.py | 441 ----------- tests/test_distance_transpilation.py | 8 +- tests/test_engine.py | 480 ------------ tests/test_nearest_edge_cases.py | 633 ---------------- tests/test_nearest_transpilation.py | 195 +---- tests/test_transpile.py | 411 +++++++++++ 33 files changed, 907 insertions(+), 6277 deletions(-) delete mode 100644 src/giql/cli.py delete mode 100644 src/giql/engine.py delete mode 100644 src/giql/generators/duckdb.py delete mode 100644 src/giql/generators/sqlite.py delete mode 100644 src/giql/protocols.py delete mode 100644 src/giql/schema.py create mode 100644 src/giql/table.py create mode 100644 src/giql/transpile.py delete mode 100644 tests/integration/bedtools/__init__.py delete mode 100644 tests/integration/bedtools/conftest.py delete mode 100644 tests/integration/bedtools/test_intersect.py delete mode 100644 tests/integration/bedtools/test_merge.py delete mode 100644 tests/integration/bedtools/test_nearest.py delete mode 100644 tests/integration/bedtools/test_strand_aware.py delete mode 100644 tests/integration/bedtools/utils/__init__.py delete mode 100644 tests/integration/bedtools/utils/bed_export.py delete mode 100644 tests/integration/bedtools/utils/bedtools_wrapper.py delete mode 100644 tests/integration/bedtools/utils/comparison.py delete mode 100644 tests/integration/bedtools/utils/data_models.py delete mode 100644 tests/integration/bedtools/utils/interval_generator.py delete mode 100644 tests/test_cluster.py delete mode 100644 tests/test_engine.py delete mode 100644 tests/test_nearest_edge_cases.py create mode 100644 tests/test_transpile.py diff --git a/pyproject.toml b/pyproject.toml index 25874b1..766b52f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,13 +15,7 @@ authors = [ { name = "Conrad Bzura", email = "conradbzura@gmail.com" }, ] dependencies = [ - "click>=8.3.0", - "duckdb>=1.4.0", - "oxbow>=0.4.0", - "pandas>=2.0.0", - "psycopg2-binary>=2.9.10", "sqlglot>=20.0.0", - "sqlparse>=0.4.0", ] description = "Genomic Interval Query Language - SQL dialect for genomic range queries" dynamic = ["version"] @@ -33,20 +27,16 @@ name = "giql" readme = "README.md" requires-python = ">=3.11" -[project.scripts] -giql = "giql.cli:cli" - [project.optional-dependencies] -all = [ - "duckdb>=0.9.0", - "mysql-connector-python>=8.0.0", - 
"psycopg2-binary>=2.9.0", +dev = [ + "duckdb>=1.4.0", + "hypothesis>=6.0.0", + "pandas>=2.0.0", + "pybedtools>=0.9.0", + "pytest-cov>=4.0.0", + "pytest>=7.0.0", + "ruff>=0.1.0", ] -dev = ["pytest-cov>=4.0.0", "pytest>=7.0.0", "ruff>=0.1.0", "hypothesis", "pybedtools"] -duckdb = ["duckdb>=0.9.0"] -mysql = ["mysql-connector-python>=8.0.0"] -postgres = ["psycopg2-binary>=2.9.0"] -sqlite = [] [tool.hatch.metadata.hooks.custom] path = "build-hooks/metadata.py" @@ -79,13 +69,8 @@ bedtools = ">=2.31.0" pybedtools = ">=0.9.0" pytest = ">=7.0.0" pytest-cov = ">=4.0.0" -click = ">=8.3.0" duckdb = ">=1.4.0" pandas = ">=2.0.0" -pyarrow = ">=19.0.0" -psycopg2-binary = ">=2.9.10" sqlglot = ">=20.0.0" pip = "*" -oxbow = ">=0.4.0" -sqlparse = ">=0.4.0" hypothesis = ">=6.148.2,<7" diff --git a/src/giql/__init__.py b/src/giql/__init__.py index e840f17..064f546 100644 --- a/src/giql/__init__.py +++ b/src/giql/__init__.py @@ -1,19 +1,22 @@ """GIQL - Genomic Interval Query Language. -A SQL dialect for genomic range queries with multi-database support. +A SQL dialect for genomic range queries. This package provides: - - GIQL dialect extending SQL with spatial operators - - Query engine supporting multiple backends (DuckDB, SQLite) + - GIQL dialect extending SQL with spatial operators (INTERSECTS, CONTAINS, WITHIN) + - CLUSTER and MERGE operations for interval grouping + - NEAREST operator for finding closest intervals - Range parser for genomic coordinate strings - - Schema management for genomic data + - Transpilation to standard SQL-92 compatible output """ -from giql.engine import GIQLEngine as GIQLEngine +from giql.table import Table +from giql.transpile import transpile __version__ = "0.1.0" __all__ = [ - "GIQLEngine", + "Table", + "transpile", ] diff --git a/src/giql/cli.py b/src/giql/cli.py deleted file mode 100644 index d714075..0000000 --- a/src/giql/cli.py +++ /dev/null @@ -1,683 +0,0 @@ -"""Command-line interface for GIQL. - -This module provides a CLI that mirrors bedtools intersect functionality -using GIQL's genomic query capabilities. -""" - -import sys -from pathlib import Path - -import click -import duckdb -from oxbow import from_bam -from oxbow import from_bed -from oxbow import from_gff -from oxbow import from_gtf -from oxbow import from_vcf - -from giql import GIQLEngine - - -@click.group() -@click.version_option() -def cli(): - """GIQL - Genomic Interval Query Language. - - SQL-based toolkit for genomic range queries. - """ - pass - - -def _detect_file_format(file_path: Path) -> str: - """Detect genomic file format from file extension. - - :param file_path: - Path to the file - :return: - Format identifier: 'bed', 'bam', 'vcf', 'gff', 'gtf' - :raises click.ClickException: - If format cannot be determined - """ - # Handle compressed files - suffixes = file_path.suffixes - if suffixes[-1] == ".gz": - # Remove .gz and check the actual format - ext = suffixes[-2] if len(suffixes) >= 2 else "" - else: - ext = file_path.suffix - - ext = ext.lower() - - format_map = { - ".bed": "bed", - ".bam": "bam", - ".vcf": "vcf", - ".gff": "gff", - ".gff3": "gff", - ".gtf": "gtf", - } - - if ext in format_map: - return format_map[ext] - - raise click.ClickException( - f"Unsupported file format: {ext}. Supported formats: BED, BAM, VCF, GFF, GTF" - ) - - -def _load_genomic_file( - conn: duckdb.DuckDBPyConnection, file_path: Path, table_name: str -) -> dict[str, str]: - """Load genomic file using appropriate oxbow function. 
- - :param conn: - DuckDB connection - :param file_path: - Path to genomic file - :param table_name: - Name for the table to create - :return: - Dictionary mapping column names to types - :raises click.ClickException: - If file cannot be loaded - """ - fmt = _detect_file_format(file_path) - compression = "gzip" if file_path.suffix == ".gz" else None - - try: - match fmt: - case "bed": - df = from_bed(str(file_path), compression=compression).to_duckdb(conn) - case "bam": - df = from_bam(str(file_path)).to_duckdb(conn) - case "vcf": - df = from_vcf(str(file_path), compression=compression).to_duckdb(conn) - case "gff": - df = from_gff(str(file_path), compression=compression).to_duckdb(conn) - case "gtf": - df = from_gtf(str(file_path), compression=compression).to_duckdb(conn) - case _: - raise click.ClickException(f"Unsupported format: {fmt}") - - conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df") - - # Get column information - col_info = conn.execute(f"DESCRIBE {table_name}").fetchall() - return {col[0]: col[1] for col in col_info} - - except Exception as e: - raise click.ClickException(f"Failed to load {file_path}: {e}") - - -def _expand_rest_columns(df): - """Expand 'rest' columns from BED files into separate columns. - - BED files store extra fields beyond chrom/start/end in a 'rest' column - as a tab-delimited string. This function expands those into separate columns - to match bedtools output format. - - :param df: - DataFrame with potential 'rest' columns - :return: - DataFrame with rest columns expanded - """ - import pandas as pd - - # pandas.read_sql can return duplicate column names when joining - # Find all 'rest' column positions - rest_indices = [i for i, col in enumerate(df.columns) if col == "rest"] - - if not rest_indices: - return df - - # Build new dataframe with expanded columns - # We need to handle duplicate column names, so we can't use a dict - new_data = {} - new_col_names = [] - - for i, col in enumerate(df.columns): - if col == "rest" and i in rest_indices: - # Expand this rest column - col_data = df.iloc[:, i] - expanded = col_data.fillna("").astype(str).str.split("\t", expand=True) - - # Add all expanded columns with unique names - for j in range(expanded.shape[1]): - col_name = f"field_{j + 4}" - # Make unique if duplicate - base_name = col_name - counter = 0 - while col_name in new_col_names: - counter += 1 - col_name = f"{base_name}_{counter}" - new_col_names.append(col_name) - new_data[col_name] = expanded[j] - else: - # Keep non-rest columns as-is - # Make unique names for duplicates - col_name = col - base_name = col_name - counter = 0 - while col_name in new_col_names: - counter += 1 - col_name = f"{base_name}_{counter}" - new_col_names.append(col_name) - new_data[col_name] = df.iloc[:, i] - - # Rebuild dataframe with explicit column order - result = pd.DataFrame(new_data, columns=new_col_names) - return result - - -def _detect_genomic_columns(columns: dict[str, str]) -> dict[str, str | None]: - """Detect genomic coordinate columns from available columns. 
- - :param columns: - Dictionary of column name -> type - :return: - Dictionary with keys: chrom_col, start_col, end_col, strand_col - """ - col_names = {c.lower(): c for c in columns.keys()} - - # Chromosome column patterns (in priority order) - chrom_col = None - for pattern in ["chrom", "seqid", "chr", "chromosome", "contig", "seqname"]: - if pattern in col_names: - chrom_col = col_names[pattern] - break - - # Start column patterns - start_col = None - for pattern in [ - "start", - "chromstart", - "pos", - "begin", - "txstart", - "cdsstart", - "thickstart", - ]: - if pattern in col_names: - start_col = col_names[pattern] - break - - # End column patterns - end_col = None - for pattern in [ - "end", - "chromend", - "stop", - "txend", - "cdsend", - "thickend", - ]: - if pattern in col_names: - end_col = col_names[pattern] - break - - # Strand column patterns - strand_col = None - for pattern in ["strand", "str", "orientation"]: - if pattern in col_names: - strand_col = col_names[pattern] - break - - return { - "chrom_col": chrom_col, - "start_col": start_col, - "end_col": end_col, - "strand_col": strand_col, - } - - -@cli.command() -@click.option( - "-a", - "--file-a", - required=True, - type=click.Path(exists=True), - help="BAM/BED/GFF/VCF file 'A'. Each feature in A is compared to B.", -) -@click.option( - "-b", - "--file-b", - required=True, - multiple=True, - type=click.Path(exists=True), - help="One or more BAM/BED/GFF/VCF files for comparison.", -) -@click.option( - "-wa", - "--write-a", - is_flag=True, - help="Write the original entry in A for each overlap.", -) -@click.option( - "-wb", - "--write-b", - is_flag=True, - help="Write the original entry in B for each overlap.", -) -@click.option( - "-loj", - "--left-outer-join", - is_flag=True, - help="Perform left outer join. 
Report all A features with NULL B when no overlap.", -) -@click.option( - "-wo", - "--write-overlap", - is_flag=True, - help="Write the number of overlapping base pairs between features.", -) -@click.option( - "-wao", - "--write-all-overlap", - is_flag=True, - help="Like -wo but includes A features with zero overlap.", -) -@click.option( - "-u", - "--unique", - is_flag=True, - help="Report each A feature only once if any overlap exists in B.", -) -@click.option( - "-c", - "--count", - is_flag=True, - help="For each entry in A, report the number of overlaps in B.", -) -@click.option( - "-v", - "--invert", - is_flag=True, - help="Only report entries in A that have no overlap in B.", -) -@click.option( - "-f", - "--fraction-a", - type=float, - help="Minimum overlap as fraction of A.", -) -@click.option( - "-F", - "--fraction-b", - type=float, - help="Minimum overlap as fraction of B.", -) -@click.option( - "-r", - "--reciprocal", - is_flag=True, - help="Require reciprocal overlap fraction for both A and B.", -) -@click.option( - "-e", - "--either", - is_flag=True, - help="Require that -f OR -F be satisfied (not both).", -) -@click.option( - "-s", - "--same-strand", - is_flag=True, - help="Require same strand for overlaps.", -) -@click.option( - "-S", - "--opposite-strand", - is_flag=True, - help="Require opposite strand for overlaps.", -) -@click.option( - "--header", - is_flag=True, - help="Print the header from A before results.", -) -@click.option( - "--names", - multiple=True, - help="Aliases for B files (instead of file numbers).", -) -@click.option( - "-sorted", - "--sorted-input", - is_flag=True, - help="For compatibility with bedtools (currently ignored).", -) -@click.option( - "--chunksize", - type=int, - help="Process results in chunks of N rows (streaming mode for large datasets).", -) -def intersect( - file_a, - file_b, - write_a, - write_b, - left_outer_join, - write_overlap, - write_all_overlap, - unique, - count, - invert, - fraction_a, - fraction_b, - reciprocal, - either, - same_strand, - opposite_strand, - header, - names, - sorted_input, - chunksize, -): - """Find overlaps between genomic features. - - Similar to bedtools intersect, this command finds overlapping intervals - between files A and B using GIQL's spatial operators. - - Supports BED, BAM, VCF, GFF, and GTF formats (gzip compressed or uncompressed). - """ - # Validate conflicting options - if same_strand and opposite_strand: - raise click.UsageError("Cannot use -s and -S together") - - output_modes = [ - write_a, - write_b, - left_outer_join, - write_overlap, - write_all_overlap, - unique, - count, - invert, - ] - if sum(output_modes) > 1: - raise click.UsageError("Can only specify one output mode") - - # Create DuckDB connection - conn = duckdb.connect() - - # Initialize engine with existing connection - engine = GIQLEngine(target_dialect="duckdb", connection=conn) - - try: - # Load file A - file_a_path = Path(file_a) - table_a = "file_a" - columns_a = _load_genomic_file(conn, file_a_path, table_a) - - # Detect genomic columns - genomic_cols_a = _detect_genomic_columns(columns_a) - - if not all( - [ - genomic_cols_a["chrom_col"], - genomic_cols_a["start_col"], - genomic_cols_a["end_col"], - ] - ): - raise click.ClickException( - f"Could not detect genomic columns in {file_a}. 
" - f"Found columns: {list(columns_a.keys())}" - ) - - # Register schema for file A - engine.register_table_schema( - table_a, - columns_a, - genomic_column="interval", - chrom_col=genomic_cols_a["chrom_col"], - start_col=genomic_cols_a["start_col"], - end_col=genomic_cols_a["end_col"], - strand_col=genomic_cols_a["strand_col"], - ) - - # Process file(s) B - results = [] - for idx, b_file in enumerate(file_b): - b_path = Path(b_file) - table_b = f"file_b_{idx}" - - # Load file B - columns_b = _load_genomic_file(conn, b_path, table_b) - - # Detect genomic columns in B - genomic_cols_b = _detect_genomic_columns(columns_b) - - if not all( - [ - genomic_cols_b["chrom_col"], - genomic_cols_b["start_col"], - genomic_cols_b["end_col"], - ] - ): - raise click.ClickException( - f"Could not detect genomic columns in {b_file}" - ) - - # Register schema for file B - engine.register_table_schema( - table_b, - columns_b, - genomic_column="region", - chrom_col=genomic_cols_b["chrom_col"], - start_col=genomic_cols_b["start_col"], - end_col=genomic_cols_b["end_col"], - strand_col=genomic_cols_b["strand_col"], - ) - - # Build query based on options - query = _build_intersect_query( - table_a=table_a, - table_b=table_b, - chrom_a=genomic_cols_a["chrom_col"], - start_a=genomic_cols_a["start_col"], - end_a=genomic_cols_a["end_col"], - strand_a=genomic_cols_a["strand_col"], - chrom_b=genomic_cols_b["chrom_col"], - start_b=genomic_cols_b["start_col"], - end_b=genomic_cols_b["end_col"], - strand_b=genomic_cols_b["strand_col"], - write_a=write_a, - write_b=write_b, - left_outer_join=left_outer_join, - write_overlap=write_overlap, - write_all_overlap=write_all_overlap, - unique=unique, - count=count, - invert=invert, - same_strand=same_strand, - opposite_strand=opposite_strand, - fraction_a=fraction_a, - fraction_b=fraction_b, - reciprocal=reciprocal, - either=either, - ) - - # Execute query and get cursor - cursor = engine.execute(query) - - # Get column names - col_names = [desc[0] for desc in cursor.description] - - # Output header if requested (only once, before first row) - if header and idx == 0: - print("\t".join(col_names)) - - # Stream results row by row - while True: - row = cursor.fetchone() - if row is None: - break - # Expand rest columns inline - output_fields = [] - for i, value in enumerate(row): - col_name = col_names[i] - if col_name == "rest" and value: - # Expand rest column - split on tabs - rest_fields = str(value).split("\t") - output_fields.extend(rest_fields) - else: - output_fields.append(str(value) if value is not None else "") - - # Add file identifier if needed - if names and idx < len(names): - output_fields.append(names[idx]) - elif len(file_b) > 1: - output_fields.append(b_path.name) - - # Output row as TSV - print("\t".join(output_fields)) - - finally: - engine.close() - - -def _build_intersect_query( - table_a: str, - table_b: str, - chrom_a: str, - start_a: str, - end_a: str, - strand_a: str | None, - chrom_b: str, - start_b: str, - end_b: str, - strand_b: str | None, - write_a: bool = False, - write_b: bool = False, - left_outer_join: bool = False, - write_overlap: bool = False, - write_all_overlap: bool = False, - unique: bool = False, - count: bool = False, - invert: bool = False, - same_strand: bool = False, - opposite_strand: bool = False, - fraction_a: float | None = None, - fraction_b: float | None = None, - reciprocal: bool = False, - either: bool = False, -) -> str: - """Build GIQL query based on intersect options.""" - - # Build strand filter if needed - strand_filter = 
"" - if same_strand and strand_a and strand_b: - strand_filter = f' AND a."{strand_a}" = b."{strand_b}"' - elif opposite_strand and strand_a and strand_b: - strand_filter = f' AND a."{strand_a}" != b."{strand_b}"' - - # Build fraction filter if needed - fraction_filter = "" - if fraction_a or fraction_b: - filters = [] - - if fraction_a: - # Overlap must be at least fraction_a of A's length - overlap_expr = ( - f'LEAST(a."{end_a}", b."{end_b}") - ' - f'GREATEST(a."{start_a}", b."{start_b}")' - ) - a_length = f'(a."{end_a}" - a."{start_a}")' - filters.append(f"({overlap_expr}::FLOAT / {a_length} >= {fraction_a})") - - if fraction_b: - # Overlap must be at least fraction_b of B's length - overlap_expr = ( - f'LEAST(a."{end_a}", b."{end_b}") - ' - f'GREATEST(a."{start_a}", b."{start_b}")' - ) - b_length = f'(b."{end_b}" - b."{start_b}")' - filters.append(f"({overlap_expr}::FLOAT / {b_length} >= {fraction_b})") - - # Combine filters based on reciprocal/either flags - if reciprocal and len(filters) == 2: - # Both must be satisfied (AND) - fraction_filter = f" AND ({filters[0]} AND {filters[1]})" - elif either and len(filters) == 2: - # Either must be satisfied (OR) - fraction_filter = f" AND ({filters[0]} OR {filters[1]})" - elif filters: - # Just one filter or default behavior - fraction_filter = f" AND {' AND '.join(filters)}" - - if invert: - # Only features in A with no overlap in B - where_clause = f"a.interval INTERSECTS b.region{strand_filter}{fraction_filter}" - return f""" - SELECT a.* - FROM {table_a} a - WHERE NOT EXISTS ( - SELECT 1 FROM {table_b} b - WHERE {where_clause} - ) - """ - - if count: - # Count overlaps - # Get all columns from table A for GROUP BY - on_clause = f"a.interval INTERSECTS b.region{strand_filter}{fraction_filter}" - return f""" - SELECT a.*, COUNT(b.\"{chrom_b}\") as overlap_count - FROM {table_a} a - LEFT JOIN {table_b} b ON {on_clause} - GROUP BY ALL - """ - - if unique: - # Report each A feature only once if overlaps exist - on_clause = f"a.interval INTERSECTS b.region{strand_filter}{fraction_filter}" - return f""" - SELECT DISTINCT a.* - FROM {table_a} a - JOIN {table_b} b ON {on_clause} - """ - - if left_outer_join or write_all_overlap: - # Left outer join - join_type = "LEFT JOIN" - else: - join_type = "JOIN" - - # Build select clause - if write_a and not write_b: - select_clause = "a.*" - elif write_b and not write_a: - select_clause = "b.*" - else: - # Default: write both A and B - select_clause = "a.*, b.*" - - # Add overlap calculation if requested - if write_overlap or write_all_overlap: - # Calculate overlap size: min(end_a, end_b) - max(start_a, start_b) - overlap_expr = f""" - CASE - WHEN b.\"{chrom_b}\" IS NULL THEN 0 - ELSE GREATEST(0, - LEAST(a.\"{end_a}\", b.\"{end_b}\") - - GREATEST(a.\"{start_a}\", b.\"{start_b}\") - ) - END as overlap_bp - """ - select_clause = f"{select_clause}, {overlap_expr}" - - # Build ON clause - on_clause = f"a.interval INTERSECTS b.region{strand_filter}{fraction_filter}" - - # Build base query - query = f""" - SELECT {select_clause} - FROM {table_a} a - {join_type} {table_b} b ON {on_clause} - """ - - return query - - -if __name__ == "__main__": - cli() diff --git a/src/giql/engine.py b/src/giql/engine.py deleted file mode 100644 index b1c5087..0000000 --- a/src/giql/engine.py +++ /dev/null @@ -1,371 +0,0 @@ -"""Multi-backend query engine for GIQL. - -This module provides the main query engine that supports multiple SQL databases -through transpilation of GIQL syntax to standard SQL. 
-""" - -from typing import Literal - -import pandas as pd -from sqlglot import parse_one - -from giql.constants import DEFAULT_CHROM_COL -from giql.constants import DEFAULT_END_COL -from giql.constants import DEFAULT_GENOMIC_COL -from giql.constants import DEFAULT_START_COL -from giql.constants import DEFAULT_STRAND_COL -from giql.dialect import GIQLDialect -from giql.generators import BaseGIQLGenerator -from giql.generators import GIQLDuckDBGenerator -from giql.protocols import CursorLike -from giql.range_parser import CoordinateSystem -from giql.range_parser import IntervalType -from giql.schema import ColumnInfo -from giql.schema import SchemaInfo -from giql.schema import TableSchema -from giql.transformer import ClusterTransformer -from giql.transformer import MergeTransformer - -DialectType = Literal["duckdb", "sqlite"] - - -class GIQLEngine: - """Multi-backend GIQL query engine. - - Supports multiple SQL databases through transpilation of GIQL syntax - to standard SQL. Can work with DuckDB, SQLite, and other backends. - - Examples - -------- - Query a pandas DataFrame with DuckDB:: - - import pandas as pd - from giql import GIQLEngine - - df = pd.DataFrame( - { - "id": [1, 2, 3], - "chromosome": ["chr1", "chr1", "chr2"], - "start_pos": [1500, 10500, 500], - "end_pos": [1600, 10600, 600], - } - ) - with GIQLEngine(target_dialect="duckdb") as engine: - engine.conn.register("variants", df) - cursor = engine.execute( - "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" - ) - for row in cursor: - print(row) - - Load from CSV:: - - with GIQLEngine(target_dialect="duckdb") as engine: - engine.load_csv("variants", "variants.csv") - cursor = engine.execute( - "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" - ) - # Process rows lazily - while True: - row = cursor.fetchone() - if row is None: - break - print(row) - - Using SQLite backend:: - - with GIQLEngine(target_dialect="sqlite", db_path="data.db") as engine: - cursor = engine.execute( - "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" - ) - # Materialize all results at once - results = cursor.fetchall() - """ - - def __init__( - self, - target_dialect: DialectType | str = "duckdb", - connection=None, - db_path: str = ":memory:", - verbose: bool = False, - **dialect_options, - ): - """Initialize engine. - - :param target_dialect: - Target SQL dialect ('duckdb', 'sqlite', 'standard') - :param connection: - Existing database connection (optional) - :param db_path: - Database path or connection string - :param verbose: - Print transpiled SQL - :param dialect_options: - Additional options for specific dialects - """ - self.target_dialect = target_dialect - self.verbose = verbose - self.schema_info = SchemaInfo() - self.dialect_options = dialect_options - - # Initialize connection - if connection: - self.conn = connection - self.owns_connection = False - else: - self.conn = self._create_connection(db_path) - self.owns_connection = True - - # Get appropriate generator - self.generator = self._get_generator() - - # Initialize query transformers - self.cluster_transformer = ClusterTransformer(self.schema_info) - self.merge_transformer = MergeTransformer(self.schema_info) - - def _create_connection(self, db_path: str): - """Create database connection based on target dialect. 
- - :param db_path: - Path to database file or connection string - :return: - Connection object for the specified database backend - :raises ImportError: - If the required database driver is not installed - :raises ValueError: - If the dialect is not supported - """ - if self.target_dialect == "duckdb": - try: - import duckdb - - return duckdb.connect(db_path) - except ImportError: - raise ImportError("DuckDB not installed.") - - elif self.target_dialect == "sqlite": - import sqlite3 - - return sqlite3.connect(db_path) - - else: - raise ValueError( - f"Unsupported dialect: {self.target_dialect}. Supported: duckdb, sqlite" - ) - - def _get_generator(self): - """Get generator for target dialect. - - :return: - SQL generator instance configured for the target dialect - """ - generators = { - "duckdb": GIQLDuckDBGenerator, - "sqlite": BaseGIQLGenerator, - "standard": BaseGIQLGenerator, - } - - generator_class = generators.get(self.target_dialect, BaseGIQLGenerator) - return generator_class(schema_info=self.schema_info, **self.dialect_options) - - def register_table_schema( - self, - table_name: str, - columns: dict[str, str], - genomic_column: str = DEFAULT_GENOMIC_COL, - chrom_col: str = DEFAULT_CHROM_COL, - start_col: str = DEFAULT_START_COL, - end_col: str = DEFAULT_END_COL, - strand_col: str | None = DEFAULT_STRAND_COL, - coordinate_system: str = "0based", - interval_type: str = "half_open", - ): - """Register schema for a table. - - This method tells the engine how genomic ranges are stored in the table, - mapping logical genomic column names to physical column names. - - :param table_name: - Table name - :param columns: - Dict of column_name -> type - :param genomic_column: - Logical name for genomic position - :param chrom_col: - Physical chromosome column - :param start_col: - Physical start position column - :param end_col: - Physical end position column - :param strand_col: - Physical strand column (optional) - :param coordinate_system: - Coordinate system: "0based" or "1based" (default: "0based") - :param interval_type: - Interval endpoint handling: "half_open" or "closed" (default: "half_open") - """ - # Convert string parameters to enums - coord_sys = ( - CoordinateSystem.ONE_BASED - if coordinate_system == "1based" - else CoordinateSystem.ZERO_BASED - ) - int_type = ( - IntervalType.CLOSED if interval_type == "closed" else IntervalType.HALF_OPEN - ) - - column_infos = {} - - for col_name, col_type in columns.items(): - column_infos[col_name] = ColumnInfo( - name=col_name, type=col_type, is_genomic=False - ) - - # Add virtual genomic column with mappings to physical columns - column_infos[genomic_column] = ColumnInfo( - name=genomic_column, - type="GENOMIC_RANGE", # Virtual type - is_genomic=True, - chrom_col=chrom_col, - start_col=start_col, - end_col=end_col, - strand_col=strand_col, - coordinate_system=coord_sys, - interval_type=int_type, - ) - - table_schema = TableSchema(table_name, column_infos) - self.schema_info.register_table(table_name, table_schema) - - def load_csv(self, table_name: str, file_path: str): - """Load CSV file into database. 
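Because register_table_schema accepts coordinate-system options, tables that do not follow the BED-style defaults can be described explicitly. A sketch of registering a GFF-style table against the engine API shown above (table and column names are hypothetical)::

    from giql import GIQLEngine

    with GIQLEngine(target_dialect="duckdb") as engine:
        engine.register_table_schema(
            "annotations",
            {
                "feature_id": "INTEGER",
                "chrom": "VARCHAR",
                "feat_start": "BIGINT",
                "feat_end": "BIGINT",
                "strand": "VARCHAR",
            },
            genomic_column="interval",
            chrom_col="chrom",
            start_col="feat_start",
            end_col="feat_end",
            strand_col="strand",
            coordinate_system="1based",   # GFF coordinates start at 1
            interval_type="closed",       # GFF end positions are inclusive
        )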
- - :param table_name: - Name to assign to the table - :param file_path: - Path to the CSV file - """ - if self.target_dialect == "duckdb": - self.conn.execute( - f"CREATE TABLE {table_name} " - f"AS SELECT * FROM read_csv_auto('{file_path}')" - ) - elif self.target_dialect == "sqlite": - # Use pandas for SQLite - df = pd.read_csv(file_path) - df.to_sql(table_name, self.conn, if_exists="replace", index=False) - - if self.verbose: - print(f"Loaded {table_name} from {file_path}") - - def load_parquet(self, table_name: str, file_path: str): - """Load Parquet file into database. - - :param table_name: - Name to assign to the table - :param file_path: - Path to the Parquet file - """ - if self.target_dialect == "duckdb": - self.conn.execute( - f"CREATE TABLE {table_name} AS SELECT * FROM read_parquet('{file_path}')" - ) - else: - df = pd.read_parquet(file_path) - df.to_sql(table_name, self.conn, if_exists="replace", index=False) - - if self.verbose: - print(f"Loaded {table_name} from {file_path}") - - def transpile(self, giql: str) -> str: - """Transpile a GIQL query to the engine's target SQL dialect. - - Parses the GIQL syntax and transpiles it to the target SQL dialect - without executing it. Useful for debugging or generating SQL for - external use. - - :param giql: - Query string with GIQL genomic extensions - :return: - Transpiled SQL query string in the target dialect - :raises ValueError: - If the query cannot be parsed or transpiled - """ - # Parse with GIQL dialect - try: - ast = parse_one(giql, dialect=GIQLDialect) - except Exception as e: - raise ValueError(f"Parse error: {e}\nQuery: {giql}") - - # Transform query (MERGE first, then CLUSTER) - try: - # Apply MERGE transformation (which may internally use CLUSTER) - ast = self.merge_transformer.transform(ast) - # Apply CLUSTER transformation for any standalone CLUSTER expressions - ast = self.cluster_transformer.transform(ast) - except Exception as e: - raise ValueError(f"Transformation error: {e}") - - # Transpile to target dialect - try: - target_sql = self.generator.generate(ast) - except Exception as e: - raise ValueError(f"Transpilation error: {e}") - - if self.verbose: - print(f"\n{'=' * 60}") - print(f"Target Dialect: {self.target_dialect}") - print("\nOriginal GIQL:") - print(giql) - print("\nTranspiled SQL:") - print(target_sql) - print(f"{'=' * 60}\n") - - return target_sql - - def execute(self, giql: str) -> CursorLike: - """Execute a GIQL query and return a database cursor. - - Parses the GIQL syntax, transpiles to target SQL dialect, - and executes the query returning a cursor for lazy iteration. - - :param giql: - Query string with GIQL genomic extensions - :return: - Database cursor (DB-API 2.0 compatible) that can be iterated - :raises ValueError: - If the query cannot be parsed, transpiled, or executed - """ - # Transpile GIQL to target SQL - target_sql = self.transpile(giql) - - # Execute and return cursor - try: - return self.conn.execute(target_sql) - except Exception as e: - raise ValueError(f"Execution error: {e}\nSQL: {target_sql}") - - def execute_raw(self, sql: str) -> pd.DataFrame: - """Execute raw SQL directly, bypassing GIQL parsing. - - :param sql: - Raw SQL query string - :return: - Query results as a pandas DataFrame - """ - return pd.read_sql(sql, self.conn) - - def close(self): - """Close database connection. - - Only closes connections created by the engine. If an external - connection was provided during initialization, it is not closed. 
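The ownership rule documented above means externally supplied connections survive engine teardown. A small sketch of that behavior (illustrative only; assumes the duckdb package is installed)::

    import duckdb

    from giql import GIQLEngine

    conn = duckdb.connect(":memory:")    # caller-owned connection
    engine = GIQLEngine(target_dialect="duckdb", connection=conn)
    engine.close()                       # no-op: engine did not create conn
    conn.execute("SELECT 1")             # connection is still usable
    conn.close()                         # the caller closes it explicitly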
- """ - if self.owns_connection and self.conn: - self.conn.close() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): # noqa: ANN001 - self.close() diff --git a/src/giql/generators/__init__.py b/src/giql/generators/__init__.py index b04bd93..ca8cb16 100644 --- a/src/giql/generators/__init__.py +++ b/src/giql/generators/__init__.py @@ -1,9 +1,5 @@ -""" -SQL generators for different database dialects. -""" +"""SQL generators for GIQL transpilation.""" from giql.generators.base import BaseGIQLGenerator -from giql.generators.duckdb import GIQLDuckDBGenerator -from giql.generators.sqlite import GIQLSQLiteGenerator -__all__ = ["BaseGIQLGenerator", "GIQLDuckDBGenerator", "GIQLSQLiteGenerator"] +__all__ = ["BaseGIQLGenerator"] diff --git a/src/giql/generators/base.py b/src/giql/generators/base.py index 2313821..316cd00 100644 --- a/src/giql/generators/base.py +++ b/src/giql/generators/base.py @@ -15,7 +15,7 @@ from giql.expressions import Within from giql.range_parser import ParsedRange from giql.range_parser import RangeParser -from giql.schema import SchemaInfo +from giql.table import Tables class BaseGIQLGenerator(Generator): @@ -48,9 +48,9 @@ def _extract_bool_param(param_expr: Optional[exp.Expression]) -> bool: else: return str(param_expr).upper() in ("TRUE", "1", "YES") - def __init__(self, schema_info: Optional[SchemaInfo] = None, **kwargs): + def __init__(self, tables: Optional[Tables] = None, **kwargs): super().__init__(**kwargs) - self.schema_info = schema_info or SchemaInfo() + self.tables = tables or Tables() self._current_table = None # Track current table for column resolution self._alias_to_table = {} # Map aliases to table names @@ -187,40 +187,26 @@ def giqlnearest_sql(self, expression: GIQLNearest) -> str: else: # Implicit reference in correlated mode - get strand from outer table outer_table = self._find_outer_table_in_lateral_join(expression) - if outer_table and self.schema_info: + if outer_table and self.tables: actual_table = self._alias_to_table.get(outer_table, outer_table) - table_schema = self.schema_info.get_table(actual_table) - if table_schema: - for col_info in table_schema.columns.values(): - if col_info.is_genomic and col_info.strand_col: - ref_strand = f'{outer_table}."{col_info.strand_col}"' - break + table = self.tables.get(actual_table) + if table and table.strand_col: + ref_strand = f'{outer_table}."{table.strand_col}"' # Get strand column for target table - target_table_info = ( - self.schema_info.get_table(table_name) if self.schema_info else None - ) - if target_table_info: - for col_info in target_table_info.columns.values(): - if col_info.is_genomic and col_info.strand_col: - target_strand = f'{table_name}."{col_info.strand_col}"' - break + target_table = self.tables.get(table_name) if self.tables else None + if target_table and target_table.strand_col: + target_strand = f'{table_name}."{target_table.strand_col}"' # Determine if we should add 1 for gap distances (bedtools compatibility) # This depends on the interval types of the tables involved add_one = False - if self.schema_info: - target_table_info = self.schema_info.get_table(table_name) - if target_table_info: - for col_info in target_table_info.columns.values(): - if col_info.is_genomic: - # Import IntervalType to check - from giql.range_parser import IntervalType - - # Add 1 for closed intervals (bedtools behavior) - if col_info.interval_type == IntervalType.CLOSED: - add_one = True - break + if self.tables: + target_table = self.tables.get(table_name) + if 
target_table: + # Add 1 for closed intervals (bedtools behavior) + if target_table.interval_type == "closed": + add_one = True # Build distance calculation using CASE expression # For NEAREST: ORDER BY absolute distance, but RETURN signed distance @@ -338,30 +324,25 @@ def giqldistance_sql(self, expression: GIQLDistance) -> str: raise ValueError("Literal range as second argument not yet supported") # Determine if we should add 1 for gap distances (bedtools compatibility) - # Check interval types from schema + # Check interval types from table config add_one = False - if self.schema_info: + if self.tables: # Extract table names from column references # Column refs look like "table.column" or "alias.column" table_a = interval_a_sql.split(".")[0] if "." in interval_a_sql else None table_b = interval_b_sql.split(".")[0] if "." in interval_b_sql else None # Check if either table uses closed intervals - from giql.range_parser import IntervalType - - for table_name in [table_a, table_b]: - if table_name: + for tbl_name in [table_a, table_b]: + if tbl_name: # Remove quotes if present - table_name = table_name.strip('"') + tbl_name = tbl_name.strip('"') # Check if it's an alias first - actual_table = self._alias_to_table.get(table_name, table_name) - table_info = self.schema_info.get_table(actual_table) - if table_info: - for col_info in table_info.columns.values(): - if col_info.is_genomic: - if col_info.interval_type == IntervalType.CLOSED: - add_one = True - break + actual_table = self._alias_to_table.get(tbl_name, tbl_name) + table = self.tables.get(actual_table) + if table and table.interval_type == "closed": + add_one = True + break # Generate CASE expression return self._generate_distance_case( @@ -776,32 +757,19 @@ def _resolve_nearest_reference( "Please specify reference parameter explicitly." ) - # Look up the table's schema to find the genomic column + # Look up the table to find the genomic column # Check if outer_table is an alias actual_table = self._alias_to_table.get(outer_table, outer_table) - table_schema = self.schema_info.get_table(actual_table) - - if not table_schema: - raise ValueError( - f"Outer table '{outer_table}' not found in schema. " - "Please specify reference parameter explicitly." - ) + table = self.tables.get(actual_table) - # Find the genomic column in the table schema - genomic_col_name = None - for col_info in table_schema.columns.values(): - if col_info.is_genomic: - genomic_col_name = col_info.name - break - - if not genomic_col_name: + if not table: raise ValueError( - f"No genomic column found in table '{outer_table}'. " + f"Outer table '{outer_table}' not found in tables. " "Please specify reference parameter explicitly." ) # Build column references using the outer table and genomic column - reference_sql = f"{outer_table}.{genomic_col_name}" + reference_sql = f"{outer_table}.{table.genomic_col}" return self._get_column_refs(reference_sql, None) def _resolve_target_table( @@ -828,31 +796,15 @@ def _resolve_target_table( # Try to extract as string table_name = str(target) - table_schema = self.schema_info.get_table(table_name) - if not table_schema: + table = self.tables.get(table_name) + if not table: raise ValueError( - f"Target table '{table_name}' not found in schema. " - f"Available tables: {list(self.schema_info.tables.keys())}" + f"Target table '{table_name}' not found in tables. " + "Register the table before transpiling." 
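With the generator now configured through Tables, transpilation can also be driven by hand. A minimal sketch mirroring what the transpile() entry point introduced later in this patch does (assumes sqlglot is installed)::

    from sqlglot import parse_one

    from giql.dialect import GIQLDialect
    from giql.generators import BaseGIQLGenerator
    from giql.table import Table, Tables

    tables = Tables()
    tables.register(
        "peaks", Table(chrom_col="chrom", start_col="start", end_col="end")
    )

    ast = parse_one(
        "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'",
        dialect=GIQLDialect,
    )
    # The emitted SQL references the physical columns chrom/start/end
    print(BaseGIQLGenerator(tables=tables).generate(ast))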
) - # Find genomic column in target table - genomic_col = None - for col_info in table_schema.columns.values(): - if col_info.is_genomic: - genomic_col = col_info - break - - if not genomic_col: - raise ValueError( - f"Target table '{table_name}' does not have a genomic column" - ) - - # Get physical column names - chrom_col = genomic_col.chrom_col or DEFAULT_CHROM_COL - start_col = genomic_col.start_col or DEFAULT_START_COL - end_col = genomic_col.end_col or DEFAULT_END_COL - - return table_name, (chrom_col, start_col, end_col) + # Get physical column names from table config + return table_name, (table.chrom_col, table.start_col, table.end_col) def _get_column_refs( self, @@ -887,22 +839,15 @@ def _get_column_refs( # Look up actual table name from alias table_name = self._alias_to_table.get(table_alias, self._current_table) - # Try to get custom column names from schema - if table_name and self.schema_info: - table_schema = self.schema_info.get_table(table_name) - if table_schema: - # Find the genomic column - for col_info in table_schema.columns.values(): - if col_info.is_genomic: - if col_info.chrom_col: - chrom_col = col_info.chrom_col - if col_info.start_col: - start_col = col_info.start_col - if col_info.end_col: - end_col = col_info.end_col - if col_info.strand_col: - strand_col = col_info.strand_col - break + # Try to get custom column names from table config + if table_name and self.tables: + table = self.tables.get(table_name) + if table: + chrom_col = table.chrom_col + start_col = table.start_col + end_col = table.end_col + if table.strand_col: + strand_col = table.strand_col # Format with table alias if present if table_alias: diff --git a/src/giql/generators/duckdb.py b/src/giql/generators/duckdb.py deleted file mode 100644 index bbe5a64..0000000 --- a/src/giql/generators/duckdb.py +++ /dev/null @@ -1,7 +0,0 @@ -from sqlglot.dialects.duckdb import DuckDB - -from giql.generators.base import BaseGIQLGenerator - - -class GIQLDuckDBGenerator(BaseGIQLGenerator, DuckDB.Generator): - """DuckDB-specific generator with optimizations.""" diff --git a/src/giql/generators/sqlite.py b/src/giql/generators/sqlite.py deleted file mode 100644 index 922e459..0000000 --- a/src/giql/generators/sqlite.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import Final - -from sqlglot.dialects.sqlite import SQLite - -from giql.generators.base import BaseGIQLGenerator - - -class GIQLSQLiteGenerator(BaseGIQLGenerator, SQLite.Generator): - """SQLite-specific SQL generator. - - SQLite does not support LATERAL joins, so correlated NEAREST queries - (without explicit reference) will raise an error. Use standalone mode - with an explicit reference parameter instead. - - Example:: - - -- This works (standalone mode with explicit reference): - SELECT * FROM NEAREST(genes, reference='chr1:1000-2000', k=3) - - -- This fails (correlated mode requires LATERAL): - SELECT * FROM peaks CROSS JOIN LATERAL NEAREST(genes, k=3) - """ - - SUPPORTS_LATERAL: Final = False diff --git a/src/giql/protocols.py b/src/giql/protocols.py deleted file mode 100644 index 9002051..0000000 --- a/src/giql/protocols.py +++ /dev/null @@ -1,81 +0,0 @@ -"""Protocol definitions for GIQL. - -This module defines protocols for type checking and interface compatibility. -""" - -from typing import Any -from typing import Protocol -from typing import Sequence - - -class CursorLike(Protocol): - """Protocol for DB-API 2.0 compatible cursors. 
- - Based on PEP 249: https://peps.python.org/pep-0249/ - - This protocol defines the minimal interface required for database cursors - that can be used with GIQL. All DB-API 2.0 compliant drivers (SQLite, - PostgreSQL, MySQL, DuckDB) implement this interface. - """ - - @property - def description( - self, - ) -> ( - Sequence[ - tuple[str, Any, Any | None, Any | None, Any | None, Any | None, Any | None] - ] - | None - ): - """Column descriptions. - - A sequence of 7-tuples describing each column: - (name, type_code, display_size, internal_size, precision, scale, null_ok) - - Only 'name' is required; other values may be None. - Returns None if no operation has been performed yet. - """ - ... - - @property - def rowcount(self) -> int: - """Number of rows affected by last operation. - - Returns -1 if no operation has been performed or if the count - cannot be determined. - """ - ... - - def fetchone(self) -> tuple[Any, ...] | None: - """Fetch the next row of a query result set. - - Returns a tuple representing the next row, or None when no more - rows are available. - """ - ... - - def fetchmany(self, size: int = 1) -> list[tuple[Any, ...]]: - """Fetch the next set of rows of a query result set. - - Returns a list of tuples. An empty list is returned when no more - rows are available. - - :param size: - Number of rows to fetch (default: 1) - """ - ... - - def fetchall(self) -> list[tuple[Any, ...]]: - """Fetch all remaining rows of a query result set. - - Returns a list of tuples. An empty list is returned when no rows - are available. - """ - ... - - def close(self) -> None: - """Close the cursor. - - Makes the cursor unusable for further operations. - """ - ... diff --git a/src/giql/schema.py b/src/giql/schema.py deleted file mode 100644 index 1b6e0d5..0000000 --- a/src/giql/schema.py +++ /dev/null @@ -1,83 +0,0 @@ -"""Schema information for transpilation. - -This module manages schema metadata for tables, including how genomic -ranges are physically stored in the database. -""" - -from dataclasses import dataclass -from typing import Dict -from typing import Optional - -from giql.range_parser import CoordinateSystem -from giql.range_parser import IntervalType - - -@dataclass -class ColumnInfo: - """Information about a column.""" - - name: str - type: str - is_genomic: bool = False - # For genomic columns stored as separate fields - chrom_col: Optional[str] = None - start_col: Optional[str] = None - end_col: Optional[str] = None - strand_col: Optional[str] = None - # Coordinate system configuration for genomic columns - coordinate_system: CoordinateSystem = CoordinateSystem.ZERO_BASED - interval_type: IntervalType = IntervalType.HALF_OPEN - - -@dataclass -class TableSchema: - """Schema for a table.""" - - name: str - columns: Dict[str, ColumnInfo] - - -class SchemaInfo: - """Manages schema information for transpilation. - - Tracks how genomic ranges are stored: - - Separate columns (chromosome, start_pos, end_pos) - - STRUCT types - - Custom types - """ - - def __init__(self): - self.tables: Dict[str, TableSchema] = {} - - def register_table(self, name: str, schema: TableSchema): - """Register a table schema. - - :param name: Table name - :param schema: TableSchema object - """ - self.tables[name] = schema - - def get_table(self, name: str) -> Optional[TableSchema]: - """Get table schema by name. 
- - :param name: - Table name - :return: - TableSchema object or None if not found - """ - return self.tables.get(name) - - def get_column_info(self, table: str, column: str) -> Optional[ColumnInfo]: - """Get column information. - - :param table: - Table name - :param column: - Column name - :return: - ColumnInfo object or None if not found - """ - table_schema = self.get_table(table) - if table_schema: - return table_schema.columns.get(column) - return None diff --git a/src/giql/table.py b/src/giql/table.py new file mode 100644 index 0000000..6899bf3 --- /dev/null +++ b/src/giql/table.py @@ -0,0 +1,131 @@ +"""Table configuration for GIQL transpilation. + +This module defines the Table dataclass for configuring genomic table schemas. +""" + +from dataclasses import dataclass +from typing import Literal + +from giql.constants import DEFAULT_CHROM_COL +from giql.constants import DEFAULT_END_COL +from giql.constants import DEFAULT_GENOMIC_COL +from giql.constants import DEFAULT_START_COL +from giql.constants import DEFAULT_STRAND_COL + + +@dataclass +class Table: + """Genomic table configuration for transpilation. + + This class defines how genomic intervals are stored in a database table, + mapping a pseudo-column name (genomic_col) to the physical columns that + store chromosome, start, end, and optionally strand information. + + Parameters + ---------- + genomic_col : str + The pseudo-column name used in GIQL queries to reference the genomic + interval (default: "interval"). + chrom_col : str + The physical column name storing chromosome/contig (default: "chromosome"). + start_col : str + The physical column name storing interval start position + (default: "start_pos"). + end_col : str + The physical column name storing interval end position + (default: "end_pos"). + strand_col : str | None + The physical column name storing strand information, or None if the + table has no strand column (default: "strand"). + coordinate_system : Literal["0based", "1based"] + The coordinate system used for positions (default: "0based"). + interval_type : Literal["half_open", "closed"] + The interval endpoint convention (default: "half_open"). + + Examples + -------- + Using default column names (via transpile):: + + sql = transpile(query, tables=["peaks"]) + + Using custom column names:: + + sql = transpile( + query, + tables={ + "variants": Table( + genomic_col="position", + chrom_col="chr", + start_col="pos_start", + end_col="pos_end", + strand_col=None, # No strand column + coordinate_system="1based", + interval_type="closed", + ) + } + ) + """ + + genomic_col: str = DEFAULT_GENOMIC_COL + chrom_col: str = DEFAULT_CHROM_COL + start_col: str = DEFAULT_START_COL + end_col: str = DEFAULT_END_COL + strand_col: str | None = DEFAULT_STRAND_COL + coordinate_system: Literal["0based", "1based"] = "0based" + interval_type: Literal["half_open", "closed"] = "half_open" + + def __post_init__(self) -> None: + """Validate field values after initialization.""" + if self.coordinate_system not in ("0based", "1based"): + raise ValueError( + f"coordinate_system must be '0based' or '1based', " + f"got {self.coordinate_system!r}" + ) + if self.interval_type not in ("half_open", "closed"): + raise ValueError( + f"interval_type must be 'half_open' or 'closed', " + f"got {self.interval_type!r}" + ) + + +class Tables: + """Container for Table configurations. + + Provides lookup of Table objects by name for use during transpilation. 
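Because __post_init__ validates the literal options, a misconfigured Table fails at construction time rather than mid-transpilation. A quick sketch::

    from giql.table import Table

    Table(coordinate_system="1based", interval_type="closed")   # accepted

    try:
        Table(coordinate_system="2based")
    except ValueError as err:
        print(err)   # coordinate_system must be '0based' or '1based', ...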
+ """ + + def __init__(self) -> None: + self._tables: dict[str, Table] = {} + + def register(self, name: str, table: Table) -> None: + """Register a table configuration. + + Parameters + ---------- + name : str + The table name to register. + table : Table + Table configuration to register. + """ + self._tables[name] = table + + def get(self, name: str) -> Table | None: + """Get a table configuration by name. + + Parameters + ---------- + name : str + Table name to look up. + + Returns + ------- + Table | None + Table configuration if found, None otherwise. + """ + return self._tables.get(name) + + def __contains__(self, name: str) -> bool: + return name in self._tables + + def __iter__(self): + return iter(self._tables.values()) diff --git a/src/giql/transformer.py b/src/giql/transformer.py index 2d9705f..de1e70f 100644 --- a/src/giql/transformer.py +++ b/src/giql/transformer.py @@ -12,7 +12,7 @@ from giql.constants import DEFAULT_STRAND_COL from giql.expressions import GIQLCluster from giql.expressions import GIQLMerge -from giql.schema import SchemaInfo +from giql.table import Tables class ClusterTransformer: @@ -32,13 +32,13 @@ class ClusterTransformer: FROM lag_calc """ - def __init__(self, schema_info: SchemaInfo): + def __init__(self, tables: Tables): """Initialize transformer. - :param schema_info: - Schema information for column mapping + :param tables: + Table configurations for column mapping """ - self.schema_info = schema_info + self.tables = tables def _get_table_name(self, query: exp.Select) -> str | None: """Extract table name from query's FROM clause. @@ -58,7 +58,7 @@ def _get_table_name(self, query: exp.Select) -> str | None: return None def _get_genomic_columns(self, query: exp.Select) -> tuple[str, str, str, str]: - """Get genomic column names from schema info or defaults. + """Get genomic column names from table config or defaults. :param query: Query to extract table and column info from @@ -74,20 +74,13 @@ def _get_genomic_columns(self, query: exp.Select) -> tuple[str, str, str, str]: strand_col = DEFAULT_STRAND_COL if table_name: - table_schema = self.schema_info.get_table(table_name) - if table_schema: - # Find the genomic column - for col_info in table_schema.columns.values(): - if col_info.is_genomic: - if col_info.chrom_col: - chrom_col = col_info.chrom_col - if col_info.start_col: - start_col = col_info.start_col - if col_info.end_col: - end_col = col_info.end_col - if col_info.strand_col: - strand_col = col_info.strand_col - break + table = self.tables.get(table_name) + if table: + chrom_col = table.chrom_col + start_col = table.start_col + end_col = table.end_col + if table.strand_col: + strand_col = table.strand_col return chrom_col, start_col, end_col, strand_col @@ -209,7 +202,7 @@ def _transform_for_cluster( else: stranded = False - # Get column names from schema_info or use defaults + # Get column names from table config or use defaults chrom_col, start_col, end_col, strand_col = self._get_genomic_columns(query) # Build partition clause @@ -366,14 +359,14 @@ class MergeTransformer: ORDER BY chromosome, start_pos """ - def __init__(self, schema_info: SchemaInfo): + def __init__(self, tables: Tables): """Initialize transformer. 
-        :param schema_info:
-            Schema information for column mapping
+        :param tables:
+            Table configurations for column mapping
         """
-        self.schema_info = schema_info
-        self.cluster_transformer = ClusterTransformer(schema_info)
+        self.tables = tables
+        self.cluster_transformer = ClusterTransformer(tables)
 
     def transform(self, query: exp.Expression) -> exp.Expression:
         """Transform query if it contains MERGE expressions.
@@ -468,7 +461,7 @@ def _transform_for_merge(
         distance_expr = merge_expr.args.get("distance")
         stranded_expr = merge_expr.args.get("stranded")
 
-        # Get column names from schema_info or use defaults
+        # Get column names from table config or use defaults
         (
             chrom_col,
             start_col,
diff --git a/src/giql/transpile.py b/src/giql/transpile.py
new file mode 100644
index 0000000..5271d06
--- /dev/null
+++ b/src/giql/transpile.py
@@ -0,0 +1,129 @@
+"""Transpile GIQL queries to SQL.
+
+This module provides the main entry point for transpiling GIQL queries
+to standard SQL.
+"""
+
+from sqlglot import parse_one
+
+from giql.dialect import GIQLDialect
+from giql.generators import BaseGIQLGenerator
+from giql.table import Table
+from giql.table import Tables
+from giql.transformer import ClusterTransformer
+from giql.transformer import MergeTransformer
+
+
+def _build_tables(tables: list[str] | dict[str, Table] | None) -> Tables:
+    """Build a Tables container from table specifications.
+
+    Parameters
+    ----------
+    tables : list[str] | dict[str, Table] | None
+        Table specifications. Strings use default column mappings.
+        Dict maps table names to Table configurations.
+
+    Returns
+    -------
+    Tables
+        Container with all tables registered.
+    """
+    container = Tables()
+
+    if tables is None:
+        return container
+
+    if isinstance(tables, dict):
+        for name, table in tables.items():
+            container.register(name, table)
+    else:
+        for name in tables:
+            container.register(name, Table())
+
+    return container
+
+
+def transpile(
+    giql: str,
+    tables: list[str] | dict[str, Table] | None = None,
+) -> str:
+    """Transpile a GIQL query to SQL.
+
+    Parses the GIQL syntax and converts it to portable, standards-based SQL,
+    emitting LATERAL joins (a SQL:1999 feature) where operations like NEAREST
+    require them.
+
+    Parameters
+    ----------
+    giql : str
+        The GIQL query string containing genomic extensions like
+        INTERSECTS, CONTAINS, WITHIN, CLUSTER, MERGE, or NEAREST.
+    tables : list[str] | dict[str, Table] | None
+        Table configurations. A list of strings uses default column mappings
+        (chromosome, start_pos, end_pos, strand). A dict maps table names
+        to Table objects for custom column name mappings.
+
+    Returns
+    -------
+    str
+        The transpiled SQL query.
+
+    Raises
+    ------
+    ValueError
+        If the query cannot be parsed or transpiled.
+ + Examples + -------- + Basic usage with default column mappings:: + + sql = transpile( + "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", + tables=["peaks"] + ) + + Custom table configuration:: + + sql = transpile( + "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", + tables={ + "peaks": Table( + genomic_col="interval", + chrom_col="chrom", + start_col="start", + end_col="end", + ) + } + ) + """ + # Build tables container + tables_container = _build_tables(tables) + + # Initialize transformers with table configurations + merge_transformer = MergeTransformer(tables_container) + cluster_transformer = ClusterTransformer(tables_container) + + # Initialize generator with table configurations + generator = BaseGIQLGenerator(tables=tables_container) + + # Parse GIQL query + try: + ast = parse_one(giql, dialect=GIQLDialect) + except Exception as e: + raise ValueError(f"Parse error: {e}\nQuery: {giql}") from e + + # Apply transformations (MERGE first, then CLUSTER) + try: + # MERGE transformation (which may internally use CLUSTER) + ast = merge_transformer.transform(ast) + # CLUSTER transformation for any standalone CLUSTER expressions + ast = cluster_transformer.transform(ast) + except Exception as e: + raise ValueError(f"Transformation error: {e}") from e + + # Generate SQL + try: + sql = generator.generate(ast) + except Exception as e: + raise ValueError(f"Transpilation error: {e}") from e + + return sql diff --git a/tests/conftest.py b/tests/conftest.py index 36b4f05..2ddb618 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,22 +1,18 @@ -""" -Pytest fixtures for integration tests. -""" +"""Pytest fixtures for GIQL tests.""" import pandas as pd import pytest -from giql import GIQLEngine - @pytest.fixture(scope="session") def to_df(): - """Fixture providing a helper to convert cursors to DataFrames. + """Fixture providing a helper to convert DuckDB results to DataFrames. - Returns a function that materializes cursor results for testing. + Returns a function that materializes query results for testing. Session-scoped since it's a pure function with no state. 
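Rounding out the examples above, a sketch that executes the transpiled SQL against DuckDB (assumes duckdb and pandas are installed; data values are illustrative)::

    import duckdb
    import pandas as pd

    from giql.transpile import transpile

    peaks = pd.DataFrame(
        {
            "chromosome": ["chr1", "chr1"],
            "start_pos": [1500, 5000],
            "end_pos": [1600, 5200],
            "strand": ["+", "-"],
        }
    )

    sql = transpile(
        "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'",
        tables=["peaks"],   # default column mapping matches the DataFrame
    )

    conn = duckdb.connect()
    conn.register("peaks", peaks)
    print(conn.execute(sql).fetchall())   # only the chr1:1500-1600 peak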
Usage: - result = to_df(engine.execute("SELECT ...")) + result = to_df(conn.execute("SELECT ...")) """ def _to_df(cursor): @@ -26,155 +22,3 @@ def _to_df(cursor): return pd.DataFrame() return _to_df - - -@pytest.fixture -def sample_variants_csv(tmp_path): - """Create sample variants CSV.""" - csv_content = """ - id,chromosome,start_pos,end_pos,ref,alt,quality - 1,chr1,1500,1600,A,T,30.0 - 2,chr1,10500,10600,G,C,40.0 - 3,chr1,15000,15100,T,A,25.0 - 4,chr2,500,600,C,G,35.0 - 5,chr2,5500,5600,A,T,20.0 - 6,chr1,25000,25100,G,A,35.0 - 7,chr2,15000,15100,T,C,28.0 - 8,chr3,1000,1100,A,G,32.0 - """ - csv_path = tmp_path / "variants.csv" - csv_path.write_text(csv_content.strip()) - return str(csv_path) - - -@pytest.fixture -def sample_genes_csv(tmp_path): - """Create sample genes CSV.""" - csv_content = """ - gene_id,name,chromosome,start_pos,end_pos,strand - 1,GENE1,chr1,1000,2000,+ - 2,GENE2,chr1,10000,11000,- - 3,GENE3,chr1,14000,16000,+ - 4,GENE4,chr2,400,700,+ - 5,GENE5,chr2,5000,6000,- - """ - csv_path = tmp_path / "genes.csv" - csv_path.write_text(csv_content.strip()) - return str(csv_path) - - -@pytest.fixture(params=["duckdb", "sqlite"]) -def engine_with_variants(request, sample_variants_csv): - """Create engine with loaded variants data for different dialects.""" - dialect = request.param - - engine = GIQLEngine(target_dialect=dialect, verbose=False) - engine.load_csv("variants", sample_variants_csv) - engine.register_table_schema( - "variants", - { - "id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "ref": "VARCHAR", - "alt": "VARCHAR", - "quality": "FLOAT", - }, - genomic_column="interval", - ) - - yield engine - engine.close() - - -@pytest.fixture -def duckdb_engine_with_data(sample_variants_csv, sample_genes_csv): - """DuckDB engine with both variants and genes loaded.""" - engine = GIQLEngine(target_dialect="duckdb", verbose=False) - engine.load_csv("variants", sample_variants_csv) - engine.load_csv("genes", sample_genes_csv) - - engine.register_table_schema( - "variants", - { - "id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "ref": "VARCHAR", - "alt": "VARCHAR", - "quality": "FLOAT", - }, - genomic_column="interval", - ) - - engine.register_table_schema( - "genes", - { - "gene_id": "INTEGER", - "name": "VARCHAR", - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "strand": "VARCHAR", - }, - genomic_column="interval", - ) - - yield engine - engine.close() - - -@pytest.fixture -def sample_peaks_csv(tmp_path): - """Create sample ChIP-seq peaks CSV for NEAREST testing.""" - csv_content = """ - peak_id,chromosome,start_pos,end_pos,signal - 1,chr1,5000,5200,100.5 - 2,chr1,12000,12100,85.2 - 3,chr1,20000,20500,120.8 - 4,chr2,3000,3100,95.3 - 5,chr2,8000,8200,110.7 - """ - csv_path = tmp_path / "peaks.csv" - csv_path.write_text(csv_content.strip()) - return str(csv_path) - - -@pytest.fixture -def engine_with_peaks_and_genes(request, sample_peaks_csv, sample_genes_csv): - """Create engine with peaks and genes loaded for NEAREST testing.""" - dialect = request.param if hasattr(request, "param") else "duckdb" - - engine = GIQLEngine(target_dialect=dialect, verbose=False) - engine.load_csv("peaks", sample_peaks_csv) - engine.load_csv("genes", sample_genes_csv) - - engine.register_table_schema( - "peaks", - { - "peak_id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "signal": "FLOAT", - }, - genomic_column="interval", - ) - - 
engine.register_table_schema( - "genes", - { - "gene_id": "INTEGER", - "name": "VARCHAR", - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "strand": "VARCHAR", - }, - genomic_column="interval", - ) - - yield engine - engine.close() diff --git a/tests/generators/test_base.py b/tests/generators/test_base.py index b04b3bb..195365b 100644 --- a/tests/generators/test_base.py +++ b/tests/generators/test_base.py @@ -11,125 +11,45 @@ from sqlglot import exp from sqlglot import parse_one +from giql import Table from giql.dialect import GIQLDialect from giql.expressions import GIQLNearest from giql.generators import BaseGIQLGenerator -from giql.range_parser import IntervalType -from giql.schema import ColumnInfo -from giql.schema import SchemaInfo -from giql.schema import TableSchema +from giql.table import Tables @pytest.fixture -def schema_info(): - """Basic SchemaInfo with a single table containing genomic columns.""" - schema = SchemaInfo() - table = TableSchema(name="variants", columns={}) - table.columns["id"] = ColumnInfo(name="id", type="INTEGER") - table.columns["interval"] = ColumnInfo( - name="interval", - type="VARCHAR", - is_genomic=True, - chrom_col="chromosome", - start_col="start_pos", - end_col="end_pos", - strand_col="strand", - ) - schema.tables["variants"] = table - return schema +def tables_info(): + """Basic Tables with a single table containing genomic columns.""" + tables = Tables() + tables.register("variants", Table()) + return tables @pytest.fixture -def schema_with_two_tables(): - """SchemaInfo with two tables for column-to-column tests.""" - schema = SchemaInfo() - - # Table A - table_a = TableSchema(name="features_a", columns={}) - table_a.columns["id"] = ColumnInfo(name="id", type="INTEGER") - table_a.columns["interval"] = ColumnInfo( - name="interval", - type="VARCHAR", - is_genomic=True, - chrom_col="chromosome", - start_col="start_pos", - end_col="end_pos", - strand_col="strand", - ) - schema.tables["features_a"] = table_a - - # Table B - table_b = TableSchema(name="features_b", columns={}) - table_b.columns["id"] = ColumnInfo(name="id", type="INTEGER") - table_b.columns["interval"] = ColumnInfo( - name="interval", - type="VARCHAR", - is_genomic=True, - chrom_col="chromosome", - start_col="start_pos", - end_col="end_pos", - strand_col="strand", - ) - schema.tables["features_b"] = table_b - - return schema +def tables_with_two_tables(): + """Tables with two tables for column-to-column tests.""" + tables = Tables() + tables.register("features_a", Table()) + tables.register("features_b", Table()) + return tables @pytest.fixture -def schema_with_closed_intervals(): - """SchemaInfo with CLOSED interval type for bedtools compatibility tests.""" - schema = SchemaInfo() - table = TableSchema(name="bed_features", columns={}) - table.columns["id"] = ColumnInfo(name="id", type="INTEGER") - table.columns["interval"] = ColumnInfo( - name="interval", - type="VARCHAR", - is_genomic=True, - chrom_col="chromosome", - start_col="start_pos", - end_col="end_pos", - strand_col="strand", - interval_type=IntervalType.CLOSED, - ) - schema.tables["bed_features"] = table - return schema +def tables_with_closed_intervals(): + """Tables with CLOSED interval type for bedtools compatibility tests.""" + tables = Tables() + tables.register("bed_features", Table(interval_type="closed")) + return tables @pytest.fixture -def schema_with_peaks_and_genes(): - """Schema info with peaks and genes tables for NEAREST tests.""" - schema = SchemaInfo() - - # Register peaks table - 
peaks_table = TableSchema(name="peaks", columns={}) - peaks_table.columns["peak_id"] = ColumnInfo(name="peak_id", type="INTEGER") - peaks_table.columns["interval"] = ColumnInfo( - name="interval", - type="VARCHAR", - is_genomic=True, - chrom_col="chromosome", - start_col="start_pos", - end_col="end_pos", - strand_col="strand", - ) - schema.tables["peaks"] = peaks_table - - # Register genes table - genes_table = TableSchema(name="genes", columns={}) - genes_table.columns["gene_id"] = ColumnInfo(name="gene_id", type="INTEGER") - genes_table.columns["name"] = ColumnInfo(name="name", type="VARCHAR") - genes_table.columns["interval"] = ColumnInfo( - name="interval", - type="VARCHAR", - is_genomic=True, - chrom_col="chromosome", - start_col="start_pos", - end_col="end_pos", - strand_col="strand", - ) - schema.tables["genes"] = genes_table - - return schema +def tables_with_peaks_and_genes(): + """Tables with peaks and genes tables for NEAREST tests.""" + tables = Tables() + tables.register("peaks", Table()) + tables.register("genes", Table()) + return tables class TestBaseGIQLGenerator: @@ -137,26 +57,26 @@ class TestBaseGIQLGenerator: def test_instantiation_defaults(self): """ - GIVEN no schema_info provided + GIVEN no tables provided WHEN Generator is instantiated with defaults - THEN Generator has empty SchemaInfo and SUPPORTS_LATERAL is True. + THEN Generator has empty Tables and SUPPORTS_LATERAL is True. """ generator = BaseGIQLGenerator() - assert generator.schema_info is not None - assert generator.schema_info.tables == {} + assert generator.tables is not None + assert "variants" not in generator.tables assert generator.SUPPORTS_LATERAL is True - def test_instantiation_with_schema(self, schema_info): + def test_instantiation_with_tables(self, tables_info): """ - GIVEN a valid SchemaInfo object with table definitions - WHEN Generator is instantiated with schema_info - THEN Generator stores schema_info and can resolve column references. + GIVEN a valid Tables object with table definitions + WHEN Generator is instantiated with tables + THEN Generator stores tables and can resolve column references. 
""" - generator = BaseGIQLGenerator(schema_info=schema_info) + generator = BaseGIQLGenerator(tables=tables_info) - assert generator.schema_info is schema_info - assert "variants" in generator.schema_info.tables + assert generator.tables is tables_info + assert "variants" in generator.tables def test_instantiation_kwargs_forwarding(self): """ @@ -170,7 +90,7 @@ def test_instantiation_kwargs_forwarding(self): # If kwargs forwarding works, generator should have pretty attribute assert generator.pretty is True - def test_select_sql_basic(self, schema_info): + def test_select_sql_basic(self, tables_info): """ GIVEN a SELECT expression with FROM clause containing a table WHEN select_sql is called @@ -179,13 +99,13 @@ def test_select_sql_basic(self, schema_info): sql = "SELECT * FROM variants" ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_info) + generator = BaseGIQLGenerator(tables=tables_info) output = generator.generate(ast) expected = "SELECT * FROM variants" assert output == expected - def test_select_sql_with_alias(self, schema_info): + def test_select_sql_with_alias(self, tables_info): """ GIVEN a SELECT with aliased table (e.g., FROM table AS t) WHEN select_sql is called @@ -194,7 +114,7 @@ def test_select_sql_with_alias(self, schema_info): sql = "SELECT * FROM variants AS v WHERE v.interval INTERSECTS 'chr1:1000-2000'" ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_info) + generator = BaseGIQLGenerator(tables=tables_info) output = generator.generate(ast) expected = ( @@ -204,7 +124,7 @@ def test_select_sql_with_alias(self, schema_info): ) assert output == expected - def test_select_sql_with_joins(self, schema_with_two_tables): + def test_select_sql_with_joins(self, tables_with_two_tables): """ GIVEN a SELECT with JOINs WHEN select_sql is called @@ -213,7 +133,7 @@ def test_select_sql_with_joins(self, schema_with_two_tables): sql = "SELECT * FROM features_a AS a JOIN features_b AS b ON a.id = b.id" ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_two_tables) + generator = BaseGIQLGenerator(tables=tables_with_two_tables) output = generator.generate(ast) expected = "SELECT * FROM features_a AS a JOIN features_b AS b ON a.id = b.id" @@ -237,7 +157,7 @@ def test_intersects_sql_with_literal(self): ) assert output == expected - def test_intersects_sql_column_join(self, schema_with_two_tables): + def test_intersects_sql_column_join(self, tables_with_two_tables): """ GIVEN an Intersects expression with column-to-column (a.interval INTERSECTS b.interval) @@ -250,7 +170,7 @@ def test_intersects_sql_column_join(self, schema_with_two_tables): ) ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_two_tables) + generator = BaseGIQLGenerator(tables=tables_with_two_tables) output = generator.generate(ast) expected = ( @@ -320,7 +240,7 @@ def test_contains_sql_range_query(self): ) assert output == expected - def test_contains_sql_column_join(self, schema_with_two_tables): + def test_contains_sql_column_join(self, tables_with_two_tables): """ GIVEN a Contains expression with column-to-column join WHEN contains_sql is called @@ -332,7 +252,7 @@ def test_contains_sql_column_join(self, schema_with_two_tables): ) ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_two_tables) + generator = BaseGIQLGenerator(tables=tables_with_two_tables) output = generator.generate(ast) expected = 
( @@ -386,7 +306,7 @@ def test_within_sql_with_literal(self): ) assert output == expected - def test_within_sql_column_join(self, schema_with_two_tables): + def test_within_sql_column_join(self, tables_with_two_tables): """ GIVEN a Within expression with column-to-column join WHEN within_sql is called @@ -398,7 +318,7 @@ def test_within_sql_column_join(self, schema_with_two_tables): ) ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_two_tables) + generator = BaseGIQLGenerator(tables=tables_with_two_tables) output = generator.generate(ast) expected = ( @@ -452,7 +372,7 @@ def test_spatialsetpredicate_sql_all(self): ) assert output == expected - def test_giqlnearest_sql_standalone(self, schema_with_peaks_and_genes): + def test_giqlnearest_sql_standalone(self, tables_with_peaks_and_genes): """ GIVEN a GIQLNearest in standalone mode with literal reference WHEN giqlnearest_sql is called @@ -461,7 +381,7 @@ def test_giqlnearest_sql_standalone(self, schema_with_peaks_and_genes): sql = "SELECT * FROM NEAREST(genes, reference='chr1:1000-2000', k=3)" ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes) + generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes) output = generator.generate(ast) expected = ( @@ -485,7 +405,7 @@ def test_giqlnearest_sql_standalone(self, schema_with_peaks_and_genes): ) assert output == expected - def test_giqlnearest_sql_correlated(self, schema_with_peaks_and_genes): + def test_giqlnearest_sql_correlated(self, tables_with_peaks_and_genes): """ GIVEN a GIQLNearest in correlated mode (LATERAL join context) WHEN giqlnearest_sql is called @@ -497,7 +417,7 @@ def test_giqlnearest_sql_correlated(self, schema_with_peaks_and_genes): ) ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes) + generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes) output = generator.generate(ast) expected = ( @@ -523,7 +443,7 @@ def test_giqlnearest_sql_correlated(self, schema_with_peaks_and_genes): ) assert output == expected - def test_giqlnearest_sql_with_max_distance(self, schema_with_peaks_and_genes): + def test_giqlnearest_sql_with_max_distance(self, tables_with_peaks_and_genes): """ GIVEN a GIQLNearest with max_distance parameter WHEN giqlnearest_sql is called @@ -536,7 +456,7 @@ def test_giqlnearest_sql_with_max_distance(self, schema_with_peaks_and_genes): ) ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes) + generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes) output = generator.generate(ast) expected = ( @@ -569,7 +489,7 @@ def test_giqlnearest_sql_with_max_distance(self, schema_with_peaks_and_genes): ) assert output == expected - def test_giqlnearest_sql_stranded(self, schema_with_peaks_and_genes): + def test_giqlnearest_sql_stranded(self, tables_with_peaks_and_genes): """ GIVEN a GIQLNearest with stranded=True WHEN giqlnearest_sql is called @@ -582,7 +502,7 @@ def test_giqlnearest_sql_stranded(self, schema_with_peaks_and_genes): ) ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes) + generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes) output = generator.generate(ast) expected = ( @@ -623,7 +543,7 @@ def test_giqlnearest_sql_stranded(self, schema_with_peaks_and_genes): ) assert output == expected - def test_giqlnearest_sql_signed(self, 
schema_with_peaks_and_genes): + def test_giqlnearest_sql_signed(self, tables_with_peaks_and_genes): """ GIVEN a GIQLNearest with signed=True WHEN giqlnearest_sql is called @@ -636,7 +556,7 @@ def test_giqlnearest_sql_signed(self, schema_with_peaks_and_genes): ) ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes) + generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes) output = generator.generate(ast) expected = ( @@ -662,7 +582,7 @@ def test_giqlnearest_sql_signed(self, schema_with_peaks_and_genes): ) assert output == expected - def test_giqlnearest_sql_no_lateral_support(self, schema_with_peaks_and_genes): + def test_giqlnearest_sql_no_lateral_support(self, tables_with_peaks_and_genes): """ GIVEN a GIQLNearest on a generator with SUPPORTS_LATERAL=False WHEN giqlnearest_sql is called in correlated mode @@ -677,7 +597,7 @@ class NoLateralGenerator(BaseGIQLGenerator): sql = "SELECT * FROM peaks CROSS JOIN LATERAL NEAREST(genes, k=3)" ast = parse_one(sql, dialect=GIQLDialect) - generator = NoLateralGenerator(schema_info=schema_with_peaks_and_genes) + generator = NoLateralGenerator(tables=tables_with_peaks_and_genes) with pytest.raises(ValueError, match="LATERAL"): generator.generate(ast) @@ -688,7 +608,7 @@ class NoLateralGenerator(BaseGIQLGenerator): max_distance=st.integers(min_value=1, max_value=10_000_000), ) def test_giqlnearest_sql_parameter_handling_property( - self, schema_with_peaks_and_genes, k, max_distance + self, tables_with_peaks_and_genes, k, max_distance ): """ GIVEN any valid k value (positive integer) and max_distance @@ -701,7 +621,7 @@ def test_giqlnearest_sql_parameter_handling_property( ) ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes) + generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes) output = generator.generate(ast) # k should appear in LIMIT @@ -709,7 +629,7 @@ def test_giqlnearest_sql_parameter_handling_property( # max_distance should appear in WHERE assert str(max_distance) in output - def test_giqldistance_sql_basic(self, schema_with_two_tables): + def test_giqldistance_sql_basic(self, tables_with_two_tables): """ GIVEN a GIQLDistance with two column references WHEN giqldistance_sql is called @@ -721,7 +641,7 @@ def test_giqldistance_sql_basic(self, schema_with_two_tables): ) ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_two_tables) + generator = BaseGIQLGenerator(tables=tables_with_two_tables) output = generator.generate(ast) expected = ( @@ -734,7 +654,7 @@ def test_giqldistance_sql_basic(self, schema_with_two_tables): ) assert output == expected - def test_giqldistance_sql_stranded(self, schema_with_two_tables): + def test_giqldistance_sql_stranded(self, tables_with_two_tables): """ GIVEN a GIQLDistance with stranded=True WHEN giqldistance_sql is called @@ -746,7 +666,7 @@ def test_giqldistance_sql_stranded(self, schema_with_two_tables): ) ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_two_tables) + generator = BaseGIQLGenerator(tables=tables_with_two_tables) output = generator.generate(ast) expected = ( @@ -767,7 +687,7 @@ def test_giqldistance_sql_stranded(self, schema_with_two_tables): ) assert output == expected - def test_giqldistance_sql_signed(self, schema_with_two_tables): + def test_giqldistance_sql_signed(self, tables_with_two_tables): """ GIVEN a GIQLDistance with signed=True WHEN 
giqldistance_sql is called
@@ -779,7 +699,7 @@ def test_giqldistance_sql_signed(self, schema_with_two_tables):
         )
         ast = parse_one(sql, dialect=GIQLDialect)

-        generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+        generator = BaseGIQLGenerator(tables=tables_with_two_tables)
         output = generator.generate(ast)

         expected = (
@@ -792,7 +712,7 @@ def test_giqldistance_sql_signed(self, schema_with_two_tables):
         )
         assert output == expected

-    def test_giqldistance_sql_stranded_and_signed(self, schema_with_two_tables):
+    def test_giqldistance_sql_stranded_and_signed(self, tables_with_two_tables):
         """
         GIVEN a GIQLDistance with both stranded and signed=True
         WHEN giqldistance_sql is called
@@ -805,7 +725,7 @@ def test_giqldistance_sql_stranded_and_signed(self, schema_with_two_tables):
         )
         ast = parse_one(sql, dialect=GIQLDialect)

-        generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+        generator = BaseGIQLGenerator(tables=tables_with_two_tables)
         output = generator.generate(ast)

         expected = (
@@ -826,27 +746,16 @@ def test_giqldistance_sql_stranded_and_signed(self, schema_with_two_tables):
         )
         assert output == expected

-    def test_giqldistance_with_closed_intervals(self, schema_with_closed_intervals):
+    def test_giqldistance_with_closed_intervals(self, tables_with_closed_intervals):
         """
         GIVEN intervals from table with CLOSED interval type
         WHEN Distance calculation is performed
         THEN Distance includes +1 adjustment (bedtools compatibility).
         """
-        # Create a second table with closed intervals for distance calculation
-        schema = schema_with_closed_intervals
-        table_b = TableSchema(name="bed_features_b", columns={})
-        table_b.columns["id"] = ColumnInfo(name="id", type="INTEGER")
-        table_b.columns["interval"] = ColumnInfo(
-            name="interval",
-            type="VARCHAR",
-            is_genomic=True,
-            chrom_col="chromosome",
-            start_col="start_pos",
-            end_col="end_pos",
-            strand_col="strand",
-            interval_type=IntervalType.CLOSED,
+        # Add a second table with closed intervals for distance calculation
+        tables_with_closed_intervals.register(
+            "bed_features_b", Table(interval_type="closed")
         )
-        schema.tables["bed_features_b"] = table_b

         sql = (
             "SELECT DISTANCE(a.interval, b.interval) as dist "
@@ -854,7 +763,7 @@ def test_giqldistance_with_closed_intervals(self, schema_with_closed_intervals):
         )
         ast = parse_one(sql, dialect=GIQLDialect)

-        generator = BaseGIQLGenerator(schema_info=schema)
+        generator = BaseGIQLGenerator(tables=tables_with_closed_intervals)
         output = generator.generate(ast)

         expected = (
@@ -901,7 +810,7 @@ def test_error_handling_unknown_operation(self):
         with pytest.raises(ValueError):
             generator.generate(ast)

-    def test_select_sql_join_without_alias(self, schema_with_two_tables):
+    def test_select_sql_join_without_alias(self, tables_with_two_tables):
         """
         GIVEN a SELECT with JOIN where joined table has no alias
         WHEN select_sql is called
@@ -910,7 +819,7 @@ def test_select_sql_join_without_alias(self, schema_with_two_tables):
         sql = "SELECT * FROM features_a JOIN features_b ON features_a.id = features_b.id"
         ast = parse_one(sql, dialect=GIQLDialect)

-        generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+        generator = BaseGIQLGenerator(tables=tables_with_two_tables)
         output = generator.generate(ast)

         expected = (
@@ -919,7 +828,7 @@ def test_select_sql_join_without_alias(self, schema_with_two_tables):
         assert output == expected

     def test_giqlnearest_sql_stranded_literal_with_strand(
-        self, schema_with_peaks_and_genes
+        self, tables_with_peaks_and_genes
     ):
         """
         GIVEN a GIQLNearest with stranded=True and literal reference containing strand
@@ -932,7 +841,7 @@ def test_giqlnearest_sql_stranded_literal_with_strand(
         )
         ast = parse_one(sql, dialect=GIQLDialect)

-        generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+        generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
         output = generator.generate(ast)

         # Should contain strand literal '+' and strand filtering
@@ -940,7 +849,7 @@ def test_giqlnearest_sql_stranded_literal_with_strand(
         assert 'genes."strand"' in output

     def test_giqlnearest_sql_stranded_implicit_reference(
-        self, schema_with_peaks_and_genes
+        self, tables_with_peaks_and_genes
     ):
         """
         GIVEN a GIQLNearest in correlated mode with implicit reference and stranded=True
@@ -950,7 +859,7 @@ def test_giqlnearest_sql_stranded_implicit_reference(
         sql = "SELECT * FROM peaks CROSS JOIN LATERAL NEAREST(genes, k=3, stranded=true)"
         ast = parse_one(sql, dialect=GIQLDialect)

-        generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+        generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
         output = generator.generate(ast)

         # Should have strand columns from both tables
@@ -963,31 +872,19 @@ def test_giqlnearest_sql_closed_intervals(self):
         WHEN giqlnearest_sql is called
         THEN Distance calculation includes +1 adjustment for bedtools compatibility.
         """
-        schema = SchemaInfo()
-        genes_closed = TableSchema(name="genes_closed", columns={})
-        genes_closed.columns["gene_id"] = ColumnInfo(name="gene_id", type="INTEGER")
-        genes_closed.columns["interval"] = ColumnInfo(
-            name="interval",
-            type="VARCHAR",
-            is_genomic=True,
-            chrom_col="chromosome",
-            start_col="start_pos",
-            end_col="end_pos",
-            strand_col="strand",
-            interval_type=IntervalType.CLOSED,
-        )
-        schema.tables["genes_closed"] = genes_closed
+        tables = Tables()
+        tables.register("genes_closed", Table(interval_type="closed"))

         sql = "SELECT * FROM NEAREST(genes_closed, reference='chr1:1000-2000', k=3)"
         ast = parse_one(sql, dialect=GIQLDialect)

-        generator = BaseGIQLGenerator(schema_info=schema)
+        generator = BaseGIQLGenerator(tables=tables)
         output = generator.generate(ast)

         # Should have +1 adjustment for closed intervals
         assert "+ 1)" in output

-    def test_giqldistance_sql_literal_first_arg_error(self, schema_with_two_tables):
+    def test_giqldistance_sql_literal_first_arg_error(self, tables_with_two_tables):
         """
         GIVEN a GIQLDistance with literal range as first argument
         WHEN giqldistance_sql is called
@@ -996,12 +893,12 @@ def test_giqldistance_sql_literal_first_arg_error(self, schema_with_two_tables):
         sql = "SELECT DISTANCE('chr1:1000-2000', b.interval) as dist FROM features_b b"
         ast = parse_one(sql, dialect=GIQLDialect)

-        generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+        generator = BaseGIQLGenerator(tables=tables_with_two_tables)

         with pytest.raises(ValueError, match="Literal range as first argument"):
             generator.generate(ast)

-    def test_giqldistance_sql_literal_second_arg_error(self, schema_with_two_tables):
+    def test_giqldistance_sql_literal_second_arg_error(self, tables_with_two_tables):
         """
         GIVEN a GIQLDistance with literal range as second argument
         WHEN giqldistance_sql is called
@@ -1010,13 +907,13 @@ def test_giqldistance_sql_literal_second_arg_error(self, schema_with_two_tables)
         sql = "SELECT DISTANCE(a.interval, 'chr1:1000-2000') as dist FROM features_a a"
         ast = parse_one(sql, dialect=GIQLDialect)

-        generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+        generator = BaseGIQLGenerator(tables=tables_with_two_tables)

         with pytest.raises(ValueError, match="Literal range as second argument"):
             generator.generate(ast)

     def test_giqlnearest_sql_missing_outer_table_error(
-        self, schema_with_peaks_and_genes
+        self, tables_with_peaks_and_genes
     ):
         """
         GIVEN a GIQLNearest in correlated mode without reference where outer table
@@ -1030,82 +927,33 @@ def test_giqlnearest_sql_missing_outer_table_error(
             k=exp.Literal.number(3),
         )

-        generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+        generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)

         with pytest.raises(ValueError, match="Could not find outer table"):
             generator.giqlnearest_sql(nearest)

-    def test_giqlnearest_sql_outer_table_not_in_schema(self):
+    def test_giqlnearest_sql_outer_table_not_in_tables(self):
         """
-        GIVEN a GIQLNearest in correlated mode where outer table is not in schema
+        GIVEN a GIQLNearest in correlated mode where outer table is not registered
         WHEN giqlnearest_sql is called
         THEN ValueError is raised listing the issue.
         """
-        schema = SchemaInfo()
-        genes_table = TableSchema(name="genes", columns={})
-        genes_table.columns["gene_id"] = ColumnInfo(name="gene_id", type="INTEGER")
-        genes_table.columns["interval"] = ColumnInfo(
-            name="interval",
-            type="VARCHAR",
-            is_genomic=True,
-            chrom_col="chromosome",
-            start_col="start_pos",
-            end_col="end_pos",
-            strand_col="strand",
-        )
-        schema.tables["genes"] = genes_table
+        tables = Tables()
+        tables.register("genes", Table())

         nearest = GIQLNearest(
             this=exp.Table(this=exp.Identifier(this="genes")),
             k=exp.Literal.number(3),
         )

-        generator = BaseGIQLGenerator(schema_info=schema)
+        generator = BaseGIQLGenerator(tables=tables)
         generator._alias_to_table = {"unknown_table": "unknown_table"}
         generator._find_outer_table_in_lateral_join = lambda x: "unknown_table"

-        with pytest.raises(ValueError, match="not found in schema"):
+        with pytest.raises(ValueError, match="not found in tables"):
             generator.giqlnearest_sql(nearest)

-    def test_giqlnearest_sql_no_genomic_column_in_outer(self):
-        """
-        GIVEN a GIQLNearest in correlated mode where outer table has no genomic column
-        WHEN giqlnearest_sql is called
-        THEN ValueError is raised about missing genomic column.
-        """
-        schema = SchemaInfo()
-
-        outer_table = TableSchema(name="outer_table", columns={})
-        outer_table.columns["id"] = ColumnInfo(name="id", type="INTEGER")
-        outer_table.columns["name"] = ColumnInfo(name="name", type="VARCHAR")
-        schema.tables["outer_table"] = outer_table
-
-        genes_table = TableSchema(name="genes", columns={})
-        genes_table.columns["gene_id"] = ColumnInfo(name="gene_id", type="INTEGER")
-        genes_table.columns["interval"] = ColumnInfo(
-            name="interval",
-            type="VARCHAR",
-            is_genomic=True,
-            chrom_col="chromosome",
-            start_col="start_pos",
-            end_col="end_pos",
-            strand_col="strand",
-        )
-        schema.tables["genes"] = genes_table
-
-        nearest = GIQLNearest(
-            this=exp.Table(this=exp.Identifier(this="genes")),
-            k=exp.Literal.number(3),
-        )
-
-        generator = BaseGIQLGenerator(schema_info=schema)
-        generator._alias_to_table = {"outer_table": "outer_table"}
-        generator._find_outer_table_in_lateral_join = lambda x: "outer_table"
-
-        with pytest.raises(ValueError, match="No genomic column found"):
-            generator.giqlnearest_sql(nearest)
-
-    def test_giqlnearest_sql_invalid_reference_range(self, schema_with_peaks_and_genes):
+    def test_giqlnearest_sql_invalid_reference_range(self, tables_with_peaks_and_genes):
         """
         GIVEN a GIQLNearest with invalid/unparseable reference range string
         WHEN giqlnearest_sql is called
@@ -1114,58 +962,38 @@ def test_giqlnearest_sql_invalid_reference_range(self, schema_with_peaks_and_gen
         sql = "SELECT * FROM NEAREST(genes, reference='invalid_range', k=3)"
         ast = parse_one(sql, dialect=GIQLDialect)

-        generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+        generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)

         with pytest.raises(ValueError, match="Could not parse reference genomic range"):
             generator.generate(ast)

-    def test_giqlnearest_sql_no_schema_error(self):
+    def test_giqlnearest_sql_no_tables_error(self):
         """
-        GIVEN a GIQLNearest without schema_info provided (empty schema)
+        GIVEN a GIQLNearest without tables registered
         WHEN giqlnearest_sql is called
         THEN ValueError is raised because target table cannot be resolved.
         """
         sql = "SELECT * FROM NEAREST(genes, reference='chr1:1000-2000', k=3)"
         ast = parse_one(sql, dialect=GIQLDialect)

-        # Generator with empty schema - table won't be found
+        # Generator with empty tables - table won't be found
         generator = BaseGIQLGenerator()

-        with pytest.raises(ValueError, match="not found in schema"):
+        with pytest.raises(ValueError, match="not found in tables"):
             generator.generate(ast)

-    def test_giqlnearest_sql_target_not_in_schema(self, schema_with_peaks_and_genes):
+    def test_giqlnearest_sql_target_not_in_tables(self, tables_with_peaks_and_genes):
         """
-        GIVEN a GIQLNearest with target table not found in schema
+        GIVEN a GIQLNearest with target table not registered
         WHEN giqlnearest_sql is called
         THEN ValueError is raised listing available tables.
         """
         sql = "SELECT * FROM NEAREST(unknown_table, reference='chr1:1000-2000', k=3)"
         ast = parse_one(sql, dialect=GIQLDialect)

-        generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
-
-        with pytest.raises(ValueError, match="not found in schema"):
-            generator.generate(ast)
-
-    def test_giqlnearest_sql_target_no_genomic_column(self):
-        """
-        GIVEN a GIQLNearest with target table having no genomic column defined
-        WHEN giqlnearest_sql is called
-        THEN ValueError is raised about missing genomic column.
-        """
-        schema = SchemaInfo()
-        no_genomic_table = TableSchema(name="no_genomic", columns={})
-        no_genomic_table.columns["id"] = ColumnInfo(name="id", type="INTEGER")
-        no_genomic_table.columns["name"] = ColumnInfo(name="name", type="VARCHAR")
-        schema.tables["no_genomic"] = no_genomic_table
-
-        sql = "SELECT * FROM NEAREST(no_genomic, reference='chr1:1000-2000', k=3)"
-        ast = parse_one(sql, dialect=GIQLDialect)
-
-        generator = BaseGIQLGenerator(schema_info=schema)
+        generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)

-        with pytest.raises(ValueError, match="does not have a genomic column"):
+        with pytest.raises(ValueError, match="not found in tables"):
             generator.generate(ast)

     def test_intersects_sql_unqualified_column(self):
@@ -1187,7 +1015,7 @@ def test_intersects_sql_unqualified_column(self):
         assert output == expected

     def test_giqlnearest_sql_stranded_unqualified_reference(
-        self, schema_with_peaks_and_genes
+        self, tables_with_peaks_and_genes
     ):
         """
         GIVEN a GIQLNearest with stranded=True and unqualified column reference
@@ -1204,7 +1032,7 @@ def test_giqlnearest_sql_stranded_unqualified_reference(
             stranded=exp.Boolean(this=True),
         )

-        generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+        generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
         output = generator.giqlnearest_sql(nearest)

         # Should produce valid output with unqualified strand column
@@ -1212,7 +1040,7 @@ def test_giqlnearest_sql_stranded_unqualified_reference(
         # The strand column should be unqualified (no table prefix)
         assert '"strand"' in output

-    def test_giqlnearest_sql_identifier_target(self, schema_with_peaks_and_genes):
+    def test_giqlnearest_sql_identifier_target(self, tables_with_peaks_and_genes):
         """
         GIVEN a GIQLNearest where target is an Identifier (not Table or Column)
         WHEN giqlnearest_sql is called
@@ -1227,7 +1055,7 @@ def test_giqlnearest_sql_identifier_target(self, schema_with_peaks_and_genes):
             k=exp.Literal.number(3),
         )

-        generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes)
+        generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes)
         output = generator.giqlnearest_sql(nearest)

         # Should succeed and produce valid SQL
@@ -1239,7 +1067,7 @@
     )
     @settings(suppress_health_check=[HealthCheck.function_scoped_fixture])
     def test_giqldistance_stranded_param_truthy_values_property(
-        self, schema_with_two_tables, bool_repr
+        self, tables_with_two_tables, bool_repr
     ):
         """
         GIVEN a GIQLDistance with stranded parameter in various truthy representations
@@ -1252,7 +1080,7 @@
         )
         ast = parse_one(sql, dialect=GIQLDialect)

-        generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+        generator = BaseGIQLGenerator(tables=tables_with_two_tables)
         output = generator.generate(ast)

         # Should include strand handling (NULL checks for strand columns)
@@ -1264,7 +1092,7 @@
     )
     @settings(suppress_health_check=[HealthCheck.function_scoped_fixture])
     def test_giqldistance_stranded_param_falsy_values_property(
-        self, schema_with_two_tables, bool_repr
+        self, tables_with_two_tables, bool_repr
     ):
         """
         GIVEN a GIQLDistance with stranded parameter in various falsy representations
@@ -1277,7 +1105,7 @@
         )
         ast = parse_one(sql, dialect=GIQLDialect)

-        generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+        generator = BaseGIQLGenerator(tables=tables_with_two_tables)
         output = generator.generate(ast)

         # Should NOT include strand NULL checks (basic distance)
@@ -1288,7 +1116,7 @@
     )
     @settings(suppress_health_check=[HealthCheck.function_scoped_fixture])
     def test_giqldistance_signed_param_truthy_values_property(
-        self, schema_with_two_tables, bool_repr
+        self, tables_with_two_tables, bool_repr
     ):
         """
         GIVEN a GIQLDistance with signed parameter in various truthy representations
@@ -1301,7 +1129,7 @@
         )
         ast = parse_one(sql, dialect=GIQLDialect)

-        generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+        generator = BaseGIQLGenerator(tables=tables_with_two_tables)
         output = generator.generate(ast)

         # Signed distance has negative sign for upstream intervals
@@ -1312,7 +1140,7 @@
     )
     @settings(suppress_health_check=[HealthCheck.function_scoped_fixture])
     def test_giqldistance_signed_param_falsy_values_property(
-        self, schema_with_two_tables, bool_repr
+        self, tables_with_two_tables, bool_repr
     ):
         """
         GIVEN a GIQLDistance with signed parameter in various falsy representations
@@ -1325,7 +1153,7 @@
         )
         ast = parse_one(sql, dialect=GIQLDialect)

-        generator = BaseGIQLGenerator(schema_info=schema_with_two_tables)
+        generator = BaseGIQLGenerator(tables=tables_with_two_tables)
         output = generator.generate(ast)

         # Unsigned distance has no negative sign (both ELSE branches are positive)
diff --git a/tests/integration/bedtools/__init__.py b/tests/integration/bedtools/__init__.py
deleted file mode 100644
index 0a2c30b..0000000
--- a/tests/integration/bedtools/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-"""Bedtools integration tests for GIQL.
-
-This package contains integration tests that validate GIQL query results
-against bedtools command outputs using simulated genomic datasets.
-"""
diff --git a/tests/integration/bedtools/conftest.py b/tests/integration/bedtools/conftest.py
deleted file mode 100644
index af9387f..0000000
--- a/tests/integration/bedtools/conftest.py
+++ /dev/null
@@ -1,46 +0,0 @@
-"""Pytest fixtures for bedtools integration tests.
-
-This module provides shared fixtures for:
-- DuckDB connections
-- Interval generators
-"""
-
-import pytest
-
-from .utils.data_models import IntervalGeneratorConfig
-from .utils.interval_generator import IntervalGenerator
-
-
-@pytest.fixture(scope="function")
-def duckdb_connection():
-    """Provide clean DuckDB connection for each test.
-
-    Yields:
-        DuckDB connection to in-memory database
-
-    Note:
-        Each test gets a fresh database with no shared state.
-        Connection is automatically closed after test.
-    """
-    try:
-        import duckdb
-    except ImportError:
-        pytest.skip("DuckDB not installed. Install with: pip install duckdb")
-
-    conn = duckdb.connect(":memory:")
-    yield conn
-    conn.close()
-
-
-@pytest.fixture(scope="function")
-def interval_generator():
-    """Provide configured interval generator.
-
-    Returns:
-        IntervalGenerator with deterministic seed
-
-    Note:
-        Uses seed=42 for reproducible test data.
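All of the hunks above exercise one pipeline: parse GIQL with the custom dialect, then hand the AST to a generator keyed by a table registry. A minimal sketch of that flow; the giql import paths and the module that exports Tables/Table are assumptions rather than facts from this patch, while parse_one is sqlglot's:

from sqlglot import parse_one

from giql.dialect import GIQLDialect  # import path assumed from the module layout
from giql.generators.base import BaseGIQLGenerator  # import path assumed
from giql.schema import Table, Tables  # home of Table/Tables assumed

# Register the tables the query mentions, then transpile GIQL to plain SQL.
tables = Tables()
tables.register("features_a", Table())
tables.register("features_b", Table())

ast = parse_one(
    "SELECT a.* FROM features_a a, features_b b "
    "WHERE a.interval INTERSECTS b.interval",
    dialect=GIQLDialect,
)
print(BaseGIQLGenerator(tables=tables).generate(ast))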
- """ - config = IntervalGeneratorConfig(seed=42) - return IntervalGenerator(config) diff --git a/tests/integration/bedtools/test_intersect.py b/tests/integration/bedtools/test_intersect.py deleted file mode 100644 index cfc4394..0000000 --- a/tests/integration/bedtools/test_intersect.py +++ /dev/null @@ -1,313 +0,0 @@ -"""Integration tests for GIQL INTERSECTS operator. - -These tests validate that GIQL's INTERSECTS operator produces identical -results to bedtools intersect command. -""" - -from giql import GIQLEngine - -from .utils.bed_export import load_intervals -from .utils.bedtools_wrapper import intersect -from .utils.comparison import compare_results -from .utils.data_models import GenomicInterval - - -def _setup_giql_engine(duckdb_connection): - """Helper to set up GIQL engine with table schemas.""" - engine = GIQLEngine(target_dialect="duckdb", verbose=False) - engine.conn = duckdb_connection - - schema = { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - "score": "BIGINT", - "strand": "VARCHAR", - } - - engine.register_table_schema("intervals_a", schema, genomic_column="interval") - engine.register_table_schema("intervals_b", schema, genomic_column="interval") - - return engine - - -def test_intersect_basic_overlap(duckdb_connection, interval_generator): - """Test INTERSECTS predicate finds overlapping intervals. - - Given: - Two tables with genomic intervals where some intervals overlap - When: - A GIQL query uses INTERSECTS predicate in WHERE clause - Then: - Results match bedtools intersect output exactly - """ - # Arrange: Create overlapping intervals - intervals_a = [ - GenomicInterval("chr1", 100, 200, "a1", 100, "+"), - GenomicInterval("chr1", 150, 250, "a2", 200, "+"), - GenomicInterval("chr1", 300, 400, "a3", 150, "-"), - ] - intervals_b = [ - GenomicInterval("chr1", 180, 220, "b1", 100, "+"), - GenomicInterval("chr1", 350, 450, "b2", 200, "-"), - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools - bedtools_result = intersect( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute GIQL query - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT DISTINCT a.* - FROM intervals_a a, intervals_b b - WHERE a.interval INTERSECTS b.interval - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_intersect_partial_overlap(duckdb_connection, interval_generator): - """Test INTERSECTS with partially overlapping intervals. 
- - Given: - Intervals with partial overlaps - When: - INTERSECTS query is executed - Then: - Results match bedtools partial overlap behavior - """ - # Arrange - intervals_a = [ - GenomicInterval("chr1", 100, 250, "a1", 100, "+"), - GenomicInterval("chr1", 300, 400, "a2", 200, "+"), - ] - intervals_b = [ - GenomicInterval("chr1", 200, 350, "b1", 150, "+"), - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools - bedtools_result = intersect( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute GIQL query - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT DISTINCT a.* - FROM intervals_a a, intervals_b b - WHERE a.interval INTERSECTS b.interval - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_intersect_no_overlap(duckdb_connection, interval_generator): - """Test INTERSECTS with non-overlapping intervals. - - Given: - Two sets of intervals with no overlaps - When: - INTERSECTS query is executed - Then: - No results returned (matches bedtools empty output) - """ - # Arrange - intervals_a = [ - GenomicInterval("chr1", 100, 200, "a1", 100, "+"), - ] - intervals_b = [ - GenomicInterval("chr1", 300, 400, "b1", 150, "+"), - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools - bedtools_result = intersect( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute GIQL query - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT DISTINCT a.* - FROM intervals_a a, intervals_b b - WHERE a.interval INTERSECTS b.interval - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_intersect_adjacent_intervals(duckdb_connection, interval_generator): - """Test INTERSECTS with adjacent (touching) intervals. 
- - Given: - Intervals that touch but don't overlap - When: - INTERSECTS query is executed - Then: - No results returned (adjacent != overlapping) - """ - # Arrange: Adjacent intervals (end of a1 == start of b1) - intervals_a = [ - GenomicInterval("chr1", 100, 200, "a1", 100, "+"), - ] - intervals_b = [ - GenomicInterval("chr1", 200, 300, "b1", 150, "+"), - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools - bedtools_result = intersect( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute GIQL query - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT DISTINCT a.* - FROM intervals_a a, intervals_b b - WHERE a.interval INTERSECTS b.interval - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_intersect_multiple_chromosomes(duckdb_connection, interval_generator): - """Test INTERSECTS across multiple chromosomes. - - Given: - Intervals on different chromosomes - When: - INTERSECTS query is executed - Then: - Only same-chromosome overlaps are returned - """ - # Arrange - intervals_a = [ - GenomicInterval("chr1", 100, 200, "a1", 100, "+"), - GenomicInterval("chr2", 150, 250, "a2", 200, "+"), - ] - intervals_b = [ - GenomicInterval("chr1", 150, 250, "b1", 150, "+"), - GenomicInterval("chr2", 200, 300, "b2", 100, "+"), - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools - bedtools_result = intersect( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute GIQL query - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT DISTINCT a.* - FROM intervals_a a, intervals_b b - WHERE a.interval INTERSECTS b.interval - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) diff --git a/tests/integration/bedtools/test_merge.py b/tests/integration/bedtools/test_merge.py deleted file mode 100644 index 51fea31..0000000 --- a/tests/integration/bedtools/test_merge.py +++ /dev/null @@ -1,224 +0,0 @@ -"""Integration tests for GIQL MERGE operator. 
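The merge cases below (adjacent, overlapping, separated, per-chromosome) pin down sweep-line semantics in which touching intervals coalesce. A compact sketch of those semantics, not the MERGE UDF itself:

def merge_intervals(intervals):
    """Coalesce (chrom, start, end) intervals per chromosome.

    Overlapping and bookended (touching) intervals merge, while gaps split,
    mirroring the adjacent/overlapping/separated expectations in these tests.
    """
    merged = []
    for chrom, start, end in sorted(intervals):
        if merged and merged[-1][0] == chrom and start <= merged[-1][2]:
            prev = merged[-1]
            merged[-1] = (chrom, prev[1], max(prev[2], end))
        else:
            merged.append((chrom, start, end))
    return merged

assert merge_intervals(
    [("chr1", 100, 200), ("chr1", 200, 300), ("chr1", 500, 600)]
) == [("chr1", 100, 300), ("chr1", 500, 600)]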
- -These tests validate that GIQL's MERGE operator produces identical -results to bedtools merge command. -""" - -from giql import GIQLEngine - -from .utils.bed_export import load_intervals -from .utils.bedtools_wrapper import merge -from .utils.comparison import compare_results -from .utils.data_models import GenomicInterval - - -def _setup_giql_engine(duckdb_connection): - """Helper to set up GIQL engine with table schema.""" - engine = GIQLEngine(target_dialect="duckdb", verbose=False) - engine.conn = duckdb_connection - - schema = { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - "score": "BIGINT", - "strand": "VARCHAR", - } - - engine.register_table_schema( - "intervals", - schema, - genomic_column="interval", - ) - - return engine - - -def test_merge_adjacent_intervals(duckdb_connection): - """Test MERGE with adjacent intervals. - - Given: - A set of adjacent intervals - When: - MERGE operator is applied - Then: - Adjacent intervals are merged into single intervals - """ - # Arrange - intervals = [ - GenomicInterval("chr1", 100, 200, "i1", 100, "+"), - GenomicInterval("chr1", 200, 300, "i2", 150, "+"), - GenomicInterval("chr1", 300, 400, "i3", 200, "+"), - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals], - ) - - # Act: Execute bedtools operation using pybedtools - bedtools_result = merge( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals] - ) - - # Act: Execute GIQL query - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT MERGE(interval) - FROM intervals - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_merge_overlapping_intervals(duckdb_connection): - """Test MERGE with overlapping intervals. - - Given: - A set of overlapping intervals - When: - MERGE operator is applied - Then: - Overlapping intervals are merged - """ - # Arrange - intervals = [ - GenomicInterval("chr1", 100, 250, "i1", 100, "+"), - GenomicInterval("chr1", 200, 350, "i2", 150, "+"), - GenomicInterval("chr1", 300, 400, "i3", 200, "+"), - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals], - ) - - # Act: Execute bedtools operation using pybedtools - bedtools_result = merge( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals] - ) - - # Act: Execute GIQL query - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT MERGE(interval) - FROM intervals - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_merge_separated_intervals(duckdb_connection): - """Test MERGE with separated intervals. 
- - Given: - Intervals with gaps between them - When: - MERGE operator is applied - Then: - Separated intervals remain separate - """ - # Arrange - intervals = [ - GenomicInterval("chr1", 100, 200, "i1", 100, "+"), - GenomicInterval("chr1", 300, 400, "i2", 150, "+"), - GenomicInterval("chr1", 500, 600, "i3", 200, "+"), - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals], - ) - - # Act: Execute bedtools operation using pybedtools - bedtools_result = merge( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals] - ) - - # Act: Execute GIQL query - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT MERGE(interval) - FROM intervals - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_merge_multiple_chromosomes(duckdb_connection): - """Test MERGE across multiple chromosomes. - - Given: - Intervals on different chromosomes - When: - MERGE operator is applied - Then: - Merging occurs per chromosome - """ - # Arrange - intervals = [ - GenomicInterval("chr1", 100, 200, "i1", 100, "+"), - GenomicInterval("chr1", 180, 300, "i2", 150, "+"), - GenomicInterval("chr2", 100, 200, "i3", 100, "+"), - GenomicInterval("chr2", 180, 300, "i4", 150, "+"), - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals], - ) - - # Act: Execute bedtools operation using pybedtools - bedtools_result = merge( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals] - ) - - # Act: Execute GIQL query - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT MERGE(interval) - FROM intervals - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) diff --git a/tests/integration/bedtools/test_nearest.py b/tests/integration/bedtools/test_nearest.py deleted file mode 100644 index 30505ac..0000000 --- a/tests/integration/bedtools/test_nearest.py +++ /dev/null @@ -1,468 +0,0 @@ -"""Integration tests for GIQL NEAREST operator. - -These tests validate that GIQL's NEAREST operator produces identical -results to bedtools closest command. 
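The nearest tests below register their tables with interval_type="closed", so reported distances carry the bedtools-style +1 on non-overlapping pairs. A sketch of that distance convention under those assumptions (illustrative only, not the DISTANCE machinery):

def gap(a, b):
    """Unsigned gap between (start, end) intervals on one chromosome:
    0 if they overlap or touch, else the number of bases between them."""
    (a_start, a_end), (b_start, b_end) = a, b
    return max(a_start - b_end, b_start - a_end, 0)

def reported_distance(a, b):
    # Tables registered with interval_type="closed" add +1 to non-zero
    # gaps for bedtools-compatible reporting, per the comments in these tests.
    d = gap(a, b)
    return d + 1 if d > 0 else 0

assert reported_distance((100, 200), (150, 250)) == 0   # overlapping
assert reported_distance((100, 200), (250, 300)) == 51  # gap of 50, reported 51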
-""" - -from giql import GIQLEngine - -from .utils.bed_export import load_intervals -from .utils.bedtools_wrapper import closest -from .utils.comparison import compare_results -from .utils.data_models import GenomicInterval - - -def _setup_giql_engine(duckdb_connection): - """Helper to set up GIQL engine with table schemas.""" - engine = GIQLEngine(target_dialect="duckdb", verbose=False) - engine.conn = duckdb_connection - - schema = { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - "score": "BIGINT", - "strand": "VARCHAR", - } - - engine.register_table_schema( - "intervals_a", - schema, - genomic_column="interval", - interval_type="closed", # Match bedtools distance calculation - ) - engine.register_table_schema( - "intervals_b", - schema, - genomic_column="interval", - interval_type="closed", # Match bedtools distance calculation - ) - - return engine - - -def test_nearest_non_overlapping(duckdb_connection): - """Test NEAREST with non-overlapping intervals. - - Given: - Two sets of non-overlapping intervals - When: - NEAREST operator is applied - Then: - Each interval in A finds its closest neighbor in B - """ - # Arrange - intervals_a = [ - GenomicInterval("chr1", 100, 200, "a1", 100, "+"), - GenomicInterval("chr1", 500, 600, "a2", 150, "+"), - ] - intervals_b = [ - GenomicInterval("chr1", 250, 300, "b1", 100, "+"), - GenomicInterval("chr1", 350, 400, "b2", 150, "+"), - GenomicInterval("chr1", 700, 800, "b3", 200, "+"), - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools - bedtools_result = closest( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute GIQL query - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT a.*, b.* - FROM intervals_a a, NEAREST(intervals_b, k=1) b - ORDER BY a.chromosome, a.start_pos - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_nearest_multiple_candidates(duckdb_connection): - """Test NEAREST with equidistant intervals. 
- - Given: - Interval in A with multiple equidistant intervals in B - When: - NEAREST operator is applied - Then: - Bedtools reports one of the equidistant intervals (tie-breaking behavior) - """ - # Arrange: a1 is equidistant from b1 and b2 - intervals_a = [ - GenomicInterval("chr1", 300, 400, "a1", 100, "+"), - ] - intervals_b = [ - GenomicInterval("chr1", 100, 200, "b1", 100, "+"), # Distance: 100 bp - GenomicInterval("chr1", 500, 600, "b2", 150, "+"), # Distance: 100 bp - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools - bedtools_result = closest( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute GIQL query - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT a.*, b.* - FROM intervals_a a, NEAREST(intervals_b, k=1) b - ORDER BY a.chromosome, a.start_pos - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results (allowing tie-breaking differences) - assert len(giql_result) == len(bedtools_result) - # The nearest interval is either b1 or b2 (both equidistant) - assert giql_result[0][3] == "a1" # Interval A name - assert giql_result[0][9] in ("b1", "b2") # Nearest could be either - - -def test_nearest_cross_chromosome(duckdb_connection): - """Test NEAREST across multiple chromosomes. - - Given: - Intervals on different chromosomes - When: - NEAREST operator is applied - Then: - Each interval finds nearest only on same chromosome - """ - # Arrange - intervals_a = [ - GenomicInterval("chr1", 100, 200, "a1", 100, "+"), - GenomicInterval("chr2", 100, 200, "a2", 150, "+"), - ] - intervals_b = [ - GenomicInterval("chr1", 300, 400, "b1", 100, "+"), - GenomicInterval("chr2", 300, 400, "b2", 150, "+"), - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools - bedtools_result = closest( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute GIQL query - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT a.*, b.* - FROM intervals_a a, NEAREST(intervals_b, k=1) b - ORDER BY a.chromosome, a.start_pos - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_nearest_boundary_cases(duckdb_connection): - """Test NEAREST with boundary cases. 
- - Given: - Adjacent intervals (touching but not overlapping) - When: - NEAREST operator is applied - Then: - Adjacent intervals are reported as nearest (distance = 0) - """ - # Arrange: a1 ends where b1 starts (adjacent, distance = 0) - intervals_a = [ - GenomicInterval("chr1", 100, 200, "a1", 100, "+"), - ] - intervals_b = [ - GenomicInterval("chr1", 200, 300, "b1", 150, "+"), # Adjacent to a1 - GenomicInterval("chr1", 500, 600, "b2", 200, "+"), # Far away - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools - bedtools_result = closest( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute GIQL query - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT a.*, b.* - FROM intervals_a a, NEAREST(intervals_b, k=1) b - ORDER BY a.chromosome, a.start_pos - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_nearest_signed_distance(duckdb_connection): - """Test NEAREST with signed=true for directional distance. - - Given: - Intervals in A with an upstream neighbor in B - When: - NEAREST operator is applied with signed=true - Then: - Distance is negative for upstream B intervals (B ends before A starts) - This matches bedtools closest -D ref behavior - """ - # Arrange: a1 has an upstream neighbor (b1) - # a1 at [300-400], b1 at [100-200] (upstream, distance = -(300-200+1) = -101) - intervals_a = [ - GenomicInterval("chr1", 300, 400, "a1", 100, "+"), - ] - intervals_b = [ - GenomicInterval("chr1", 100, 200, "b1", 100, "+"), # Upstream of a1 - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation with signed distance (-D ref) - bedtools_result = closest( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - signed=True, - ) - - # Act: Execute GIQL query with signed=true - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT - a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand, - b.chromosome, b.start_pos, b.end_pos, b.name, b.score, b.strand, - distance - FROM intervals_a a, NEAREST(intervals_b, k=1, signed=true) b - ORDER BY a.chromosome, a.start_pos - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Both should return 1 row - assert len(giql_result) == len(bedtools_result) == 1 - - giql_distance = giql_result[0][12] - bedtools_distance = bedtools_result[0][12] - - # Verify the distance is negative (upstream) 
- assert giql_distance < 0, f"Expected negative distance, got {giql_distance}" - assert bedtools_distance < 0, ( - f"Expected negative bedtools distance, got {bedtools_distance}" - ) - - # Verify distances match - assert giql_distance == bedtools_distance, ( - f"Distance mismatch: GIQL={giql_distance}, bedtools={bedtools_distance}" - ) - - -def test_nearest_signed_distance_upstream_only(duckdb_connection): - """Test NEAREST with signed=true filtering for upstream features only. - - Given: - Intervals in A with neighbors in B, using signed=true - When: - Filtering for negative distance (upstream features) - Then: - Only upstream B intervals are returned (distance < 0) - """ - # Arrange - # a1 at [500-600] - # b1 at [100-200]: upstream, distance = -(500 - 200 + 1) = -301 (closed interval +1) - # b2 at [300-400]: upstream, distance = -(500 - 400 + 1) = -101 (closed interval +1) - # b3 at [700-800]: downstream, distance = +(700 - 600 + 1) = +101 (closed interval +1) - intervals_a = [ - GenomicInterval("chr1", 500, 600, "a1", 100, "+"), - ] - intervals_b = [ - GenomicInterval("chr1", 100, 200, "b1", 100, "+"), # Upstream - GenomicInterval("chr1", 300, 400, "b2", 150, "+"), # Upstream - GenomicInterval("chr1", 700, 800, "b3", 200, "+"), # Downstream - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute GIQL query filtering for upstream only (negative distance) - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT - a.name AS a_name, - b.name AS b_name, - distance - FROM intervals_a a, NEAREST(intervals_b, k=3, signed=true) b - WHERE distance < 0 - ORDER BY distance DESC - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Should only return upstream intervals (b1 and b2) - assert len(giql_result) == 2 - # All distances should be negative - for row in giql_result: - assert row[2] < 0, f"Expected negative distance, got {row[2]}" - # b2 should be first (closer upstream, distance -101 with closed interval +1) - assert giql_result[0][1] == "b2" - assert giql_result[0][2] == -101 - # b1 should be second (farther upstream, distance -301 with closed interval +1) - assert giql_result[1][1] == "b1" - assert giql_result[1][2] == -301 - - -def test_nearest_signed_distance_downstream(duckdb_connection): - """Test NEAREST with signed=true for downstream features. 
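The sign convention asserted above and below can be written out directly from the in-test comments; this sketch reproduces the -301/-101/+101 worked examples for same-chromosome intervals with the closed-interval +1:

def signed_distance(a, b):
    """Signed distance per the comments in these tests: negative when b lies
    upstream of a (b ends before a starts), positive when downstream, 0 on
    overlap. Intervals are (start, end) pairs on one chromosome."""
    (a_start, a_end), (b_start, b_end) = a, b
    if b_end <= a_start:  # b is upstream of a
        return -(a_start - b_end + 1)
    if b_start >= a_end:  # b is downstream of a
        return b_start - a_end + 1
    return 0  # overlapping

a1 = (500, 600)
assert signed_distance(a1, (100, 200)) == -301
assert signed_distance(a1, (300, 400)) == -101
assert signed_distance(a1, (700, 800)) == 101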
- - Given: - Intervals in A with a downstream neighbor in B - When: - NEAREST operator is applied with signed=true - Then: - Distance is positive for downstream B intervals (B starts after A ends) - This matches bedtools closest -D ref behavior - """ - # Arrange: a1 has a downstream neighbor (b1) - # a1 at [100-200], b1 at [300-400] (downstream, distance = 300-200+1 = 101) - intervals_a = [ - GenomicInterval("chr1", 100, 200, "a1", 100, "+"), - ] - intervals_b = [ - GenomicInterval("chr1", 300, 400, "b1", 100, "+"), # Downstream of a1 - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation with signed distance (-D ref) - bedtools_result = closest( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - signed=True, - ) - - # Act: Execute GIQL query with signed=true - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT - a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand, - b.chromosome, b.start_pos, b.end_pos, b.name, b.score, b.strand, - distance - FROM intervals_a a, NEAREST(intervals_b, k=1, signed=true) b - ORDER BY a.chromosome, a.start_pos - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Both should return 1 row - assert len(giql_result) == len(bedtools_result) == 1 - - giql_distance = giql_result[0][12] - bedtools_distance = bedtools_result[0][12] - - # Verify the distance is positive (downstream) - assert giql_distance > 0, f"Expected positive distance, got {giql_distance}" - assert bedtools_distance > 0, "Expected positive bedtools distance" - - # Verify distances match - assert giql_distance == bedtools_distance, ( - f"Distance mismatch: GIQL={giql_distance}, bedtools={bedtools_distance}" - ) diff --git a/tests/integration/bedtools/test_strand_aware.py b/tests/integration/bedtools/test_strand_aware.py deleted file mode 100644 index 11075c6..0000000 --- a/tests/integration/bedtools/test_strand_aware.py +++ /dev/null @@ -1,471 +0,0 @@ -"""Integration tests for GIQL strand-aware operations. - -These tests validate that GIQL correctly handles strand-specific interval -operations, matching bedtools behavior with -s and -S flags. 
-""" - -from giql import GIQLEngine - -from .utils.bed_export import load_intervals -from .utils.bedtools_wrapper import closest -from .utils.bedtools_wrapper import intersect -from .utils.bedtools_wrapper import merge -from .utils.comparison import compare_results -from .utils.data_models import GenomicInterval - - -def _setup_giql_engine(duckdb_connection): - """Helper to set up GIQL engine with table schemas.""" - engine = GIQLEngine(target_dialect="duckdb", verbose=False) - engine.conn = duckdb_connection - - schema = { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - "score": "BIGINT", - "strand": "VARCHAR", - } - - for table_name in ["intervals_a", "intervals_b", "intervals"]: - engine.register_table_schema( - table_name, - schema, - genomic_column="interval", - interval_type="closed", # Match bedtools distance calculation - ) - - return engine - - -def test_intersect_same_strand(duckdb_connection): - """Test INTERSECTS with same-strand requirement. - - Given: - Intervals on both same and opposite strands - When: - INTERSECTS with same-strand requirement is applied - Then: - Only same-strand overlaps are reported - """ - # Arrange - intervals_a = [ - GenomicInterval("chr1", 100, 200, "a1", 100, "+"), - GenomicInterval("chr1", 300, 400, "a2", 150, "-"), - ] - intervals_b = [ - GenomicInterval("chr1", 150, 250, "b1", 100, "+"), # Overlaps a1 (same +) - GenomicInterval("chr1", 350, 450, "b2", 150, "-"), # Overlaps a2 (same -) - GenomicInterval("chr1", 150, 250, "b3", 200, "-"), # Overlaps a1 (opposite) - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools with same-strand requirement - bedtools_result = intersect( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - strand_mode="same", - ) - - # Act: Execute GIQL query with same-strand filter - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT DISTINCT a.* - FROM intervals_a a, intervals_b b - WHERE a.interval INTERSECTS b.interval - AND a.strand = b.strand - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_intersect_opposite_strand(duckdb_connection): - """Test INTERSECTS with opposite-strand requirement. 
- - Given: - Intervals on both same and opposite strands - When: - INTERSECTS with opposite-strand requirement is applied - Then: - Only opposite-strand overlaps are reported - """ - # Arrange - intervals_a = [ - GenomicInterval("chr1", 100, 200, "a1", 100, "+"), - GenomicInterval("chr1", 300, 400, "a2", 150, "-"), - ] - intervals_b = [ - GenomicInterval("chr1", 150, 250, "b1", 100, "-"), # Overlaps a1 (opposite) - GenomicInterval("chr1", 350, 450, "b2", 150, "+"), # Overlaps a2 (opposite) - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools with opposite-strand requirement - bedtools_result = intersect( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - strand_mode="opposite", - ) - - # Act: Execute GIQL query with opposite-strand filter - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT DISTINCT a.* - FROM intervals_a a, intervals_b b - WHERE a.interval INTERSECTS b.interval - AND a.strand != b.strand - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_intersect_ignore_strand(duckdb_connection): - """Test INTERSECTS ignoring strand information. 
- - Given: - Intervals with various strand combinations - When: - INTERSECTS without strand requirements is applied - Then: - All overlaps are reported regardless of strand - """ - # Arrange - intervals_a = [ - GenomicInterval("chr1", 100, 200, "a1", 100, "+"), - ] - intervals_b = [ - GenomicInterval("chr1", 150, 250, "b1", 100, "+"), # Same strand - GenomicInterval("chr1", 150, 250, "b2", 150, "-"), # Opposite strand - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools without strand requirements - bedtools_result = intersect( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute GIQL query without strand filter - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT DISTINCT a.* - FROM intervals_a a, intervals_b b - WHERE a.interval INTERSECTS b.interval - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_intersect_mixed_strands(duckdb_connection): - """Test INTERSECTS with mixed strand scenarios. - - Given: - Complex scenario with +, -, and unstranded intervals - When: - INTERSECTS with same-strand requirement is applied - Then: - Results correctly handle strand matching logic - """ - # Arrange - intervals_a = [ - GenomicInterval("chr1", 100, 200, "a1", 100, "+"), - GenomicInterval("chr1", 300, 400, "a2", 150, "-"), - GenomicInterval("chr1", 500, 600, "a3", 200, "."), # Unstranded - ] - intervals_b = [ - GenomicInterval("chr1", 150, 250, "b1", 100, "+"), - GenomicInterval("chr1", 350, 450, "b2", 150, "-"), - GenomicInterval("chr1", 550, 650, "b3", 200, "."), - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools with same-strand requirement - bedtools_result = intersect( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - strand_mode="same", - ) - - # Act: Execute GIQL query with same-strand filter - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT DISTINCT a.* - FROM intervals_a a, intervals_b b - WHERE a.interval INTERSECTS b.interval - AND a.strand = b.strand - AND a.strand != '.' - AND b.strand != '.' 
- """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_nearest_same_strand(duckdb_connection): - """Test NEAREST with same-strand requirement. - - Given: - Intervals with candidates on same and opposite strands - When: - NEAREST with same-strand requirement is applied - Then: - Only same-strand nearest intervals are reported - """ - # Arrange - intervals_a = [ - GenomicInterval("chr1", 100, 200, "a1", 100, "+"), - ] - intervals_b = [ - GenomicInterval("chr1", 250, 300, "b1", 100, "+"), # Nearest on same strand - GenomicInterval("chr1", 220, 240, "b2", 150, "-"), # Closer but opposite - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools with same-strand requirement - bedtools_result = closest( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - strand_mode="same", - ) - - # Act: Execute GIQL query with same-strand NEAREST - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT a.*, b.* - FROM intervals_a a, NEAREST(intervals_b, k=1, stranded=true) b - ORDER BY a.chromosome, a.start_pos - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_nearest_opposite_strand(duckdb_connection): - """Test NEAREST with opposite-strand requirement. 
- - Given: - Intervals with candidates on same and opposite strands - When: - NEAREST with opposite-strand requirement is applied - Then: - Only opposite-strand nearest intervals are reported - """ - # Arrange - intervals_a = [ - GenomicInterval("chr1", 100, 200, "a1", 100, "+"), - ] - intervals_b = [ - GenomicInterval("chr1", 250, 300, "b1", 100, "-"), # Nearest opposite strand - GenomicInterval("chr1", 220, 240, "b2", 150, "+"), # Closer but same strand - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools with opposite-strand requirement - bedtools_result = closest( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - strand_mode="opposite", - ) - - # Note: GIQL may not have direct opposite-strand support - # This test documents the expected behavior - assert len(bedtools_result) == 1 - assert bedtools_result[0][3] == "a1" - assert bedtools_result[0][9] == "b1" - - -def test_nearest_ignore_strand(duckdb_connection): - """Test NEAREST ignoring strand information. - - Given: - Intervals on different strands - When: - NEAREST without strand requirements is applied - Then: - Closest interval is found regardless of strand - """ - # Arrange - intervals_a = [ - GenomicInterval("chr1", 100, 200, "a1", 100, "+"), - ] - intervals_b = [ - GenomicInterval("chr1", 250, 300, "b1", 100, "+"), - GenomicInterval("chr1", 220, 240, "b2", 150, "-"), # Closer - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals_a", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - ) - load_intervals( - duckdb_connection, - "intervals_b", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute bedtools operation using pybedtools without strand requirements - bedtools_result = closest( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_a], - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals_b], - ) - - # Act: Execute GIQL query without strand filter - engine = _setup_giql_engine(duckdb_connection) - giql_query = """ - SELECT a.*, b.* - FROM intervals_a a, NEAREST(intervals_b, k=1) b - ORDER BY a.chromosome, a.start_pos - """ - sql = engine.transpile(giql_query) - giql_result = duckdb_connection.execute(sql).fetchall() - - # Assert: Compare GIQL and bedtools results - comparison = compare_results(giql_result, bedtools_result) - assert comparison.match, ( - f"GIQL results don't match bedtools:\n" - f"Differences: {comparison.differences}\n" - f"GIQL rows: {len(giql_result)}, bedtools rows: {len(bedtools_result)}" - ) - - -def test_merge_strand_specific(duckdb_connection): - """Test MERGE with strand-specific behavior. 
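-    Mirrors "bedtools merge -s"; the GIQL counterpart is
-    MERGE(interval, stranded=true), so only the bedtools side is asserted here.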
- - Given: - Overlapping intervals on different strands - When: - MERGE with strand-specific flag is applied - Then: - Intervals are merged per-strand (same-strand intervals merge together) - """ - # Arrange - overlapping intervals on both strands - intervals = [ - GenomicInterval("chr1", 100, 200, "i1", 100, "+"), - GenomicInterval("chr1", 150, 250, "i2", 150, "+"), # Overlaps i1 (same +) - GenomicInterval("chr1", 120, 180, "i3", 200, "-"), # Overlaps i1 (opposite) - GenomicInterval("chr1", 160, 240, "i4", 100, "-"), # Overlaps i2 (opposite) - ] - - # Load into DuckDB - load_intervals( - duckdb_connection, - "intervals", - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals], - ) - - # Act: Execute bedtools operation using pybedtools with strand-specific merging - bedtools_result = merge( - [(i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in intervals], - strand_mode="same", - ) - - # Note: GIQL MERGE with strand grouping would require GROUP BY strand - # This test documents the expected behavior - assert len(bedtools_result) >= 2 # At least one per strand diff --git a/tests/integration/bedtools/utils/__init__.py b/tests/integration/bedtools/utils/__init__.py deleted file mode 100644 index 99a414e..0000000 --- a/tests/integration/bedtools/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Utilities for bedtools integration testing.""" diff --git a/tests/integration/bedtools/utils/bed_export.py b/tests/integration/bedtools/utils/bed_export.py deleted file mode 100644 index cd5a5c8..0000000 --- a/tests/integration/bedtools/utils/bed_export.py +++ /dev/null @@ -1,40 +0,0 @@ -"""DuckDB loading utilities for genomic intervals. - -This module provides functions for loading genomic intervals into DuckDB tables. -""" - -from typing import List -from typing import Tuple - - -def load_intervals( - conn, - table_name: str, - intervals: List[Tuple[str, int, int, str | None, int | None, str | None]], -): - """Load intervals into DuckDB table. - - Args: - conn: DuckDB connection - table_name: Name of table to create - intervals: List of (chrom, start, end, name, score, strand) tuples - where name, score, and strand can be None - - Note: - Creates a new table with GIQL's default column names for genomic data: - chromosome, start_pos, end_pos, name, score, strand - """ - # Create table with GIQL's default column names - conn.execute(f""" - CREATE TABLE {table_name} ( - chromosome VARCHAR, - start_pos INTEGER, - end_pos INTEGER, - name VARCHAR, - score INTEGER, - strand VARCHAR - ) - """) - - # Insert intervals - conn.executemany(f"INSERT INTO {table_name} VALUES (?,?,?,?,?,?)", intervals) diff --git a/tests/integration/bedtools/utils/bedtools_wrapper.py b/tests/integration/bedtools/utils/bedtools_wrapper.py deleted file mode 100644 index 21d201f..0000000 --- a/tests/integration/bedtools/utils/bedtools_wrapper.py +++ /dev/null @@ -1,303 +0,0 @@ -"""Pybedtools wrapper for genomic interval operations. 
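-
-These wrappers produce the reference outputs that GIQL query results are
-compared against by the integration suite.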
- -This module provides functions for: -- Creating BedTool objects from interval data -- Executing bedtools operations via pybedtools -- Converting results to comparable formats -""" - -from typing import List -from typing import Tuple - -import pybedtools - -# Strand flag constants for bedtools commands -STRAND_SAME = True # Require same strand (pybedtools uses True for -s) -# Require opposite strands (pybedtools uses "opposite" for -S) -STRAND_OPPOSITE = "opposite" - - -class BedtoolsError(Exception): - """Raised when bedtools operation fails.""" - - pass - - -def create_bedtool(intervals: List[Tuple]) -> pybedtools.BedTool: - """Create BedTool object from interval tuples. - - Args: - intervals: List of tuples, each containing: - - (chrom, start, end) for BED3 format - - (chrom, start, end, name, score, strand) for BED6 format - - Returns: - pybedtools.BedTool object - - Example: - >>> intervals = [("chr1", 100, 200, "a1", 100, "+")] - >>> bt = create_bedtool(intervals) - """ - # Convert tuples to BED format strings - bed_strings = [] - for interval in intervals: - if len(interval) == 3: - # BED3 format - bed_strings.append(f"{interval[0]}\t{interval[1]}\t{interval[2]}") - elif len(interval) >= 6: - # BED6 format - chrom, start, end, name, score, strand = interval[:6] - # Handle None values - name = name if name is not None else "." - score = score if score is not None else 0 - strand = strand if strand is not None else "." - bed_strings.append(f"{chrom}\t{start}\t{end}\t{name}\t{score}\t{strand}") - else: - raise ValueError(f"Invalid interval format: {interval}") - - bed_string = "\n".join(bed_strings) - return pybedtools.BedTool(bed_string, from_string=True) - - -def intersect( - intervals_a: List[Tuple], - intervals_b: List[Tuple], - strand_mode: str | None = None, -) -> List[Tuple]: - """Find overlapping intervals using bedtools intersect. - - Args: - intervals_a: First set of intervals - intervals_b: Second set of intervals - strand_mode: Strand requirement ('same', 'opposite', or None for ignore) - - Returns: - List of tuples matching intervals_a format - - Example: - >>> a = [("chr1", 100, 200, "a1", 100, "+")] - >>> b = [("chr1", 150, 250, "b1", 100, "+")] - >>> result = intersect(a, b) - """ - try: - bt_a = create_bedtool(intervals_a) - bt_b = create_bedtool(intervals_b) - - # Build kwargs for intersect - # Use -u (unique) to return each interval from A only once - # This matches GIQL's DISTINCT behavior - kwargs = {"u": True} - if strand_mode == "same": - kwargs["s"] = True - elif strand_mode == "opposite": - kwargs["S"] = True - - # Perform intersection - result = bt_a.intersect(bt_b, **kwargs) - - # Convert to tuples - return bedtool_to_tuples(result) - - except Exception as e: - raise BedtoolsError(f"Intersect operation failed: {e}") - - -def merge(intervals: List[Tuple], strand_mode: str | None = None) -> List[Tuple]: - """Merge overlapping intervals using bedtools merge. - - Args: - intervals: List of intervals to merge - strand_mode: Strand requirement ('same' to merge per-strand, None for ignore) - - Returns: - List of tuples in BED3 format (chrom, start, end) - - Example: - >>> intervals = [ - ... ("chr1", 100, 200, "a1", 100, "+"), - ... ("chr1", 180, 300, "a2", 100, "+"), - ... 
] - >>> result = merge(intervals) - >>> # Returns: [("chr1", 100, 300)] - """ - try: - bt = create_bedtool(intervals) - - # Sort before merging (required by bedtools merge) - bt_sorted = bt.sort() - - # Build kwargs for merge - kwargs = {} - if strand_mode == "same": - kwargs["s"] = True - - # Perform merge - result = bt_sorted.merge(**kwargs) - - # Convert to tuples (merge returns BED3 format) - return bedtool_to_tuples(result, format="bed3") - - except Exception as e: - raise BedtoolsError(f"Merge operation failed: {e}") - - -def closest( - intervals_a: List[Tuple], - intervals_b: List[Tuple], - strand_mode: str | None = None, - k: int = 1, - signed: bool = False, -) -> List[Tuple]: - """Find closest intervals using bedtools closest. - - Args: - intervals_a: Query intervals - intervals_b: Database intervals to search - strand_mode: Strand requirement ('same', 'opposite', or None for ignore) - k: Number of closest intervals to report (default: 1) - signed: If True, return signed distances (negative for upstream B, - positive for downstream B). Uses bedtools -D ref mode. - - Returns: - List of tuples with format: (a_fields..., b_fields..., distance) - - Example: - >>> a = [("chr1", 100, 200, "a1", 100, "+")] - >>> b = [("chr1", 300, 400, "b1", 100, "+")] - >>> result = closest(a, b) - >>> # Returns intervals from a and b with distance - """ - try: - bt_a = create_bedtool(intervals_a) - bt_b = create_bedtool(intervals_b) - - # Sort inputs (required for -t flag) - bt_a = bt_a.sort() - bt_b = bt_b.sort() - - # Build kwargs for closest - # -d reports unsigned distance, -D ref reports signed distance - if signed: - # Use -D ref for signed distance relative to reference (A) - # Negative = B is upstream of A, Positive = B is downstream of A - kwargs = {"D": "ref", "t": "first"} - else: - kwargs = {"d": True, "t": "first"} - - if k > 1: - kwargs["k"] = k - if strand_mode == "same": - kwargs["s"] = True - elif strand_mode == "opposite": - kwargs["S"] = True - - # Perform closest - result = bt_a.closest(bt_b, **kwargs) - - # Convert to tuples (closest returns concatenated fields + distance) - return bedtool_to_tuples(result, format="closest") - - except Exception as e: - raise BedtoolsError(f"Closest operation failed: {e}") - - -def bedtool_to_tuples(bedtool: pybedtools.BedTool, format: str = "bed6") -> List[Tuple]: - """Convert BedTool object to list of tuples. - - Args: - bedtool: pybedtools.BedTool object - format: Expected format ('bed3', 'bed6', or 'closest') - - Returns: - List of tuples matching the format - - Note: - - bed3: (chrom, start, end) - - bed6: (chrom, start, end, name, score, strand) - - closest: (chrom_a, start_a, end_a, name_a, score_a, strand_a, - chrom_b, start_b, end_b, name_b, score_b, strand_b, distance) - """ - rows = [] - - for interval in bedtool: - fields = interval.fields - - if format == "bed3": - chrom = fields[0] - start = int(fields[1]) - end = int(fields[2]) - rows.append((chrom, start, end)) - - elif format == "bed6": - if len(fields) < 6: - # Pad with defaults if needed - while len(fields) < 6: - if len(fields) == 3: - fields.append(".") # name - elif len(fields) == 4: - fields.append("0") # score - elif len(fields) == 5: - fields.append(".") # strand - - chrom = fields[0] - start = int(fields[1]) - end = int(fields[2]) - name = fields[3] if fields[3] != "." else None - score = int(fields[4]) if fields[4] != "." else None - strand = fields[5] if fields[5] != "." 
else None - - rows.append((chrom, start, end, name, score, strand)) - - elif format == "closest": - # Closest returns: a_fields + b_fields + distance - # For BED6: 6 fields for a, 6 fields for b, 1 distance = 13 total - if len(fields) >= 13: - # Parse all fields as-is, converting appropriate ones to int - row = [] - for i, field in enumerate(fields): - # Positions (1, 2, 7, 8) and distance (12) should be int - if i in (1, 2, 7, 8, 12): - row.append(int(field)) - # Scores (4, 10) should be int if not "." - elif i in (4, 10): - row.append(int(field) if field != "." else None) - # Names (3, 9) and strands (5, 11) should be None if "." - elif i in (3, 5, 9, 11): - row.append(field if field != "." else None) - else: - row.append(field) - rows.append(tuple(row)) - else: - raise ValueError( - f"Unexpected number of fields for closest: {len(fields)}" - ) - - else: - raise ValueError(f"Unsupported format: {format}") - - return rows - - -def add_strand_flag(kwargs: dict, strand_mode: str | None) -> dict: - """Add strand flag to bedtools kwargs. - - Args: - kwargs: Base kwargs dictionary - strand_mode: Strand requirement ('same', 'opposite', or None for ignore) - - Returns: - Updated kwargs dictionary with strand flag - - Example: - >>> kwargs = add_strand_flag({}, "same") - >>> # Returns: {"s": True} - """ - updated_kwargs = kwargs.copy() - - if strand_mode == "same": - updated_kwargs["s"] = True - elif strand_mode == "opposite": - updated_kwargs["S"] = True - # None or other values = ignore strand (no flag added) - - return updated_kwargs diff --git a/tests/integration/bedtools/utils/comparison.py b/tests/integration/bedtools/utils/comparison.py deleted file mode 100644 index caa4bd2..0000000 --- a/tests/integration/bedtools/utils/comparison.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Result comparison logic for GIQL vs bedtools outputs. - -This module provides functions for: -- Comparing GIQL and bedtools results with appropriate tolerance -- Order-independent row sorting -- Epsilon-based float comparison -""" - -from typing import Any -from typing import List -from typing import Tuple - -from .data_models import ComparisonResult - - -def _sort_key(row: Tuple) -> Tuple: - """Generate sort key for order-independent comparison. - - Args: - row: Result row tuple - - Returns: - Sortable tuple (handles None values) - """ - # Convert None to empty string for sorting - return tuple("" if v is None else v for v in row) - - -def _values_match(val1: Any, val2: Any, epsilon: float = 1e-9) -> bool: - """Compare two values with appropriate tolerance. - - Args: - val1: First value - val2: Second value - epsilon: Tolerance for floating-point comparisons - - Returns: - True if values match within tolerance - """ - # Handle None values - if val1 is None and val2 is None: - return True - if val1 is None or val2 is None: - return False - - # Float comparison with epsilon - if isinstance(val1, float) or isinstance(val2, float): - try: - return abs(float(val1) - float(val2)) <= epsilon - except (ValueError, TypeError): - return False - - # Exact match for other types - return val1 == val2 - - -def compare_results( - giql_rows: List[Tuple], bedtools_rows: List[Tuple], epsilon: float = 1e-9 -) -> ComparisonResult: - """Compare GIQL and bedtools results with appropriate tolerance. 
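-
-    The returned ComparisonResult is truthy when the outputs agree, so it can
-    be asserted directly.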
- - Comparison rules: - - Integer positions/counts: exact match required - - Floating-point values: epsilon tolerance - - Row ordering: order-independent (sorts both result sets) - - Args: - giql_rows: Rows from GIQL query execution - bedtools_rows: Rows from bedtools output - epsilon: Tolerance for floating-point comparisons - - Returns: - ComparisonResult with match status and differences - """ - giql_count = len(giql_rows) - bedtools_count = len(bedtools_rows) - - # Sort both result sets for order-independent comparison - giql_sorted = sorted(giql_rows, key=_sort_key) - bedtools_sorted = sorted(bedtools_rows, key=_sort_key) - - differences = [] - - # Check row counts - if giql_count != bedtools_count: - differences.append( - f"Row count mismatch: GIQL has {giql_count} rows, " - f"bedtools has {bedtools_count} rows" - ) - - # Compare rows - max_rows = max(giql_count, bedtools_count) - for i in range(max_rows): - # Check if row exists in both - if i >= giql_count: - differences.append( - f"Row {i}: Missing in GIQL, present in bedtools: {bedtools_sorted[i]}" - ) - continue - if i >= bedtools_count: - differences.append( - f"Row {i}: Present in GIQL, missing in bedtools: {giql_sorted[i]}" - ) - continue - - giql_row = giql_sorted[i] - bedtools_row = bedtools_sorted[i] - - # Check column counts - if len(giql_row) != len(bedtools_row): - differences.append( - f"Row {i}: Column count mismatch " - f"(GIQL: {len(giql_row)} cols, bedtools: {len(bedtools_row)} cols)" - ) - continue - - # Compare each column - for col_idx, (giql_val, bedtools_val) in enumerate(zip(giql_row, bedtools_row)): - if not _values_match(giql_val, bedtools_val, epsilon): - differences.append( - f"Row {i}, col {col_idx}: " - f"GIQL={giql_val!r} != bedtools={bedtools_val!r}" - ) - - # Determine match status - match = len(differences) == 0 - - return ComparisonResult( - match=match, - giql_row_count=giql_count, - bedtools_row_count=bedtools_count, - differences=differences, - comparison_metadata={"epsilon": epsilon, "sorted": True}, - ) diff --git a/tests/integration/bedtools/utils/data_models.py b/tests/integration/bedtools/utils/data_models.py deleted file mode 100644 index dad0832..0000000 --- a/tests/integration/bedtools/utils/data_models.py +++ /dev/null @@ -1,259 +0,0 @@ -"""Data models for bedtools integration testing. - -This module defines the core data structures used throughout the test suite: -- GenomicInterval: Represents a single genomic interval -- SimulatedDataset: Collection of intervals for testing -- ComparisonResult: Result of comparing GIQL vs bedtools outputs -- IntervalGeneratorConfig: Configuration for dataset generation -- BedtoolsVersion: Bedtools version information -""" - -import re -from dataclasses import dataclass -from dataclasses import field -from pathlib import Path -from typing import Dict -from typing import List - - -@dataclass -class GenomicInterval: - """Represents a single genomic interval with all BED file fields. 
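-
-    Coordinates follow the BED convention: 0-based start, exclusive end.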
- - Attributes: - chrom: Chromosome name (e.g., "chr1", "chr2", "chrX") - start: Start position (0-based, inclusive) - end: End position (0-based, exclusive) - name: Optional interval name/identifier - score: Optional score value (0-1000) - strand: Optional strand ("+", "-", or ".") - """ - - chrom: str - start: int - end: int - name: str | None = None - score: int | None = None - strand: str | None = None - - def __post_init__(self): - """Validate interval fields.""" - if self.start >= self.end: - raise ValueError( - f"Invalid interval: start ({self.start}) >= end ({self.end})" - ) - if self.start < 0: - raise ValueError(f"Invalid interval: start ({self.start}) < 0") - if self.strand and self.strand not in ["+", "-", "."]: - raise ValueError(f"Invalid strand: {self.strand}") - if self.score is not None and not (0 <= self.score <= 1000): - raise ValueError(f"Invalid score: {self.score}") - - def to_bed_line(self, format="bed6") -> str: - """Convert to BED format line. - - Args: - format: Output format ('bed3' or 'bed6') - - Returns: - Tab-separated BED format string - """ - if format == "bed3": - return f"{self.chrom}\t{self.start}\t{self.end}" - elif format == "bed6": - name = self.name or "." - score = self.score if self.score is not None else 0 - strand = self.strand or "." - return f"{self.chrom}\t{self.start}\t{self.end}\t{name}\t{score}\t{strand}" - else: - raise ValueError(f"Unsupported BED format: {format}") - - -@dataclass -class SimulatedDataset: - """Collection of genomic intervals with controlled properties for testing. - - Attributes: - name: Dataset identifier (e.g., "intervals_a", "intervals_b") - intervals: List of genomic intervals - scenario_type: Scenario descriptor (e.g., "overlapping", "adjacent") - metadata: Generation parameters (seed, chromosome_count, etc.) - """ - - name: str - intervals: List[GenomicInterval] - scenario_type: str - metadata: dict = field(default_factory=dict) - - def __post_init__(self): - """Validate dataset has at least one interval.""" - if len(self.intervals) == 0: - raise ValueError("Dataset must contain at least one interval") - - def to_bed_file(self, path: Path, format="bed6"): - """Export to BED file. - - Args: - path: Output file path - format: BED format ('bed3' or 'bed6') - """ - with open(path, "w") as f: - for interval in self.intervals: - f.write(interval.to_bed_line(format) + "\n") - - def to_duckdb_table(self, conn, table_name: str): - """Load into DuckDB table. - - Args: - conn: DuckDB connection - table_name: Name of table to create - """ - rows = [ - (i.chrom, i.start, i.end, i.name, i.score, i.strand) for i in self.intervals - ] - conn.execute(f""" - CREATE TABLE {table_name} ( - chrom VARCHAR, - start INTEGER, - end INTEGER, - name VARCHAR, - score INTEGER, - strand VARCHAR - ) - """) - conn.executemany(f"INSERT INTO {table_name} VALUES (?,?,?,?,?,?)", rows) - - -@dataclass -class ComparisonResult: - """Result of comparing GIQL and bedtools outputs. - - Attributes: - match: Whether results match - giql_row_count: Number of rows from GIQL query - bedtools_row_count: Number of rows from bedtools output - differences: Specific differences found (if match=False) - comparison_metadata: Epsilon used, sort order, etc. 
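-
-    Example:
-        >>> result = ComparisonResult(match=False, giql_row_count=1,
-        ...                           bedtools_row_count=2)
-        >>> bool(result)
-        False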
- """ - - match: bool - giql_row_count: int - bedtools_row_count: int - differences: List[str] = field(default_factory=list) - comparison_metadata: dict = field(default_factory=dict) - - def __bool__(self) -> bool: - """Allow direct boolean evaluation in assertions.""" - return self.match - - def failure_message(self) -> str: - """Generate detailed failure message for test output. - - Returns: - Formatted failure message with differences - """ - if self.match: - return "✓ Results match" - - msg = [ - f"✗ Results do not match", - f" GIQL rows: {self.giql_row_count}", - f" Bedtools rows: {self.bedtools_row_count}", - ] - - if self.differences: - msg.append(" Differences:") - for diff in self.differences[:10]: # Limit to first 10 - msg.append(f" - {diff}") - if len(self.differences) > 10: - msg.append(f" ... and {len(self.differences) - 10} more") - - return "\n".join(msg) - - -@dataclass -class IntervalGeneratorConfig: - """Configuration for simulated dataset generation. - - Attributes: - chromosome_count: Number of chromosomes to generate - intervals_per_chromosome: Intervals per chromosome - min_interval_size: Minimum interval length - max_interval_size: Maximum interval length - overlap_probability: Probability of overlap (0.0-1.0) - strand_distribution: Proportions of +/-/. strands - seed: Random seed for reproducibility - """ - - chromosome_count: int = 3 - intervals_per_chromosome: int = 100 - min_interval_size: int = 100 - max_interval_size: int = 1000 - overlap_probability: float = 0.3 - strand_distribution: dict = field( - default_factory=lambda: {"+": 0.45, "-": 0.45, ".": 0.1} - ) - seed: int = 42 - - def __post_init__(self): - """Validate configuration parameters.""" - if self.chromosome_count <= 0: - raise ValueError("chromosome_count must be > 0") - if self.intervals_per_chromosome <= 0: - raise ValueError("intervals_per_chromosome must be > 0") - if self.min_interval_size < 1: - raise ValueError("min_interval_size must be >= 1") - if self.max_interval_size < self.min_interval_size: - raise ValueError("max_interval_size must be >= min_interval_size") - if not (0.0 <= self.overlap_probability <= 1.0): - raise ValueError("overlap_probability must be in [0.0, 1.0]") - if abs(sum(self.strand_distribution.values()) - 1.0) > 1e-6: - raise ValueError("strand_distribution must sum to 1.0") - - -@dataclass -class BedtoolsVersion: - """Represents bedtools version information. - - Attributes: - major: Major version number - minor: Minor version number - patch: Patch version number - raw_version_string: Original version string from bedtools - """ - - major: int - minor: int - patch: int - raw_version_string: str - - def is_compatible(self) -> bool: - """Check if version meets minimum requirement (2.30.0). - - Returns: - True if version >= 2.30.0 - """ - return (self.major, self.minor, self.patch) >= (2, 30, 0) - - def __str__(self) -> str: - """Return version as string.""" - return f"{self.major}.{self.minor}.{self.patch}" - - @classmethod - def from_string(cls, version_str: str) -> "BedtoolsVersion": - """Parse version from bedtools --version output. 
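-
-        The leading "v" is optional, so "2.30.0" parses the same as
-        "bedtools v2.30.0".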
- - Args: - version_str: Version string from bedtools (e.g., "bedtools v2.30.0") - - Returns: - BedtoolsVersion instance - - Raises: - ValueError: If version string cannot be parsed - """ - match = re.search(r"v?(\d+)\.(\d+)\.(\d+)", version_str) - if not match: - raise ValueError(f"Could not parse version from: {version_str}") - major, minor, patch = map(int, match.groups()) - return cls(major, minor, patch, version_str) diff --git a/tests/integration/bedtools/utils/interval_generator.py b/tests/integration/bedtools/utils/interval_generator.py deleted file mode 100644 index 05df214..0000000 --- a/tests/integration/bedtools/utils/interval_generator.py +++ /dev/null @@ -1,425 +0,0 @@ -"""Interval generator for creating simulated genomic datasets. - -This module provides the IntervalGenerator class for creating test datasets -with controlled properties (overlap density, strand distribution, etc.). -""" - -import random -from typing import List -from typing import Tuple - -from .data_models import GenomicInterval -from .data_models import IntervalGeneratorConfig -from .data_models import SimulatedDataset - - -class IntervalGenerator: - """Generate simulated genomic intervals for testing. - - Provides methods for generating intervals with various patterns: - - Overlapping intervals - - Adjacent intervals - - Separated intervals - - Multi-chromosome datasets - - Strand-specific datasets - """ - - def __init__(self, config: IntervalGeneratorConfig | None = None): - """Initialize interval generator. - - Args: - config: Generator configuration (uses defaults if None) - """ - self.config = config or IntervalGeneratorConfig() - self.rng = random.Random(self.config.seed) - - def _choose_strand(self) -> str: - """Choose strand based on configured distribution. - - Returns: - Strand ('+', '-', or '.') - """ - r = self.rng.random() - cumulative = 0.0 - for strand, prob in self.config.strand_distribution.items(): - cumulative += prob - if r <= cumulative: - return strand - return "." # Fallback - - def _generate_interval_size(self) -> int: - """Generate random interval size within configured range. - - Returns: - Interval size in base pairs - """ - return self.rng.randint( - self.config.min_interval_size, self.config.max_interval_size - ) - - def generate_basic( - self, chromosome: str, count: int, max_position: int = 1000000 - ) -> List[GenomicInterval]: - """Generate basic random intervals. - - Args: - chromosome: Chromosome name - count: Number of intervals to generate - max_position: Maximum chromosome position - - Returns: - List of genomic intervals - """ - intervals = [] - for i in range(count): - size = self._generate_interval_size() - start = self.rng.randint(0, max_position - size) - end = start + size - strand = self._choose_strand() - - intervals.append( - GenomicInterval( - chrom=chromosome, - start=start, - end=end, - name=f"interval_{i}", - score=self.rng.randint(0, 1000), - strand=strand, - ) - ) - - return intervals - - def generate_dataset( - self, - name: str, - scenario_type: str = "basic", - chromosome_count: int | None = None, - intervals_per_chrom: int | None = None, - ) -> SimulatedDataset: - """Generate a complete simulated dataset. - - Args: - name: Dataset identifier - scenario_type: Type of scenario ("basic", "overlapping", etc.) 
- chromosome_count: Number of chromosomes (uses config default if None) - intervals_per_chrom: Intervals per chromosome (uses config default if None) - - Returns: - SimulatedDataset with generated intervals - """ - chrom_count = chromosome_count or self.config.chromosome_count - interval_count = intervals_per_chrom or self.config.intervals_per_chromosome - - all_intervals = [] - for i in range(chrom_count): - chrom_name = f"chr{i + 1}" - intervals = self.generate_basic(chrom_name, interval_count) - all_intervals.extend(intervals) - - return SimulatedDataset( - name=name, - intervals=all_intervals, - scenario_type=scenario_type, - metadata={ - "chromosome_count": chrom_count, - "intervals_per_chromosome": interval_count, - "seed": self.config.seed, - "total_intervals": len(all_intervals), - }, - ) - - def generate_overlapping_scenarios( - self, chromosome: str, count: int, overlap_size: int = 50 - ) -> List[GenomicInterval]: - """Generate overlapping intervals with controlled overlap. - - Args: - chromosome: Chromosome name - count: Number of intervals to generate - overlap_size: Size of overlap between adjacent intervals - - Returns: - List of overlapping genomic intervals - """ - intervals = [] - base_size = self.config.min_interval_size - current_start = 100 - - for i in range(count): - start = current_start - end = start + base_size - strand = self._choose_strand() - - intervals.append( - GenomicInterval( - chrom=chromosome, - start=start, - end=end, - name=f"overlap_{i}", - score=self.rng.randint(0, 1000), - strand=strand, - ) - ) - - # Next interval starts before current ends (creating overlap) - current_start = end - overlap_size - - return intervals - - def generate_adjacent_scenarios( - self, chromosome: str, count: int - ) -> List[GenomicInterval]: - """Generate adjacent intervals (touching but not overlapping). - - Args: - chromosome: Chromosome name - count: Number of intervals to generate - - Returns: - List of adjacent genomic intervals - """ - intervals = [] - base_size = self.config.min_interval_size - current_start = 100 - - for i in range(count): - start = current_start - end = start + base_size - strand = self._choose_strand() - - intervals.append( - GenomicInterval( - chrom=chromosome, - start=start, - end=end, - name=f"adjacent_{i}", - score=self.rng.randint(0, 1000), - strand=strand, - ) - ) - - # Next interval starts exactly where current ends - current_start = end - - return intervals - - def generate_separated_scenarios( - self, chromosome: str, count: int, gap_size: int = 100 - ) -> List[GenomicInterval]: - """Generate separated intervals with gaps between them. - - Args: - chromosome: Chromosome name - count: Number of intervals to generate - gap_size: Size of gap between intervals - - Returns: - List of separated genomic intervals - """ - intervals = [] - base_size = self.config.min_interval_size - current_start = 100 - - for i in range(count): - start = current_start - end = start + base_size - strand = self._choose_strand() - - intervals.append( - GenomicInterval( - chrom=chromosome, - start=start, - end=end, - name=f"separated_{i}", - score=self.rng.randint(0, 1000), - strand=strand, - ) - ) - - # Next interval starts after a gap - current_start = end + gap_size - - return intervals - - def generate_multi_chromosome_scenarios( - self, - chromosome_count: int, - intervals_per_chrom: int, - scenario_func: str = "basic", - ) -> List[GenomicInterval]: - """Generate intervals across multiple chromosomes. 
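-
-        Chromosomes are named chr1..chrN, and each is populated using the
-        generator selected by scenario_func.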
- - Args: - chromosome_count: Number of chromosomes - intervals_per_chrom: Number of intervals per chromosome - scenario_func: Scenario type ("basic", "overlapping", "adjacent", - "separated") - - Returns: - List of genomic intervals across multiple chromosomes - """ - all_intervals = [] - - for i in range(chromosome_count): - chrom_name = f"chr{i + 1}" - - if scenario_func == "overlapping": - intervals = self.generate_overlapping_scenarios( - chrom_name, intervals_per_chrom - ) - elif scenario_func == "adjacent": - intervals = self.generate_adjacent_scenarios( - chrom_name, intervals_per_chrom - ) - elif scenario_func == "separated": - intervals = self.generate_separated_scenarios( - chrom_name, intervals_per_chrom - ) - else: # basic - intervals = self.generate_basic(chrom_name, intervals_per_chrom) - - all_intervals.extend(intervals) - - return all_intervals - - def generate_same_strand_pairs( - self, chromosome: str, pair_count: int, strand: str = "+" - ) -> Tuple[List[GenomicInterval], List[GenomicInterval]]: - """Generate two sets of intervals on the same strand. - - Args: - chromosome: Chromosome name - pair_count: Number of interval pairs to generate - strand: Strand to use for all intervals ('+' or '-') - - Returns: - Tuple of (intervals_a, intervals_b) on same strand - """ - intervals_a = [] - intervals_b = [] - base_size = self.config.min_interval_size - current_start = 100 - - for i in range(pair_count): - # Interval A - start_a = current_start - end_a = start_a + base_size - intervals_a.append( - GenomicInterval( - chrom=chromosome, - start=start_a, - end=end_a, - name=f"a{i}", - score=self.rng.randint(0, 1000), - strand=strand, - ) - ) - - # Interval B - overlaps A, same strand - start_b = start_a + (base_size // 2) - end_b = start_b + base_size - intervals_b.append( - GenomicInterval( - chrom=chromosome, - start=start_b, - end=end_b, - name=f"b{i}", - score=self.rng.randint(0, 1000), - strand=strand, - ) - ) - - # Move to next region - current_start = end_b + 100 - - return intervals_a, intervals_b - - def generate_opposite_strand_pairs( - self, chromosome: str, pair_count: int - ) -> Tuple[List[GenomicInterval], List[GenomicInterval]]: - """Generate two sets of intervals on opposite strands. - - Args: - chromosome: Chromosome name - pair_count: Number of interval pairs to generate - - Returns: - Tuple of (intervals_a, intervals_b) on opposite strands - """ - intervals_a = [] - intervals_b = [] - base_size = self.config.min_interval_size - current_start = 100 - - for i in range(pair_count): - # Interval A on + strand - start_a = current_start - end_a = start_a + base_size - intervals_a.append( - GenomicInterval( - chrom=chromosome, - start=start_a, - end=end_a, - name=f"a{i}", - score=self.rng.randint(0, 1000), - strand="+", - ) - ) - - # Interval B - overlaps A, opposite strand (-) - start_b = start_a + (base_size // 2) - end_b = start_b + base_size - intervals_b.append( - GenomicInterval( - chrom=chromosome, - start=start_b, - end=end_b, - name=f"b{i}", - score=self.rng.randint(0, 1000), - strand="-", - ) - ) - - # Move to next region - current_start = end_b + 100 - - return intervals_a, intervals_b - - def generate_mixed_strand_intervals( - self, chromosome: str, count: int - ) -> List[GenomicInterval]: - """Generate intervals with mixed strand assignments. - - Args: - chromosome: Chromosome name - count: Number of intervals to generate - - Returns: - List of intervals with randomly assigned strands (+, -, .) 
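-
-        Note:
-            Unlike generate_basic, strands here are drawn uniformly from
-            +, -, and '.', bypassing the configured strand_distribution.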
- """ - intervals = [] - base_size = self.config.min_interval_size - strands = ["+", "-", "."] - current_start = 100 - - for i in range(count): - start = current_start - end = start + base_size - # Randomly choose strand from +, -, . - strand = self.rng.choice(strands) - - intervals.append( - GenomicInterval( - chrom=chromosome, - start=start, - end=end, - name=f"mixed_{i}", - score=self.rng.randint(0, 1000), - strand=strand, - ) - ) - - current_start = end + 50 # Small gap - - return intervals diff --git a/tests/test_cluster.py b/tests/test_cluster.py deleted file mode 100644 index c359608..0000000 --- a/tests/test_cluster.py +++ /dev/null @@ -1,441 +0,0 @@ -"""Tests for CLUSTER and MERGE operations.""" - -import pytest - -from giql import GIQLEngine - - -@pytest.fixture -def cluster_test_data_csv(tmp_path): - """Create sample data for cluster testing.""" - csv_content = """ - id,chromosome,start_pos,end_pos,name - 1,chr1,100,200,f1 - 2,chr1,180,250,f2 - 3,chr1,250,500,f3 - 4,chr1,501,1000,f4 - 5,chr2,100,200,f5 - 6,chr2,300,400,f6 - """ - csv_path = tmp_path / "features.csv" - csv_path.write_text(csv_content.strip()) - return str(csv_path) - - -@pytest.fixture -def stranded_test_data_csv(tmp_path): - """Create stranded data for cluster testing.""" - csv_content = """ - id,chromosome,start_pos,end_pos,strand,name - 1,chr1,100,200,+,f1 - 2,chr1,180,250,+,f2 - 3,chr1,200,300,-,f3 - 4,chr1,250,350,-,f4 - 5,chr1,400,500,+,f5 - """ - csv_path = tmp_path / "stranded_features.csv" - csv_path.write_text(csv_content.strip()) - return str(csv_path) - - -@pytest.fixture -def duckdb_cluster_engine(cluster_test_data_csv): - """DuckDB engine with cluster test data loaded.""" - engine = GIQLEngine(target_dialect="duckdb", verbose=True) - engine.load_csv("features", cluster_test_data_csv) - engine.register_table_schema( - "features", - { - "id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - }, - genomic_column="interval", - ) - yield engine - engine.close() - - -@pytest.fixture -def duckdb_stranded_engine(stranded_test_data_csv): - """DuckDB engine with stranded test data loaded.""" - engine = GIQLEngine(target_dialect="duckdb", verbose=True) - engine.load_csv("stranded_features", stranded_test_data_csv) - engine.register_table_schema( - "stranded_features", - { - "id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "strand": "VARCHAR", - "name": "VARCHAR", - }, - genomic_column="interval", - strand_col="strand", - ) - yield engine - engine.close() - - -class TestCluster: - """Tests for CLUSTER window function.""" - - def test_basic_cluster(self, duckdb_cluster_engine, to_df): - """Test basic CLUSTER operation.""" - result = to_df( - duckdb_cluster_engine.execute(""" - SELECT - id, - chromosome, - start_pos, - end_pos, - name, - CLUSTER(interval) AS cluster_id - FROM features - ORDER BY chromosome, start_pos - """) - ) - - # Expected clusters: - # chr1: features 1,2,3 are cluster 1 (overlapping/bookended) - # chr1: feature 4 is cluster 2 (gap at 501) - # chr2: feature 5 is cluster 1 - # chr2: feature 6 is cluster 2 (gap at 300) - - assert len(result) == 6 - - # Check chr1 clusters - chr1_results = result[result["chromosome"] == "chr1"] - assert chr1_results.iloc[0]["cluster_id"] == chr1_results.iloc[1]["cluster_id"] - assert chr1_results.iloc[1]["cluster_id"] == chr1_results.iloc[2]["cluster_id"] - assert chr1_results.iloc[2]["cluster_id"] != chr1_results.iloc[3]["cluster_id"] - - # Check chr2 clusters 
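-        # chr2: f5 (100-200) and f6 (300-400) are separated by a 100bp gap,
-        # so they fall into different clusters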
- chr2_results = result[result["chromosome"] == "chr2"] - assert chr2_results.iloc[0]["cluster_id"] != chr2_results.iloc[1]["cluster_id"] - - def test_cluster_with_distance(self, duckdb_cluster_engine, to_df): - """Test CLUSTER with distance parameter.""" - result = to_df( - duckdb_cluster_engine.execute(""" - SELECT - id, - chromosome, - start_pos, - end_pos, - name, - CLUSTER(interval, 100) AS cluster_id - FROM features - ORDER BY chromosome, start_pos - """) - ) - - # With distance=100, chr1 features 1,2,3,4 should all be in same cluster - # (gap of 1bp at position 501 is within 100bp tolerance) - chr1_results = result[result["chromosome"] == "chr1"] - cluster_ids = chr1_results["cluster_id"].tolist() - assert len(set(cluster_ids)) == 1 # All in same cluster - - def test_stranded_cluster(self, duckdb_stranded_engine, to_df): - """Test CLUSTER with stranded=true.""" - result = to_df( - duckdb_stranded_engine.execute(""" - SELECT - id, - chromosome, - start_pos, - end_pos, - strand, - name, - CLUSTER(interval, stranded=true) AS cluster_id - FROM stranded_features - ORDER BY chromosome, start_pos - """) - ) - - # Features should cluster only within the same strand: - # + strand: f1,f2 overlap -> cluster 1, f5 is separate -> cluster 2 - # - strand: f3,f4 overlap -> cluster 1 - # Note: cluster_id numbering restarts for each partition (strand) - - assert len(result) == 5 - - # Extract features - f1 = result[result["id"] == 1].iloc[0] - f2 = result[result["id"] == 2].iloc[0] - f3 = result[result["id"] == 3].iloc[0] - f4 = result[result["id"] == 4].iloc[0] - f5 = result[result["id"] == 5].iloc[0] - - # Check that f1 and f2 (both +, overlapping) have same cluster_id - assert f1["cluster_id"] == f2["cluster_id"] - assert f1["strand"] == "+" - assert f2["strand"] == "+" - - # Check that f3 and f4 (both -, overlapping) have same cluster_id - assert f3["cluster_id"] == f4["cluster_id"] - assert f3["strand"] == "-" - assert f4["strand"] == "-" - - # Check that f5 (+ strand, separated) has different cluster from f1/f2 - assert f5["cluster_id"] != f1["cluster_id"] - assert f5["strand"] == "+" - - # Verify stranded clustering works: compare with non-stranded - result_nonstranded = to_df( - duckdb_stranded_engine.execute(""" - SELECT - id, - CLUSTER(interval) AS cluster_id - FROM stranded_features - ORDER BY id - """) - ) - - # Without stranded, f1-f4 should all be in same cluster (overlapping) - ns_f1 = result_nonstranded[result_nonstranded["id"] == 1].iloc[0] - ns_f2 = result_nonstranded[result_nonstranded["id"] == 2].iloc[0] - ns_f3 = result_nonstranded[result_nonstranded["id"] == 3].iloc[0] - ns_f4 = result_nonstranded[result_nonstranded["id"] == 4].iloc[0] - - assert ns_f1["cluster_id"] == ns_f2["cluster_id"] - assert ns_f2["cluster_id"] == ns_f3["cluster_id"] - assert ns_f3["cluster_id"] == ns_f4["cluster_id"] - - def test_cluster_in_cte(self, duckdb_cluster_engine, to_df): - """Test CLUSTER operation inside a CTE.""" - result = to_df( - duckdb_cluster_engine.execute(""" - WITH clustered_features AS ( - SELECT - id, - chromosome, - start_pos, - end_pos, - name, - CLUSTER(interval) AS cluster_id - FROM features - ) - SELECT * - FROM clustered_features - WHERE cluster_id = 1 - ORDER BY chromosome, start_pos - """) - ) - - # Should return features in cluster 1 from each chromosome - assert len(result) > 0 - assert all("cluster_id" in row for _, row in result.iterrows()) - - def test_cluster_in_cte_with_aggregation(self, duckdb_cluster_engine, to_df): - """Test CLUSTER in CTE with aggregation in outer 
query.""" - result = to_df( - duckdb_cluster_engine.execute(""" - WITH clustered_features AS ( - SELECT - chromosome, - start_pos, - end_pos, - CLUSTER(interval) AS cluster_id - FROM features - ) - SELECT - chromosome, - cluster_id, - COUNT(*) as interval_count, - MIN(start_pos) as min_start, - MAX(end_pos) as max_end - FROM clustered_features - GROUP BY chromosome, cluster_id - ORDER BY chromosome, cluster_id - """) - ) - - # chr1 should have 2 clusters, chr2 should have 2 clusters - assert len(result) == 4 - - chr1_results = result[result["chromosome"] == "chr1"] - assert len(chr1_results) == 2 - # First cluster should have 3 intervals (f1, f2, f3) - assert chr1_results.iloc[0]["interval_count"] == 3 - # Second cluster should have 1 interval (f4) - assert chr1_results.iloc[1]["interval_count"] == 1 - - -class TestMerge: - """Tests for MERGE aggregate function.""" - - def test_basic_merge(self, duckdb_cluster_engine, to_df): - """Test basic MERGE operation.""" - result = to_df( - duckdb_cluster_engine.execute(""" - SELECT MERGE(interval) - FROM features - """) - ) - - # Expected merged intervals: - # chr1: features 1,2,3 merge into [100, 500] - # chr1: feature 4 stays as [501, 1000] - # chr2: feature 5 stays as [100, 200] - # chr2: feature 6 stays as [300, 400] - - assert len(result) == 4 - - # Check chr1 merged intervals - chr1_results = result[result["chromosome"] == "chr1"].sort_values("start_pos") - assert len(chr1_results) == 2 - assert chr1_results.iloc[0]["start_pos"] == 100 - assert chr1_results.iloc[0]["end_pos"] == 500 - assert chr1_results.iloc[1]["start_pos"] == 501 - assert chr1_results.iloc[1]["end_pos"] == 1000 - - # Check chr2 stays separate - chr2_results = result[result["chromosome"] == "chr2"].sort_values("start_pos") - assert len(chr2_results) == 2 - assert chr2_results.iloc[0]["start_pos"] == 100 - assert chr2_results.iloc[0]["end_pos"] == 200 - assert chr2_results.iloc[1]["start_pos"] == 300 - assert chr2_results.iloc[1]["end_pos"] == 400 - - def test_merge_with_distance(self, duckdb_cluster_engine, to_df): - """Test MERGE with distance parameter.""" - result = to_df( - duckdb_cluster_engine.execute(""" - SELECT MERGE(interval, 100) - FROM features - """) - ) - - # With distance=100, chr1 features 1-4 should merge into one interval - chr1_results = result[result["chromosome"] == "chr1"] - assert len(chr1_results) == 1 - assert chr1_results.iloc[0]["start_pos"] == 100 - assert chr1_results.iloc[0]["end_pos"] == 1000 - - def test_merge_with_aggregation(self, duckdb_cluster_engine, to_df): - """Test MERGE with additional aggregation columns.""" - result = to_df( - duckdb_cluster_engine.execute(""" - SELECT MERGE(interval), COUNT(*) as feature_count - FROM features - """) - ) - - # chr1 should have 2 merged intervals with counts - chr1_results = result[result["chromosome"] == "chr1"].sort_values("start_pos") - assert len(chr1_results) == 2 - assert chr1_results.iloc[0]["feature_count"] == 3 # f1, f2, f3 merged - assert chr1_results.iloc[1]["feature_count"] == 1 # f4 alone - - def test_stranded_merge(self, duckdb_stranded_engine, to_df): - """Test MERGE with stranded=true.""" - result = to_df( - duckdb_stranded_engine.execute(""" - SELECT MERGE(interval, stranded=true) - FROM stranded_features - """) - ) - - # + strand: f1,f2 merge -> [100,250], f5 stays -> [400,500] - # - strand: f3,f4 merge -> [200,350] - assert len(result) == 3 - - plus_strand = result[result["strand"] == "+"].sort_values("start_pos") - assert len(plus_strand) == 2 - assert 
plus_strand.iloc[0]["start_pos"] == 100 - assert plus_strand.iloc[0]["end_pos"] == 250 - assert plus_strand.iloc[1]["start_pos"] == 400 - assert plus_strand.iloc[1]["end_pos"] == 500 - - minus_strand = result[result["strand"] == "-"] - assert len(minus_strand) == 1 - assert minus_strand.iloc[0]["start_pos"] == 200 - assert minus_strand.iloc[0]["end_pos"] == 350 - - def test_merge_in_cte(self, duckdb_cluster_engine, to_df): - """Test MERGE operation inside a CTE.""" - result = to_df( - duckdb_cluster_engine.execute(""" - WITH merged_intervals AS ( - SELECT MERGE(interval) - FROM features - ) - SELECT * - FROM merged_intervals - ORDER BY chromosome, start_pos - """) - ) - - # Should have same results as basic merge - assert len(result) == 4 - - chr1_results = result[result["chromosome"] == "chr1"].sort_values("start_pos") - assert len(chr1_results) == 2 - assert chr1_results.iloc[0]["start_pos"] == 100 - assert chr1_results.iloc[0]["end_pos"] == 500 - - def test_merge_in_cte_with_aggregation_and_filter( - self, duckdb_cluster_engine, to_df - ): - """Test MERGE in CTE with aggregation and filtering in outer query.""" - result = to_df( - duckdb_cluster_engine.execute(""" - WITH merged_intervals AS ( - SELECT - MERGE(interval), - COUNT(*) as interval_count - FROM features - ) - SELECT * - FROM merged_intervals - WHERE interval_count > 1 - ORDER BY chromosome, start_pos - """) - ) - - # Only chr1's first merged interval has count > 1 (3 intervals merged) - assert len(result) == 1 - assert result.iloc[0]["chromosome"] == "chr1" - assert result.iloc[0]["start_pos"] == 100 - assert result.iloc[0]["end_pos"] == 500 - assert result.iloc[0]["interval_count"] == 3 - - def test_merge_in_cte_with_distance_and_aggregation( - self, duckdb_cluster_engine, to_df - ): - """Test MERGE with distance parameter in CTE with aggregation.""" - result = to_df( - duckdb_cluster_engine.execute(""" - WITH merged_intervals AS ( - SELECT - MERGE(interval, 100), - COUNT(*) as interval_count, - AVG(id) as avg_id - FROM features - ) - SELECT * - FROM merged_intervals - WHERE interval_count >= 2 - ORDER BY chromosome, start_pos - """) - ) - - # With distance=100, chr1 all 4 features merge, chr2 features also merge - # (gap between chr2 features is exactly 100bp) - assert len(result) == 2 - - # Check chr1 merged interval - chr1_result = result[result["chromosome"] == "chr1"].iloc[0] - assert chr1_result["interval_count"] == 4 - assert chr1_result["start_pos"] == 100 - assert chr1_result["end_pos"] == 1000 - - # Check chr2 merged interval - chr2_result = result[result["chromosome"] == "chr2"].iloc[0] - assert chr2_result["interval_count"] == 2 - assert chr2_result["start_pos"] == 100 - assert chr2_result["end_pos"] == 400 diff --git a/tests/test_distance_transpilation.py b/tests/test_distance_transpilation.py index 77405cc..77d434c 100644 --- a/tests/test_distance_transpilation.py +++ b/tests/test_distance_transpilation.py @@ -1,14 +1,12 @@ """Transpilation tests for DISTANCE operator SQL generation. -Tests verify that DISTANCE() is correctly transpiled to SQL CASE expressions -across different SQL dialects (DuckDB, SQLite, PostgreSQL). +Tests verify that DISTANCE() is correctly transpiled to SQL CASE expressions. 
""" from sqlglot import parse_one from giql.dialect import GIQLDialect from giql.generators import BaseGIQLGenerator -from giql.generators import GIQLDuckDBGenerator class TestDistanceTranspilation: @@ -26,7 +24,7 @@ def test_distance_transpilation_duckdb(self): """ ast = parse_one(sql, dialect=GIQLDialect) - generator = GIQLDuckDBGenerator() + generator = BaseGIQLGenerator() output = generator.generate(ast) expected = """SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" THEN 0 WHEN a."end_pos" <= b."start_pos" THEN (b."start_pos" - a."end_pos") ELSE (a."start_pos" - b."end_pos") END AS dist FROM features_a AS a CROSS JOIN features_b AS b""" @@ -84,7 +82,7 @@ def test_distance_transpilation_signed_duckdb(self): """ ast = parse_one(sql, dialect=GIQLDialect) - generator = GIQLDuckDBGenerator() + generator = BaseGIQLGenerator() output = generator.generate(ast) # Signed distance: upstream (B before A) returns negative, diff --git a/tests/test_engine.py b/tests/test_engine.py deleted file mode 100644 index 2cff3a1..0000000 --- a/tests/test_engine.py +++ /dev/null @@ -1,480 +0,0 @@ -import tempfile - -from hypothesis import given -from hypothesis import settings -from hypothesis import strategies as st - -from giql import GIQLEngine - - -class TestGIQLEngine: - def test_engine_initialization_duckdb(self): - """ - GIVEN GIQLEngine with duckdb dialect - WHEN initializing engine - THEN should create connection successfully - """ - engine = GIQLEngine(target_dialect="duckdb") - assert engine.target_dialect == "duckdb" - assert engine.conn is not None - engine.close() - - def test_engine_initialization_sqlite(self): - """ - GIVEN GIQLEngine with sqlite dialect - WHEN initializing engine - THEN should create connection successfully - """ - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp: - engine = GIQLEngine(target_dialect="sqlite", db_path=tmp.name) - assert engine.target_dialect == "sqlite" - assert engine.conn is not None - engine.close() - - def test_engine_context_manager(self): - """ - GIVEN GIQLEngine used as context manager - WHEN exiting context - THEN should close connection automatically - """ - with GIQLEngine() as engine: - assert engine.conn is not None - - def test_load_csv_and_query_duckdb(self, tmp_path, to_df): - """ - GIVEN CSV data loaded into DuckDB - WHEN executing GIQL query - THEN should return correct results - """ - # Create sample CSV - csv_content = """id,chromosome,start_pos,end_pos,ref,alt -1,chr1,1500,1600,A,T -2,chr1,10500,10600,G,C -3,chr2,500,600,C,G -""" - csv_path = tmp_path / "variants.csv" - csv_path.write_text(csv_content) - - with GIQLEngine(target_dialect="duckdb") as engine: - engine.load_csv("variants", str(csv_path)) - - # Query using INTERSECTS - cursor = engine.execute( - "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" - ) - result = to_df(cursor) - - assert len(result) == 1 - assert result.iloc[0]["id"] == 1 - - def test_load_csv_and_query_sqlite(self, tmp_path, to_df): - """ - GIVEN CSV data loaded into SQLite - WHEN executing GIQL query - THEN should return correct results - """ - # Create sample CSV - csv_content = """id,chromosome,start_pos,end_pos,ref,alt -1,chr1,1500,1600,A,T -2,chr1,10500,10600,G,C -3,chr2,500,600,C,G -""" - csv_path = tmp_path / "variants.csv" - csv_path.write_text(csv_content) - - with GIQLEngine(target_dialect="sqlite") as engine: - engine.load_csv("variants", str(csv_path)) - - # Query using INTERSECTS - result = 
to_df( - engine.execute( - "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" - ) - ) - - assert len(result) == 1 - assert result.iloc[0]["id"] == 1 - - def test_intersects_any_query(self, tmp_path, to_df): - """ - GIVEN variants data - WHEN querying with INTERSECTS ANY - THEN should return variants overlapping any range - """ - csv_content = """id,chromosome,start_pos,end_pos -1,chr1,1500,1600 -2,chr1,10500,10600 -3,chr2,500,600 -""" - csv_path = tmp_path / "variants.csv" - csv_path.write_text(csv_content) - - with GIQLEngine(target_dialect="duckdb") as engine: - engine.load_csv("variants", str(csv_path)) - - result = to_df( - engine.execute( - "SELECT * FROM variants " - "WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:400-700')" - ) - ) - - assert len(result) == 2 - assert set(result["id"]) == {1, 3} - - def test_contains_query(self, tmp_path, to_df): - """ - GIVEN variants data - WHEN querying with CONTAINS - THEN should return variants containing the point - """ - csv_content = """id,chromosome,start_pos,end_pos -1,chr1,1500,1600 -2,chr1,10500,10600 -""" - csv_path = tmp_path / "variants.csv" - csv_path.write_text(csv_content) - - with GIQLEngine(target_dialect="duckdb") as engine: - engine.load_csv("variants", str(csv_path)) - - result = to_df( - engine.execute( - "SELECT * FROM variants WHERE interval CONTAINS 'chr1:1550'" - ) - ) - - assert len(result) == 1 - assert result.iloc[0]["id"] == 1 - - def test_within_query(self, tmp_path, to_df): - """ - GIVEN variants data - WHEN querying with WITHIN - THEN should return variants within the range - """ - csv_content = """id,chromosome,start_pos,end_pos -1,chr1,1500,1600 -2,chr1,10500,10600 -3,chr1,15000,15100 -""" - csv_path = tmp_path / "variants.csv" - csv_path.write_text(csv_content) - - with GIQLEngine(target_dialect="duckdb") as engine: - engine.load_csv("variants", str(csv_path)) - - result = to_df( - engine.execute( - "SELECT * FROM variants WHERE interval WITHIN 'chr1:1000-11000'" - ) - ) - - assert len(result) == 2 - assert set(result["id"]) == {1, 2} - - def test_verbose_mode(self, tmp_path, to_df): - """ - GIVEN engine with verbose mode - WHEN executing query - THEN should print transpiled SQL - """ - csv_content = """id,chromosome,start_pos,end_pos -1,chr1,1500,1600 -""" - csv_path = tmp_path / "variants.csv" - csv_path.write_text(csv_content) - - with GIQLEngine(target_dialect="duckdb", verbose=True) as engine: - engine.load_csv("variants", str(csv_path)) - result = to_df( - engine.execute( - "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" - ) - ) - assert len(result) == 1 - - @given( - chrom_col=st.sampled_from(["chromosome", "chr", "chrom", "contig", "seqname"]), - start_col=st.sampled_from(["start_pos", "start", "begin", "pos", "chromStart"]), - end_col=st.sampled_from(["end_pos", "end", "stop", "chromEnd"]), - strand_col=st.sampled_from(["strand", "str", "orientation", "direction"]), - ) - def test_custom_genomic_columns( - self, chrom_col, start_col, end_col, strand_col, to_df - ): - """ - GIVEN CSV data with custom genomic column names - WHEN registering schema with custom column mappings - THEN queries should work correctly with any valid column names - """ - # Create temporary directory and CSV with custom column names - with tempfile.TemporaryDirectory() as tmp_dir: - csv_content = f"""id,{chrom_col},{start_col},{end_col},{strand_col},name -1,chr1,1500,1600,+,variant1 -2,chr1,10500,10600,-,variant2 -3,chr2,500,600,+,variant3 -4,chr1,1400,1700,+,variant4 -""" - csv_path = 
f"{tmp_dir}/custom_variants.csv" - with open(csv_path, "w") as f: - f.write(csv_content) - - with GIQLEngine(target_dialect="duckdb", verbose=False) as engine: - engine.load_csv("variants", csv_path) - - # Register schema with custom column names - engine.register_table_schema( - "variants", - { - "id": "INTEGER", - chrom_col: "VARCHAR", - start_col: "BIGINT", - end_col: "BIGINT", - strand_col: "VARCHAR", - "name": "VARCHAR", - }, - genomic_column="interval", - chrom_col=chrom_col, - start_col=start_col, - end_col=end_col, - strand_col=strand_col, - ) - - # Test INTERSECTS query - result = to_df( - engine.execute( - "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" - ) - ) - assert len(result) == 2 - assert set(result["id"]) == {1, 4} - - # Test CLUSTER query (uses genomic columns internally) - result = to_df( - engine.execute( - "SELECT *, CLUSTER(interval) AS cluster_id FROM variants ORDER BY id" - ) - ) - assert len(result) == 4 - # Variants 1 and 4 should cluster together (overlapping on chr1) - assert result.iloc[0]["cluster_id"] == result.iloc[3]["cluster_id"] - # Variant 2 should be in different cluster (no overlap) - assert result.iloc[1]["cluster_id"] != result.iloc[0]["cluster_id"] - - # Test stranded CLUSTER query - result = to_df( - engine.execute("""SELECT *, CLUSTER(interval, stranded=TRUE) AS cluster_id - FROM variants ORDER BY id""") - ) - assert len(result) == 4 - # With stranded=TRUE, variants 1 and 4 should cluster together (both + and overlapping) - assert result.iloc[0]["cluster_id"] == result.iloc[3]["cluster_id"] - # Note: cluster_ids are independent per (chromosome, strand) partition - # So variants on different strands CAN have the same cluster_id number - assert "cluster_id" in result.columns - - # Test MERGE query - result = to_df(engine.execute("SELECT MERGE(interval) FROM variants")) - # Should merge overlapping intervals - assert len(result) >= 1 - - @given( - # Table 1 (variants) column names - v_chrom_col=st.sampled_from(["chromosome", "chr", "chrom"]), - v_start_col=st.sampled_from(["start_pos", "start", "begin"]), - v_end_col=st.sampled_from(["end_pos", "end", "stop"]), - # Table 2 (features) column names (use different names to ensure they're distinct) - f_chrom_col=st.sampled_from(["seqname", "contig", "chr_name"]), - f_start_col=st.sampled_from(["pos", "chromStart", "feature_start"]), - f_end_col=st.sampled_from(["chromEnd", "feature_end", "terminus"]), - ) - @settings(deadline=None) - def test_join_with_different_schemas( - self, - v_chrom_col, - v_start_col, - v_end_col, - f_chrom_col, - f_start_col, - f_end_col, - to_df, - ): - """ - GIVEN two tables with different custom genomic column schemas - WHEN joining them using INTERSECTS - THEN queries should correctly use each table's custom column names - """ - with tempfile.TemporaryDirectory() as tmp_dir: - # Create variants table CSV - variants_csv = f"""id,{v_chrom_col},{v_start_col},{v_end_col},name -1,chr1,1500,1600,var1 -2,chr1,10500,10600,var2 -3,chr2,500,600,var3 -""" - variants_path = f"{tmp_dir}/variants.csv" - with open(variants_path, "w") as f: - f.write(variants_csv) - - # Create features table CSV with DIFFERENT column names - features_csv = f"""id,{f_chrom_col},{f_start_col},{f_end_col},type -1,chr1,1000,2000,exon -2,chr1,10000,11000,intron -3,chr2,400,700,promoter -""" - features_path = f"{tmp_dir}/features.csv" - with open(features_path, "w") as f: - f.write(features_csv) - - with GIQLEngine(target_dialect="duckdb", verbose=False) as engine: - # Load both tables - 
engine.load_csv("variants", variants_path) - engine.load_csv("features", features_path) - - # Register schemas with different column names - engine.register_table_schema( - "variants", - { - "id": "INTEGER", - v_chrom_col: "VARCHAR", - v_start_col: "BIGINT", - v_end_col: "BIGINT", - "name": "VARCHAR", - }, - genomic_column="interval", - chrom_col=v_chrom_col, - start_col=v_start_col, - end_col=v_end_col, - ) - - engine.register_table_schema( - "features", - { - "id": "INTEGER", - f_chrom_col: "VARCHAR", - f_start_col: "BIGINT", - f_end_col: "BIGINT", - "type": "VARCHAR", - }, - genomic_column="region", - chrom_col=f_chrom_col, - start_col=f_start_col, - end_col=f_end_col, - ) - - # Test JOIN with INTERSECTS on both tables - result = to_df( - engine.execute(""" - SELECT v.name, f.type - FROM variants v - JOIN features f ON v.interval INTERSECTS f.region - ORDER BY v.id - """) - ) - - # Variant 1 (chr1:1500-1600) intersects Feature 1 (chr1:1000-2000) - # Variant 2 (chr1:10500-10600) intersects Feature 2 (chr1:10000-11000) - # Variant 3 (chr2:500-600) intersects Feature 3 (chr2:400-700) - assert len(result) == 3 - assert list(result["name"]) == ["var1", "var2", "var3"] - assert list(result["type"]) == ["exon", "intron", "promoter"] - - # Test LEFT JOIN to verify schema resolution works - result = to_df( - engine.execute(""" - SELECT v.id, v.name, f.type - FROM variants v - LEFT JOIN features f ON v.interval INTERSECTS f.region - WHERE v.id = 1 - """) - ) - assert len(result) == 1 - assert result.iloc[0]["name"] == "var1" - assert result.iloc[0]["type"] == "exon" - - # Test WHERE clause with INTERSECTS on specific table - result = to_df( - engine.execute(""" - SELECT v.id, v.name - FROM variants v, features f - WHERE v.interval INTERSECTS f.region - AND v.interval INTERSECTS 'chr1:1000-2000' - """) - ) - # Only variant 1 intersects both feature and the specified range - assert len(result) == 1 - assert result.iloc[0]["name"] == "var1" - - def test_transpile_returns_sql_string(self): - """ - GIVEN GIQLEngine with a GIQL query - WHEN calling transpile() - THEN should return SQL string without executing it - """ - with GIQLEngine(target_dialect="duckdb") as engine: - sql = engine.transpile( - "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" - ) - - assert isinstance(sql, str) - assert len(sql) > 0 - assert "SELECT" in sql.upper() - # Should contain genomic comparison logic - assert "chromosome" in sql or "start_pos" in sql or "end_pos" in sql - - def test_transpile_different_dialects(self): - """ - GIVEN GIQLEngine with different SQL dialects - WHEN calling transpile() - THEN should return SQL appropriate for each dialect - """ - query = "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" - - for dialect in ["duckdb", "sqlite"]: - with GIQLEngine(target_dialect=dialect) as engine: - sql = engine.transpile(query) - assert isinstance(sql, str) - assert len(sql) > 0 - assert "SELECT" in sql.upper() - - def test_transpile_verbose_mode(self, tmp_path, capsys): - """ - GIVEN GIQLEngine with verbose mode enabled - WHEN calling transpile() - THEN should print transpilation details - """ - with GIQLEngine(target_dialect="duckdb", verbose=True) as engine: - sql = engine.transpile( - "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" - ) - - captured = capsys.readouterr() - assert "Target Dialect: duckdb" in captured.out - assert "Original GIQL:" in captured.out - assert "Transpiled SQL:" in captured.out - assert isinstance(sql, str) - - def 
test_execute_uses_transpile(self, tmp_path, to_df): - """ - GIVEN GIQLEngine after refactoring - WHEN calling execute() - THEN should use transpile() internally and execute correctly - """ - csv_content = """id,chromosome,start_pos,end_pos -1,chr1,1500,1600 -2,chr1,10500,10600 -""" - csv_path = tmp_path / "variants.csv" - csv_path.write_text(csv_content) - - with GIQLEngine(target_dialect="duckdb") as engine: - engine.load_csv("variants", str(csv_path)) - - # execute() should internally call transpile() - cursor = engine.execute( - "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'" - ) - result = to_df(cursor) - - assert len(result) == 1 - assert result.iloc[0]["id"] == 1 diff --git a/tests/test_nearest_edge_cases.py b/tests/test_nearest_edge_cases.py deleted file mode 100644 index 31556da..0000000 --- a/tests/test_nearest_edge_cases.py +++ /dev/null @@ -1,633 +0,0 @@ -"""Edge case tests for NEAREST operator. - -Tests verify correct handling of boundary conditions, error cases, -and unusual inputs for the NEAREST operator. -""" - -import pytest -from hypothesis import assume -from hypothesis import given -from hypothesis import strategies as st - -from giql import GIQLEngine - - -@pytest.fixture -def duckdb_engine_with_edge_case_data(): - """Create DuckDB engine with data designed for edge case testing.""" - engine = GIQLEngine(target_dialect="duckdb") - - # Create peaks table - engine.conn.execute(""" - CREATE TABLE peaks ( - peak_id INTEGER, - chromosome VARCHAR, - start_pos INTEGER, - end_pos INTEGER - ) - """) - - # Create genes table - engine.conn.execute(""" - CREATE TABLE genes ( - gene_id INTEGER, - gene_name VARCHAR, - chromosome VARCHAR, - start_pos INTEGER, - end_pos INTEGER - ) - """) - - # Insert test data - # Peak 1: chr1:1000-1100 - # Peak 2: chr2:5000-5100 (different chromosome, no genes) - # Peak 3: chr1:10000-10100 - engine.conn.execute(""" - INSERT INTO peaks VALUES - (1, 'chr1', 1000, 1100), - (2, 'chr2', 5000, 5100), - (3, 'chr1', 10000, 10100) - """) - - # Genes with specific distance relationships - # GENE_A and GENE_B are both 500bp from Peak 1 (tie scenario) - # GENE_C overlaps Peak 1 (distance=0) - # GENE_D, GENE_E, GENE_F on chr1 but far from Peak 3 - engine.conn.execute(""" - INSERT INTO genes VALUES - (1, 'GENE_A', 'chr1', 1600, 1700), - (2, 'GENE_B', 'chr1', 400, 500), - (3, 'GENE_C', 'chr1', 1050, 1150), - (4, 'GENE_D', 'chr1', 10500, 10600), - (5, 'GENE_E', 'chr1', 11000, 11100), - (6, 'GENE_F', 'chr1', 12000, 12100) - """) - - # Register schema - engine.register_table_schema( - "peaks", - { - "peak_id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "INTEGER", - "end_pos": "INTEGER", - }, - genomic_column="interval", - ) - engine.register_table_schema( - "genes", - { - "gene_id": "INTEGER", - "gene_name": "VARCHAR", - "chromosome": "VARCHAR", - "start_pos": "INTEGER", - "end_pos": "INTEGER", - }, - genomic_column="interval", - ) - - return engine - - -class TestNearestEdgeCases: - """Edge case tests for NEAREST operator.""" - - def test_k_equals_zero(self, duckdb_engine_with_edge_case_data): - """ - GIVEN a NEAREST query with k=0 - WHEN executing the query - THEN should return no results (LIMIT 0) - """ - engine = duckdb_engine_with_edge_case_data - - cursor = engine.execute(""" - SELECT - peaks.peak_id, - nearest.gene_name - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=0) AS nearest - WHERE peaks.peak_id = 1 - """) - - rows = cursor.fetchall() - assert len(rows) == 0, "k=0 should return no results" - - 
def test_ties_multiple_features_same_distance( - self, duckdb_engine_with_edge_case_data - ): - """ - GIVEN multiple genes at the same distance from a peak - WHEN querying for k=1 nearest - THEN should return at least 1 result (behavior may vary for ties) - """ - engine = duckdb_engine_with_edge_case_data - - cursor = engine.execute(""" - SELECT - peaks.peak_id, - nearest.gene_name, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest - WHERE peaks.peak_id = 1 - ORDER BY nearest.distance, nearest.gene_name - """) - - rows = cursor.fetchall() - - # Should have at least 1 result - assert len(rows) >= 1, "Should return at least one result for k=1" - - # All results should be at the same distance (ties) - # Note: GENE_A and GENE_B are both 500bp away, GENE_C overlaps (0bp) - # So the closest should be GENE_C at distance 0 - assert rows[0][1] == "GENE_C", ( - f"Closest gene should be GENE_C (overlapping), got {rows[0][1]}" - ) - assert rows[0][2] == 0, f"Distance should be 0 (overlap), got {rows[0][2]}" - - def test_empty_result_set_different_chromosome( - self, duckdb_engine_with_edge_case_data - ): - """ - GIVEN a peak on a chromosome with no genes - WHEN querying for nearest genes - THEN should return empty result set - """ - engine = duckdb_engine_with_edge_case_data - - cursor = engine.execute(""" - SELECT - peaks.peak_id, - nearest.gene_name, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=10) AS nearest - WHERE peaks.peak_id = 2 - """) - - rows = cursor.fetchall() - - # Peak 2 is on chr2, but all genes are on chr1 - # Should return empty result set - assert len(rows) == 0, ( - "Should return empty result for peak on chromosome with no genes" - ) - - def test_overlapping_features_distance_zero(self, duckdb_engine_with_edge_case_data): - """ - GIVEN a gene that overlaps a peak - WHEN querying for nearest genes - THEN should return distance=0 for overlapping gene - """ - engine = duckdb_engine_with_edge_case_data - - cursor = engine.execute(""" - SELECT - peaks.peak_id, - nearest.gene_name, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=5) AS nearest - WHERE peaks.peak_id = 1 - ORDER BY nearest.distance - """) - - rows = cursor.fetchall() - - # GENE_C (chr1:1050-1150) overlaps Peak 1 (chr1:1000-1100) - assert len(rows) > 0, "Should find genes" - - # First result should be the overlapping gene with distance 0 - assert rows[0][1] == "GENE_C", ( - f"First result should be GENE_C (overlapping), got {rows[0][1]}" - ) - assert rows[0][2] == 0, ( - f"Distance should be 0 for overlapping gene, got {rows[0][2]}" - ) - - def test_missing_reference_in_standalone_mode( - self, duckdb_engine_with_edge_case_data - ): - """ - GIVEN a standalone NEAREST query without reference parameter - WHEN parsing/executing the query - THEN should raise an error (reference is required in standalone mode) - """ - engine = duckdb_engine_with_edge_case_data - - # Standalone mode (FROM NEAREST(...)) without reference parameter - # This should fail because we can't determine the reference point - with pytest.raises(Exception) as exc_info: - engine.execute(""" - SELECT * - FROM NEAREST(genes, k=3) - """) - - # Should get an error about missing reference - # The exact error message may vary, but it should mention reference - error_msg = str(exc_info.value).lower() - # Could be a ValueError, AttributeError, or SQL error depending on where it fails - # Just verify it fails - 
the specific error type will be improved in T065 - - def test_missing_target_table_in_schema(self, duckdb_engine_with_edge_case_data): - """ - GIVEN a NEAREST query referencing a non-existent table - WHEN executing the query - THEN should raise an error about missing table - """ - engine = duckdb_engine_with_edge_case_data - - # Query references 'nonexistent_table' which doesn't exist - with pytest.raises(Exception) as exc_info: - engine.execute(""" - SELECT * - FROM peaks - CROSS JOIN LATERAL NEAREST(nonexistent_table, reference=peaks.interval, k=3) AS nearest - """) - - # Should get an error about the missing table - error_msg = str(exc_info.value).lower() - # DuckDB should raise an error about the table not existing - - def test_invalid_literal_range_format(self, duckdb_engine_with_edge_case_data): - """ - GIVEN a NEAREST query with invalid literal range format - WHEN parsing/executing the query - THEN should raise an error about invalid range format - """ - engine = duckdb_engine_with_edge_case_data - - # Invalid range formats - # Note: "chr1:1000" is valid (point format), so not included - invalid_ranges = [ - "chr1:not-a-number", # Non-numeric coordinates - "invalid-format", # No colon separator - "chr1:2000-1000", # End before start (start >= end) - ] - - for invalid_range in invalid_ranges: - with pytest.raises(ValueError) as exc_info: - engine.execute(f""" - SELECT * - FROM NEAREST(genes, reference='{invalid_range}', k=3) - """) - - # Should get a ValueError about invalid range format - error_msg = str(exc_info.value).lower() - assert "invalid" in error_msg or "must be less" in error_msg, ( - f"Error message should mention invalid format or start/end issue: {exc_info.value}" - ) - - def test_nearest_with_additional_where_clause( - self, duckdb_engine_with_edge_case_data - ): - """ - GIVEN a NEAREST query with additional WHERE clause filtering - WHEN executing the query - THEN should apply both NEAREST and WHERE filters - """ - engine = duckdb_engine_with_edge_case_data - - cursor = engine.execute(""" - SELECT - peaks.peak_id, - nearest.gene_name, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=10) AS nearest - WHERE peaks.peak_id = 1 AND nearest.distance < 600 - ORDER BY nearest.distance - """) - - rows = cursor.fetchall() - - # Should find genes within 600bp of Peak 1 - # GENE_C overlaps (0bp) and GENE_A/GENE_B are 500bp away - assert len(rows) >= 1, "Should find genes within 600bp" - - # All returned genes should have distance < 600 - for row in rows: - assert row[2] < 600, f"All distances should be < 600bp, got {row[2]}" - - def test_nearest_with_cte(self, duckdb_engine_with_edge_case_data): - """ - GIVEN a NEAREST query using a CTE for multiple query points - WHEN executing the query - THEN should correctly handle NEAREST within CTE - """ - engine = duckdb_engine_with_edge_case_data - - cursor = engine.execute(""" - WITH selected_peaks AS ( - SELECT * FROM peaks WHERE peak_id IN (1, 3) - ) - SELECT - selected_peaks.peak_id, - nearest.gene_name, - nearest.distance - FROM selected_peaks - CROSS JOIN LATERAL NEAREST(genes, reference=selected_peaks.interval, k=2) AS nearest - ORDER BY selected_peaks.peak_id, nearest.distance - """) - - rows = cursor.fetchall() - - # Should find 2 nearest genes for each of 2 peaks = up to 4 results - assert len(rows) > 0, "Should find genes for peaks in CTE" - - # Check that we have results for both peaks - peak_ids = set(row[0] for row in rows) - assert 1 in peak_ids, "Should have results for peak 1" 
- assert 3 in peak_ids, "Should have results for peak 3" - - def test_k_greater_than_total_features_all_chromosomes( - self, duckdb_engine_with_edge_case_data - ): - """ - GIVEN k greater than total number of features on the same chromosome - WHEN querying for nearest genes - THEN should return all available features on that chromosome - """ - engine = duckdb_engine_with_edge_case_data - - cursor = engine.execute(""" - SELECT - peaks.peak_id, - nearest.gene_name - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1000) AS nearest - WHERE peaks.peak_id = 1 - """) - - rows = cursor.fetchall() - - # Peak 1 is on chr1, and there are 6 genes on chr1 - # Should return all 6 genes, not 1000 - assert len(rows) == 6, f"Should return all 6 genes on chr1, got {len(rows)}" - - def test_ties_with_k_greater_than_one(self, duckdb_engine_with_edge_case_data): - """ - GIVEN multiple features at the same distance (ties) - WHEN querying with k that includes tied features - THEN should handle ties consistently - """ - engine = duckdb_engine_with_edge_case_data - - cursor = engine.execute(""" - SELECT - peaks.peak_id, - nearest.gene_name, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest - WHERE peaks.peak_id = 1 - ORDER BY nearest.distance, nearest.gene_name - """) - - rows = cursor.fetchall() - - # Peak 1 has: - # - GENE_C at 0bp (overlap) - # - GENE_A and GENE_B both at 500bp (tie) - # With k=3, should get all 3 - - assert len(rows) == 3, f"Should return 3 nearest genes, got {len(rows)}" - - # First should be GENE_C (distance 0) - assert rows[0][1] == "GENE_C" - assert rows[0][2] == 0 - - # Next two should be GENE_A and GENE_B (distance 500, order may vary) - gene_names_at_500 = [rows[1][1], rows[2][1]] - assert set(gene_names_at_500) == {"GENE_A", "GENE_B"}, ( - f"Should have GENE_A and GENE_B at 500bp" - ) - assert rows[1][2] == 500 - assert rows[2][2] == 500 - - -class TestNearestPropertyBased: - """Property-based tests for NEAREST operator using Hypothesis.""" - - @given( - start1=st.integers(min_value=0, max_value=100000), - length1=st.integers(min_value=1, max_value=1000), - start2=st.integers(min_value=0, max_value=100000), - length2=st.integers(min_value=1, max_value=1000), - ) - def test_distance_non_negative_for_non_overlapping( - self, start1, length1, start2, length2 - ): - """ - PROPERTY: Distance between non-overlapping intervals is always non-negative - GIVEN two non-overlapping genomic intervals - WHEN calculating distance using NEAREST - THEN distance should be >= 0 - """ - end1 = start1 + length1 - end2 = start2 + length2 - - # Skip if intervals overlap - assume(not (start1 < end2 and end1 > start2)) - - engine = GIQLEngine(target_dialect="duckdb") - - # Create tables - engine.conn.execute(""" - CREATE TABLE ref (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER) - """) - engine.conn.execute(""" - CREATE TABLE target (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER) - """) - - # Insert test data - engine.conn.execute(f""" - INSERT INTO ref VALUES (1, 'chr1', {start1}, {end1}) - """) - engine.conn.execute(f""" - INSERT INTO target VALUES (1, 'chr1', {start2}, {end2}) - """) - - # Register schema - engine.register_table_schema( - "ref", - { - "id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "INTEGER", - "end_pos": "INTEGER", - }, - genomic_column="interval", - ) - engine.register_table_schema( - "target", - { - "id": "INTEGER", - "chromosome": "VARCHAR", - 
"start_pos": "INTEGER", - "end_pos": "INTEGER", - }, - genomic_column="interval", - ) - - # Query for nearest - cursor = engine.execute(""" - SELECT nearest.distance - FROM ref - CROSS JOIN LATERAL NEAREST(target, reference=ref.interval, k=1) AS nearest - """) - - rows = cursor.fetchall() - if len(rows) > 0: - distance = rows[0][0] - assert distance >= 0, f"Distance should be non-negative, got {distance}" - - @given( - start1=st.integers(min_value=0, max_value=100000), - length1=st.integers(min_value=1, max_value=1000), - overlap_start=st.integers(min_value=1, max_value=500), - ) - def test_overlapping_intervals_have_zero_distance( - self, start1, length1, overlap_start - ): - """ - PROPERTY: Overlapping intervals have distance 0 - GIVEN two genomic intervals that overlap - WHEN calculating distance using NEAREST - THEN distance should be 0 - """ - end1 = start1 + length1 - # Create overlapping interval - start2 = start1 + overlap_start - end2 = start2 + length1 - - # Ensure they actually overlap - assume(start1 < end2 and end1 > start2) - - engine = GIQLEngine(target_dialect="duckdb") - - # Create tables - engine.conn.execute(""" - CREATE TABLE ref (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER) - """) - engine.conn.execute(""" - CREATE TABLE target (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER) - """) - - # Insert test data - engine.conn.execute(f""" - INSERT INTO ref VALUES (1, 'chr1', {start1}, {end1}) - """) - engine.conn.execute(f""" - INSERT INTO target VALUES (1, 'chr1', {start2}, {end2}) - """) - - # Register schema - engine.register_table_schema( - "ref", - { - "id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "INTEGER", - "end_pos": "INTEGER", - }, - genomic_column="interval", - ) - engine.register_table_schema( - "target", - { - "id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "INTEGER", - "end_pos": "INTEGER", - }, - genomic_column="interval", - ) - - # Query for nearest - cursor = engine.execute(""" - SELECT nearest.distance - FROM ref - CROSS JOIN LATERAL NEAREST(target, reference=ref.interval, k=1) AS nearest - """) - - rows = cursor.fetchall() - assert len(rows) > 0, "Should find overlapping interval" - distance = rows[0][0] - assert distance == 0, ( - f"Overlapping intervals should have distance 0, got {distance}" - ) - - @given( - k=st.integers(min_value=1, max_value=10), - n_features=st.integers(min_value=0, max_value=15), - ) - def test_k_parameter_returns_at_most_k_results(self, k, n_features): - """ - PROPERTY: k parameter limits results to at most k features - GIVEN k parameter and n available features - WHEN querying for k nearest - THEN should return min(k, n) results - """ - engine = GIQLEngine(target_dialect="duckdb") - - # Create tables - engine.conn.execute(""" - CREATE TABLE ref (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER) - """) - engine.conn.execute(""" - CREATE TABLE target (id INTEGER, chromosome VARCHAR, start_pos INTEGER, end_pos INTEGER) - """) - - # Insert reference point - engine.conn.execute(""" - INSERT INTO ref VALUES (1, 'chr1', 1000, 1100) - """) - - # Insert n_features target features - for i in range(n_features): - # Spread features out to avoid ties - start = 2000 + (i * 500) - end = start + 100 - engine.conn.execute(f""" - INSERT INTO target VALUES ({i}, 'chr1', {start}, {end}) - """) - - # Register schema - engine.register_table_schema( - "ref", - { - "id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "INTEGER", - "end_pos": "INTEGER", - }, - 
genomic_column="interval", - ) - engine.register_table_schema( - "target", - { - "id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "INTEGER", - "end_pos": "INTEGER", - }, - genomic_column="interval", - ) - - # Query for k nearest - cursor = engine.execute(f""" - SELECT COUNT(*) - FROM ref - CROSS JOIN LATERAL NEAREST(target, reference=ref.interval, k={k}) AS nearest - """) - - rows = cursor.fetchall() - count = rows[0][0] - - # Should return at most k results, but not more than available features - expected_count = min(k, n_features) - assert count == expected_count, ( - f"Expected {expected_count} results (min({k}, {n_features})), got {count}" - ) diff --git a/tests/test_nearest_transpilation.py b/tests/test_nearest_transpilation.py index 91618b6..b8c7d0e 100644 --- a/tests/test_nearest_transpilation.py +++ b/tests/test_nearest_transpilation.py @@ -1,64 +1,34 @@ """Transpilation tests for NEAREST operator SQL generation. -Tests verify that NEAREST() is correctly transpiled to dialect-specific SQL -(LATERAL joins for PostgreSQL/DuckDB, window functions for SQLite). +Tests verify that NEAREST() is correctly transpiled to SQL +(LATERAL joins for correlated queries, ORDER BY + LIMIT for standalone). """ import pytest from sqlglot import parse_one +from giql import Table from giql.dialect import GIQLDialect from giql.generators import BaseGIQLGenerator -from giql.generators import GIQLDuckDBGenerator -from giql.schema import ColumnInfo -from giql.schema import SchemaInfo -from giql.schema import TableSchema +from giql.table import Tables @pytest.fixture -def schema_with_peaks_and_genes(): - """Schema info with peaks and genes tables.""" - schema = SchemaInfo() +def tables_with_peaks_and_genes(): + """Tables container with peaks and genes tables.""" + tables = Tables() + tables.register("peaks", Table()) + tables.register("genes", Table()) + return tables - # Register peaks table - peaks_table = TableSchema(name="peaks", columns={}) - peaks_table.columns["peak_id"] = ColumnInfo(name="peak_id", type="INTEGER") - peaks_table.columns["interval"] = ColumnInfo( - name="interval", - type="VARCHAR", - is_genomic=True, - chrom_col="chromosome", - start_col="start_pos", - end_col="end_pos", - strand_col="strand", - ) - schema.tables["peaks"] = peaks_table - # Register genes table - genes_table = TableSchema(name="genes", columns={}) - genes_table.columns["gene_id"] = ColumnInfo(name="gene_id", type="INTEGER") - genes_table.columns["name"] = ColumnInfo(name="name", type="VARCHAR") - genes_table.columns["interval"] = ColumnInfo( - name="interval", - type="VARCHAR", - is_genomic=True, - chrom_col="chromosome", - start_col="start_pos", - end_col="end_pos", - strand_col="strand", - ) - schema.tables["genes"] = genes_table +class TestNearestTranspilation: + """Tests for NEAREST transpilation to SQL.""" - return schema - - -class TestNearestTranspilationDuckDB: - """Tests for NEAREST transpilation to DuckDB SQL (LATERAL joins).""" - - def test_nearest_basic_k3_duckdb(self, schema_with_peaks_and_genes): + def test_nearest_basic_k3(self, tables_with_peaks_and_genes): """ GIVEN a GIQL query with NEAREST(genes, k=3) - WHEN transpiling to DuckDB SQL + WHEN transpiling to SQL THEN should generate LATERAL join with DISTANCE and LIMIT 3 """ sql = """ @@ -68,7 +38,7 @@ def test_nearest_basic_k3_duckdb(self, schema_with_peaks_and_genes): """ ast = parse_one(sql, dialect=GIQLDialect) - generator = GIQLDuckDBGenerator(schema_info=schema_with_peaks_and_genes) + generator = 
BaseGIQLGenerator(tables=tables_with_peaks_and_genes) output = generator.generate(ast) # Expectations: @@ -83,10 +53,10 @@ def test_nearest_basic_k3_duckdb(self, schema_with_peaks_and_genes): assert "LIMIT 3" in output assert "ORDER BY" in output - def test_nearest_with_max_distance_duckdb(self, schema_with_peaks_and_genes): + def test_nearest_with_max_distance(self, tables_with_peaks_and_genes): """ GIVEN a GIQL query with NEAREST(genes, k=5, max_distance=100000) - WHEN transpiling to DuckDB SQL + WHEN transpiling to SQL THEN should generate LATERAL join with distance filter """ sql = """ @@ -96,7 +66,7 @@ def test_nearest_with_max_distance_duckdb(self, schema_with_peaks_and_genes): """ ast = parse_one(sql, dialect=GIQLDialect) - generator = GIQLDuckDBGenerator(schema_info=schema_with_peaks_and_genes) + generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes) output = generator.generate(ast) # Expectations: @@ -107,10 +77,10 @@ def test_nearest_with_max_distance_duckdb(self, schema_with_peaks_and_genes): assert "100000" in output assert "LIMIT 5" in output - def test_nearest_standalone_literal_duckdb(self, schema_with_peaks_and_genes): + def test_nearest_standalone_literal(self, tables_with_peaks_and_genes): """ GIVEN a GIQL query with literal reference NEAREST(genes, reference='chr1:1000-2000', k=3) - WHEN transpiling to DuckDB SQL + WHEN transpiling to SQL THEN should generate standalone query without LATERAL """ sql = """ @@ -119,7 +89,7 @@ def test_nearest_standalone_literal_duckdb(self, schema_with_peaks_and_genes): """ ast = parse_one(sql, dialect=GIQLDialect) - generator = GIQLDuckDBGenerator(schema_info=schema_with_peaks_and_genes) + generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes) output = generator.generate(ast) # Expectations: @@ -131,10 +101,10 @@ def test_nearest_standalone_literal_duckdb(self, schema_with_peaks_and_genes): assert "chr1" in output.lower() assert "LIMIT 3" in output - def test_nearest_with_stranded_duckdb(self, schema_with_peaks_and_genes): + def test_nearest_with_stranded(self, tables_with_peaks_and_genes): """ GIVEN a GIQL query with NEAREST(genes, k=3, stranded=true) - WHEN transpiling to DuckDB SQL + WHEN transpiling to SQL THEN should generate SQL with strand filtering """ sql = """ @@ -144,7 +114,7 @@ def test_nearest_with_stranded_duckdb(self, schema_with_peaks_and_genes): """ ast = parse_one(sql, dialect=GIQLDialect) - generator = GIQLDuckDBGenerator(schema_info=schema_with_peaks_and_genes) + generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes) output = generator.generate(ast) # Expectations: @@ -155,10 +125,10 @@ def test_nearest_with_stranded_duckdb(self, schema_with_peaks_and_genes): assert "strand" in output.lower() assert "LIMIT 3" in output - def test_nearest_with_signed_duckdb(self, schema_with_peaks_and_genes): + def test_nearest_with_signed(self, tables_with_peaks_and_genes): """ GIVEN a GIQL query with NEAREST(genes, k=3, signed=true) - WHEN transpiling to DuckDB SQL + WHEN transpiling to SQL THEN should generate SQL with signed distance column (negative for upstream, positive for downstream) """ @@ -169,7 +139,7 @@ def test_nearest_with_signed_duckdb(self, schema_with_peaks_and_genes): """ ast = parse_one(sql, dialect=GIQLDialect) - generator = GIQLDuckDBGenerator(schema_info=schema_with_peaks_and_genes) + generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes) output = generator.generate(ast) # Expectations: @@ -183,114 +153,3 @@ def test_nearest_with_signed_duckdb(self, 
schema_with_peaks_and_genes): assert "ELSE -(" in output, ( f"Expected signed distance with negation for upstream, got:\n{output}" ) - - -# PostgreSQL uses same generator as base for now -# class TestNearestTranspilationPostgreSQL: -# """Tests for NEAREST transpilation to PostgreSQL SQL (LATERAL joins).""" -# (Skipped - uses BaseGIQLGenerator for now) - - -class TestNearestTranspilationSQLite: - """Tests for NEAREST transpilation to SQLite SQL (using LATERAL for MVP).""" - - def test_nearest_basic_k3_sqlite(self, schema_with_peaks_and_genes): - """ - GIVEN a GIQL query with NEAREST(genes, k=3) - WHEN transpiling to SQLite SQL - THEN should generate LATERAL subquery with ORDER BY and LIMIT - (Note: Using LATERAL for MVP - window function optimization to be added later) - """ - sql = """ - SELECT * - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) - """ - - ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes) - output = generator.generate(ast) - - # MVP expectations (LATERAL syntax): - # - LATERAL subquery - # - Distance calculation (CASE WHEN) - # - ORDER BY distance - # - LIMIT 3 - assert "LATERAL" in output.upper() - assert "CASE" in output.upper() - assert " AS distance" in output or " AS DISTANCE" in output - assert "ORDER BY" in output.upper() - assert "LIMIT 3" in output - - def test_nearest_with_max_distance_sqlite(self, schema_with_peaks_and_genes): - """ - GIVEN a GIQL query with NEAREST(genes, k=5, max_distance=100000) - WHEN transpiling to SQLite SQL - THEN should generate LATERAL with distance filter - (Note: Using LATERAL for MVP - window function optimization to be added later) - """ - sql = """ - SELECT * - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=5, max_distance=100000) - """ - - ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes) - output = generator.generate(ast) - - # MVP expectations (LATERAL syntax): - # - LATERAL subquery - # - Distance filter: <= 100000 - # - LIMIT 5 - assert "LATERAL" in output.upper() - assert "100000" in output - assert "LIMIT 5" in output - - def test_nearest_standalone_literal_sqlite(self, schema_with_peaks_and_genes): - """ - GIVEN a GIQL query with literal reference NEAREST(genes, reference='chr1:1000-2000', k=3) - WHEN transpiling to SQLite SQL - THEN should generate standalone query without window functions - """ - sql = """ - SELECT * - FROM NEAREST(genes, reference='chr1:1000-2000', k=3) - """ - - ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes) - output = generator.generate(ast) - - # Expectations: - # - No CTE needed (standalone mode) - # - Distance calculation with literal 'chr1', 1000, 2000 - # - ORDER BY distance - # - LIMIT 3 - assert "chr1" in output.lower() - assert "ORDER BY" in output.upper() - assert "LIMIT 3" in output - - def test_nearest_with_stranded_sqlite(self, schema_with_peaks_and_genes): - """ - GIVEN a GIQL query with NEAREST(genes, k=3, stranded=true) - WHEN transpiling to SQLite SQL - THEN should generate SQL with strand filtering - """ - sql = """ - SELECT * - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3, stranded=true) - """ - - ast = parse_one(sql, dialect=GIQLDialect) - generator = BaseGIQLGenerator(schema_info=schema_with_peaks_and_genes) - output = generator.generate(ast) - - # Expectations: - # - LATERAL subquery - # - Strand 
filtering in WHERE clause - # - LIMIT 3 - assert "LATERAL" in output.upper() - assert "strand" in output.lower() - assert "LIMIT 3" in output diff --git a/tests/test_transpile.py b/tests/test_transpile.py new file mode 100644 index 0000000..7cef9d7 --- /dev/null +++ b/tests/test_transpile.py @@ -0,0 +1,411 @@ +"""Tests for the transpile() function.""" + +import pytest + +import giql +from giql import Table +from giql import transpile + + +class TestTranspileBasic: + """Tests for basic transpilation with string table names.""" + + def test_transpile_intersects_literal(self): + """ + GIVEN a GIQL query with INTERSECTS and literal range + WHEN transpiling with string table name + THEN should return valid SQL with default column names + """ + sql = transpile( + "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", + tables=["peaks"], + ) + + assert "SELECT" in sql + assert "peaks" in sql + assert "chromosome" in sql + assert "start_pos" in sql + assert "end_pos" in sql + assert "chr1" in sql + + def test_transpile_contains_literal(self): + """ + GIVEN a GIQL query with CONTAINS and literal point + WHEN transpiling with string table name + THEN should return valid SQL for point containment + """ + sql = transpile( + "SELECT * FROM peaks WHERE interval CONTAINS 'chr1:1500'", + tables=["peaks"], + ) + + assert "SELECT" in sql + assert "peaks" in sql + assert "1500" in sql + + def test_transpile_within_literal(self): + """ + GIVEN a GIQL query with WITHIN and literal range + WHEN transpiling with string table name + THEN should return valid SQL for interval within range + """ + sql = transpile( + "SELECT * FROM peaks WHERE interval WITHIN 'chr1:1000-2000'", + tables=["peaks"], + ) + + assert "SELECT" in sql + assert "peaks" in sql + + def test_transpile_no_tables(self): + """ + GIVEN a GIQL query with INTERSECTS + WHEN transpiling with no tables parameter + THEN should return valid SQL with default column names + """ + sql = transpile( + "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", + ) + + assert "SELECT" in sql + assert "peaks" in sql + + +class TestTranspileWithTableObjects: + """Tests for transpilation with Table objects.""" + + def test_transpile_custom_columns(self): + """ + GIVEN a GIQL query with INTERSECTS + WHEN transpiling with custom column mappings + THEN should use custom column names in generated SQL + """ + sql = transpile( + "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", + tables={ + "peaks": Table( + genomic_col="interval", + chrom_col="chrom", + start_col="start", + end_col="end", + ) + }, + ) + + assert "SELECT" in sql + assert "peaks" in sql + assert '"chrom"' in sql + assert '"start"' in sql + assert '"end"' in sql + # Should NOT contain default column names + assert "chromosome" not in sql + assert "start_pos" not in sql + assert "end_pos" not in sql + + def test_transpile_no_strand_column(self): + """ + GIVEN a Table with strand_col=None + WHEN transpiling a query + THEN should not require strand column + """ + sql = transpile( + "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", + tables={ + "peaks": Table(strand_col=None) + }, + ) + + assert "SELECT" in sql + + +class TestTranspileMultipleTables: + """Tests for transpilation with multiple tables.""" + + def test_transpile_join_intersects(self): + """ + GIVEN a GIQL query joining two tables with INTERSECTS + WHEN transpiling with both tables configured + THEN should generate correct join conditions + """ + sql = transpile( + """ + SELECT a.*, b.* + FROM 
peaks a + JOIN genes b ON a.interval INTERSECTS b.region + """, + tables={ + "peaks": Table(genomic_col="interval"), + "genes": Table(genomic_col="region"), + }, + ) + + assert "SELECT" in sql + assert "peaks" in sql + assert "genes" in sql + assert "JOIN" in sql.upper() + + def test_transpile_different_schemas(self): + """ + GIVEN two tables with different column schemas + WHEN transpiling a join query + THEN should use correct columns for each table + """ + sql = transpile( + """ + SELECT a.*, b.* + FROM peaks a + JOIN features b ON a.interval INTERSECTS b.location + """, + tables={ + "peaks": Table( + genomic_col="interval", + chrom_col="chromosome", + start_col="start_pos", + end_col="end_pos", + ), + "features": Table( + genomic_col="location", + chrom_col="seqname", + start_col="begin", + end_col="terminus", + ), + }, + ) + + assert "SELECT" in sql + # Both table's column names should appear + assert "chromosome" in sql or "start_pos" in sql + assert "seqname" in sql or "begin" in sql or "terminus" in sql + + +class TestTranspileSpatialOperators: + """Tests for all spatial operators.""" + + def test_intersects_any(self): + """ + GIVEN a GIQL query with INTERSECTS ANY + WHEN transpiling + THEN should generate OR conditions for multiple ranges + """ + sql = transpile( + """ + SELECT * FROM peaks + WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:500-1000') + """, + tables=["peaks"], + ) + + assert "SELECT" in sql + assert "chr1" in sql + assert "chr2" in sql + assert " OR " in sql + + def test_intersects_all(self): + """ + GIVEN a GIQL query with INTERSECTS ALL + WHEN transpiling + THEN should generate AND conditions for multiple ranges + """ + sql = transpile( + """ + SELECT * FROM peaks + WHERE interval INTERSECTS ALL('chr1:1000-2000', 'chr1:1500-2500') + """, + tables=["peaks"], + ) + + assert "SELECT" in sql + assert " AND " in sql + + +class TestTranspileCluster: + """Tests for CLUSTER operation.""" + + def test_cluster_basic(self): + """ + GIVEN a GIQL query with CLUSTER + WHEN transpiling + THEN should generate window function for clustering + """ + sql = transpile( + """ + SELECT *, CLUSTER(interval) AS cluster_id + FROM peaks + """, + tables=["peaks"], + ) + + assert "SELECT" in sql + assert "SUM" in sql.upper() or "LAG" in sql.upper() + + def test_cluster_with_distance(self): + """ + GIVEN a GIQL query with CLUSTER and distance parameter + WHEN transpiling + THEN should include distance in clustering logic + """ + sql = transpile( + """ + SELECT *, CLUSTER(interval, 100) AS cluster_id + FROM peaks + """, + tables=["peaks"], + ) + + assert "SELECT" in sql + assert "100" in sql + + def test_cluster_stranded(self): + """ + GIVEN a GIQL query with stranded CLUSTER + WHEN transpiling + THEN should partition by strand + """ + sql = transpile( + """ + SELECT *, CLUSTER(interval, stranded=true) AS cluster_id + FROM peaks + """, + tables=["peaks"], + ) + + assert "SELECT" in sql + assert "strand" in sql.lower() + + +class TestTranspileMerge: + """Tests for MERGE operation.""" + + def test_merge_basic(self): + """ + GIVEN a GIQL query with MERGE + WHEN transpiling + THEN should generate GROUP BY with MIN/MAX aggregation + """ + sql = transpile( + "SELECT MERGE(interval) FROM peaks", + tables=["peaks"], + ) + + assert "SELECT" in sql + assert "MIN" in sql.upper() + assert "MAX" in sql.upper() + assert "GROUP BY" in sql.upper() + + def test_merge_with_distance(self): + """ + GIVEN a GIQL query with MERGE and distance parameter + WHEN transpiling + THEN should include distance in 
merge logic + """ + sql = transpile( + "SELECT MERGE(interval, 100) FROM peaks", + tables=["peaks"], + ) + + assert "SELECT" in sql + assert "100" in sql + + def test_merge_with_aggregation(self): + """ + GIVEN a GIQL query with MERGE and additional aggregation + WHEN transpiling + THEN should include both merge and custom aggregation + """ + sql = transpile( + "SELECT MERGE(interval), COUNT(*) as count FROM peaks", + tables=["peaks"], + ) + + assert "SELECT" in sql + assert "COUNT" in sql.upper() + + +class TestTranspileNearest: + """Tests for NEAREST operation.""" + + def test_nearest_standalone(self): + """ + GIVEN a GIQL query with standalone NEAREST + WHEN transpiling + THEN should generate subquery with ORDER BY and LIMIT + """ + sql = transpile( + "SELECT * FROM NEAREST(genes, reference='chr1:1000-2000', k=3)", + tables=["genes"], + ) + + assert "SELECT" in sql + assert "ORDER BY" in sql.upper() + assert "LIMIT 3" in sql + + def test_nearest_with_max_distance(self): + """ + GIVEN a GIQL query with NEAREST and max_distance + WHEN transpiling + THEN should include distance filter + """ + sql = transpile( + """ + SELECT * FROM NEAREST(genes, reference='chr1:1000-2000', k=5, max_distance=100000) + """, + tables=["genes"], + ) + + assert "SELECT" in sql + assert "100000" in sql + assert "LIMIT 5" in sql + + def test_nearest_lateral(self): + """ + GIVEN a GIQL query with NEAREST in LATERAL join + WHEN transpiling + THEN should generate LATERAL subquery + """ + sql = transpile( + """ + SELECT * + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) + """, + tables=["peaks", "genes"], + ) + + assert "SELECT" in sql + assert "LATERAL" in sql.upper() + assert "LIMIT 3" in sql + + +class TestTranspileErrors: + """Tests for error handling.""" + + def test_invalid_syntax(self): + """ + GIVEN an invalid GIQL query + WHEN transpiling + THEN should raise ValueError with parse error + """ + with pytest.raises(ValueError, match="Parse error"): + transpile("SELECT * FORM peaks") # typo: FORM instead of FROM + + +class TestModuleExports: + """Tests for module-level exports.""" + + def test_transpile_exported(self): + """ + GIVEN the giql module + WHEN accessing transpile + THEN should be available at module level + """ + assert hasattr(giql, "transpile") + assert callable(giql.transpile) + + def test_table_exported(self): + """ + GIVEN the giql module + WHEN accessing Table + THEN should be available at module level + """ + assert hasattr(giql, "Table") + assert giql.Table is Table From 409d44ec8dfabf8021d8a66b73badbe985a3272e Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Mon, 9 Feb 2026 16:03:39 -0500 Subject: [PATCH 04/12] Simplify Table API and update default column names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add required `name` field to Table dataclass - Rename default columns: chromosome→chrom, start_pos→start, end_pos→end - Simplify `tables` parameter from `list[str] | dict[str, Table]` to `list[str | Table]` - Update README to reflect transpile-only API with usage examples --- README.md | 86 +++++---- src/giql/constants.py | 6 +- src/giql/table.py | 21 ++- src/giql/transpile.py | 34 ++-- tests/generators/test_base.py | 264 +++++++++++++-------------- tests/test_distance_transpilation.py | 14 +- tests/test_distance_udf.py | 52 +++--- tests/test_nearest_transpilation.py | 4 +- tests/test_transpile.py | 53 +++--- 9 files changed, 275 insertions(+), 259 deletions(-) diff --git a/README.md b/README.md index 
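A minimal sketch of the renamed defaults in action, using the `transpile` API described in the commit message above (illustrative only; the expected SQL mirrors the updated expectations in `tests/generators/test_base.py` later in this patch):

```python
from giql import transpile

# Plain string table specs now pick up the new defaults:
# chrom, start, end (and strand).
sql = transpile(
    "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'",
    tables=["variants"],
)
print(sql)
# SELECT * FROM variants WHERE ("chrom" = 'chr1' AND "start" < 2000 AND "end" > 1000)
```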
3d84ec7..6b4d368 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,22 @@ # GIQL - Genomic Interval Query Language -A SQL dialect for genomic range queries with multi-database support. +A SQL dialect for genomic range queries. Transpiles to standard SQL. ## Overview -GIQL extends SQL with spatial operators for genomic interval queries. It transpiles to standard SQL that works across multiple database backends including DuckDB and SQLite. +GIQL extends SQL with spatial operators for genomic interval queries. It transpiles GIQL queries into standard SQL that can be executed on any database backend. -GIQL provides a familiar SQL syntax for bioinformatics workflows, allowing you to express complex genomic range operations without writing intricate SQL expressions. Whether you're filtering variants by genomic region, finding overlapping features, or calculating distances between intervals, GIQL makes these operations intuitive and portable across databases. +GIQL provides a familiar SQL syntax for bioinformatics workflows, allowing you to express complex genomic range operations without writing intricate SQL expressions. Whether you're filtering variants by genomic region, finding overlapping features, or calculating distances between intervals, GIQL makes these operations intuitive and portable. ## Features - **SQL-based**: Familiar SQL syntax with genomic extensions -- **Multi-backend**: Works with DuckDB, SQLite, and more - **Spatial operators**: INTERSECTS, CONTAINS, WITHIN for range relationships - **Distance operators**: DISTANCE, NEAREST for proximity queries - **Aggregation operators**: CLUSTER, MERGE for combining intervals - **Set quantifiers**: ANY, ALL for multi-range queries -- **Transpilation**: Convert GIQL to standard SQL for debugging or external use +- **Transpilation**: Converts GIQL to standard SQL for execution on any backend ## Installation @@ -72,39 +71,50 @@ make html ## Quick Start ```python -from giql import GIQLEngine - -# Create engine with DuckDB backend -with GIQLEngine(target_dialect="duckdb") as engine: - # Load genomic data - engine.load_csv("variants", "variants.csv") - engine.register_table_schema( - "variants", - { - "id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - }, - genomic_column="interval", - ) - - # Query with genomic operators (returns cursor for streaming) - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - # Process results lazily - for row in cursor: - print(row) - - # Or just transpile to SQL without executing - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - print(sql) # See the generated SQL +from giql import transpile + +# Transpile a GIQL query to standard SQL +sql = transpile( + "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", + tables=["peaks"], +) +print(sql) +``` + +With custom column mappings: + +```python +from giql import Table, transpile + +sql = transpile( + "SELECT * FROM variants WHERE position INTERSECTS 'chr1:1000-2000'", + tables=[ + Table( + "variants", + genomic_col="position", + chrom_col="chromosome", + start_col="start_pos", + end_col="end_pos", + ) + ], +) +``` + +Execution example with DuckDB: + +```python +import duckdb +import oxbow as ox +from giql import transpile + +conn = duckdb.connect() +peaks = ox.from_bed("peaks.bed", bed_schema="bed6+4").to_duckdb(conn) # streaming source + +sql = transpile( + "SELECT * FROM peaks WHERE interval 
INTERSECTS 'chr1:1000-2000'",
+    tables=["peaks"],
+)
+df = conn.execute(sql).fetchdf()
 ```

 ## Operators at a Glance
diff --git a/src/giql/constants.py b/src/giql/constants.py
index daa5896..87f8055 100644
--- a/src/giql/constants.py
+++ b/src/giql/constants.py
@@ -4,8 +4,8 @@
 """

 # Default genomic column names
-DEFAULT_CHROM_COL = "chromosome"
-DEFAULT_START_COL = "start_pos"
-DEFAULT_END_COL = "end_pos"
+DEFAULT_CHROM_COL = "chrom"
+DEFAULT_START_COL = "start"
+DEFAULT_END_COL = "end"
 DEFAULT_STRAND_COL = "strand"
 DEFAULT_GENOMIC_COL = "interval"
diff --git a/src/giql/table.py b/src/giql/table.py
index 6899bf3..f23adaf 100644
--- a/src/giql/table.py
+++ b/src/giql/table.py
@@ -23,17 +23,19 @@ class Table:

     Parameters
     ----------
+    name : str
+        The table name.
     genomic_col : str
         The pseudo-column name used in GIQL queries to reference the
         genomic interval (default: "interval").
     chrom_col : str
-        The physical column name storing chromosome/contig (default: "chromosome").
+        The physical column name storing chromosome/contig (default: "chrom").
     start_col : str
         The physical column name storing interval start position
-        (default: "start_pos").
+        (default: "start").
     end_col : str
         The physical column name storing interval end position
-        (default: "end_pos").
+        (default: "end").
     strand_col : str | None
         The physical column name storing strand information, or None
         if the table has no strand column (default: "strand").
@@ -48,12 +50,14 @@ class Table:

         sql = transpile(query, tables=["peaks"])

-    Using custom column names::
+    Mixing default and custom table configurations::

         sql = transpile(
             query,
-            tables={
-                "variants": Table(
+            tables=[
+                "peaks",
+                Table(
+                    "variants",
                     genomic_col="position",
                     chrom_col="chr",
                     start_col="pos_start",
@@ -61,11 +65,12 @@ class Table:
                     strand_col=None,  # No strand column
                     coordinate_system="1based",
                     interval_type="closed",
-                )
-            }
+                ),
+            ]
         )
     """

+    name: str
     genomic_col: str = DEFAULT_GENOMIC_COL
     chrom_col: str = DEFAULT_CHROM_COL
     start_col: str = DEFAULT_START_COL
diff --git a/src/giql/transpile.py b/src/giql/transpile.py
index 5271d06..f846834 100644
--- a/src/giql/transpile.py
+++ b/src/giql/transpile.py
@@ -14,14 +14,14 @@ from giql.transformer import MergeTransformer


-def _build_tables(tables: list[str] | dict[str, Table] | None) -> Tables:
+def _build_tables(tables: list[str | Table] | None) -> Tables:
     """Build a Tables container from table specifications.

     Parameters
     ----------
-    tables : list[str] | dict[str, Table] | None
+    tables : list[str | Table] | None
         Table specifications. Strings use default column mappings.
-        Dict maps table names to Table configurations.
+        Table objects provide custom column mappings.

     Returns
     -------
@@ -33,19 +33,18 @@ def _build_tables(tables: list[str] | dict[str, Table] | None) -> Tables:
     if tables is None:
         return container

-    if isinstance(tables, dict):
-        for name, table in tables.items():
-            container.register(name, table)
-    else:
-        for name in tables:
-            container.register(name, Table())
+    for item in tables:
+        if isinstance(item, str):
+            container.register(item, Table(item))
+        else:
+            container.register(item.name, item)

     return container


 def transpile(
     giql: str,
-    tables: list[str] | dict[str, Table] | None = None,
+    tables: list[str | Table] | None = None,
 ) -> str:
     """Transpile a GIQL query to SQL.

@@ -57,10 +56,10 @@ def transpile(
     giql : str
         The GIQL query string containing genomic extensions like
         INTERSECTS, CONTAINS, WITHIN, CLUSTER, MERGE, or NEAREST.
-    tables : list[str] | dict[str, Table] | None
-        Table configurations. 
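For clarity, a minimal usage sketch of the simplified `tables` parameter (illustrative, not part of the patch): strings and `Table` objects can be mixed in one list, since `_build_tables` expands each string `s` to `Table(s)` and registers each `Table` object under its own `name`.

```python
from giql import Table, transpile

# A string gets the default column mappings; a Table object carries
# custom ones (here, a custom genomic pseudo-column name).
sql = transpile(
    """
    SELECT p.*, g.*
    FROM peaks p
    JOIN genes g ON p.interval INTERSECTS g.region
    """,
    tables=["peaks", Table("genes", genomic_col="region")],
)
print(sql)
```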
A list of strings uses default column mappings - (chromosome, start_pos, end_pos, strand). A dict maps table names - to Table objects for custom column name mappings. + tables : list[str | Table] | None + Table configurations. Strings use default column mappings + (chrom, start, end, strand). Table objects provide custom + column name mappings. Returns ------- @@ -85,14 +84,15 @@ def transpile( sql = transpile( "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", - tables={ - "peaks": Table( + tables=[ + Table( + "peaks", genomic_col="interval", chrom_col="chrom", start_col="start", end_col="end", ) - } + ] ) """ # Build tables container diff --git a/tests/generators/test_base.py b/tests/generators/test_base.py index 195365b..bc169b4 100644 --- a/tests/generators/test_base.py +++ b/tests/generators/test_base.py @@ -22,7 +22,7 @@ def tables_info(): """Basic Tables with a single table containing genomic columns.""" tables = Tables() - tables.register("variants", Table()) + tables.register("variants", Table("variants")) return tables @@ -30,8 +30,8 @@ def tables_info(): def tables_with_two_tables(): """Tables with two tables for column-to-column tests.""" tables = Tables() - tables.register("features_a", Table()) - tables.register("features_b", Table()) + tables.register("features_a", Table("features_a")) + tables.register("features_b", Table("features_b")) return tables @@ -39,7 +39,7 @@ def tables_with_two_tables(): def tables_with_closed_intervals(): """Tables with CLOSED interval type for bedtools compatibility tests.""" tables = Tables() - tables.register("bed_features", Table(interval_type="closed")) + tables.register("bed_features", Table("bed_features", interval_type="closed")) return tables @@ -47,8 +47,8 @@ def tables_with_closed_intervals(): def tables_with_peaks_and_genes(): """Tables with peaks and genes tables for NEAREST tests.""" tables = Tables() - tables.register("peaks", Table()) - tables.register("genes", Table()) + tables.register("peaks", Table("peaks")) + tables.register("genes", Table("genes")) return tables @@ -119,8 +119,8 @@ def test_select_sql_with_alias(self, tables_info): expected = ( "SELECT * FROM variants AS v WHERE " - '(v."chromosome" = \'chr1\' AND v."start_pos" < 2000 ' - 'AND v."end_pos" > 1000)' + '(v."chrom" = \'chr1\' AND v."start" < 2000 ' + 'AND v."end" > 1000)' ) assert output == expected @@ -153,7 +153,7 @@ def test_intersects_sql_with_literal(self): expected = ( "SELECT * FROM variants WHERE " - '("chromosome" = \'chr1\' AND "start_pos" < 2000 AND "end_pos" > 1000)' + '("chrom" = \'chr1\' AND "start" < 2000 AND "end" > 1000)' ) assert output == expected @@ -175,8 +175,8 @@ def test_intersects_sql_column_join(self, tables_with_two_tables): expected = ( "SELECT * FROM features_a AS a CROSS JOIN features_b AS b WHERE " - '(a."chromosome" = b."chromosome" AND a."start_pos" < b."end_pos" ' - 'AND a."end_pos" > b."start_pos")' + '(a."chrom" = b."chrom" AND a."start" < b."end" ' + 'AND a."end" > b."start")' ) assert output == expected @@ -217,7 +217,7 @@ def test_contains_sql_point_query(self): expected = ( "SELECT * FROM variants WHERE " - '("chromosome" = \'chr1\' AND "start_pos" <= 1500 AND "end_pos" > 1500)' + '("chrom" = \'chr1\' AND "start" <= 1500 AND "end" > 1500)' ) assert output == expected @@ -235,8 +235,8 @@ def test_contains_sql_range_query(self): expected = ( "SELECT * FROM variants WHERE " - '("chromosome" = \'chr1\' AND "start_pos" <= 1500 ' - 'AND "end_pos" >= 2000)' + '("chrom" = \'chr1\' AND "start" <= 1500 ' + 'AND "end" 
>= 2000)' ) assert output == expected @@ -257,8 +257,8 @@ def test_contains_sql_column_join(self, tables_with_two_tables): expected = ( "SELECT * FROM features_a AS a CROSS JOIN features_b AS b WHERE " - '(a."chromosome" = b."chromosome" ' - 'AND a."start_pos" <= b."start_pos" AND a."end_pos" >= b."end_pos")' + '(a."chrom" = b."chrom" ' + 'AND a."start" <= b."start" AND a."end" >= b."end")' ) assert output == expected @@ -302,7 +302,7 @@ def test_within_sql_with_literal(self): expected = ( "SELECT * FROM variants WHERE " - '("chromosome" = \'chr1\' AND "start_pos" >= 1000 AND "end_pos" <= 5000)' + '("chrom" = \'chr1\' AND "start" >= 1000 AND "end" <= 5000)' ) assert output == expected @@ -323,8 +323,8 @@ def test_within_sql_column_join(self, tables_with_two_tables): expected = ( "SELECT * FROM features_a AS a CROSS JOIN features_b AS b WHERE " - '(a."chromosome" = b."chromosome" ' - 'AND a."start_pos" >= b."start_pos" AND a."end_pos" <= b."end_pos")' + '(a."chrom" = b."chrom" ' + 'AND a."start" >= b."start" AND a."end" <= b."end")' ) assert output == expected @@ -345,8 +345,8 @@ def test_spatialsetpredicate_sql_any(self): expected = ( "SELECT * FROM variants WHERE " - '(("chromosome" = \'chr1\' AND "start_pos" < 2000 AND "end_pos" > 1000) ' - 'OR ("chromosome" = \'chr1\' AND "start_pos" < 6000 AND "end_pos" > 5000))' + '(("chrom" = \'chr1\' AND "start" < 2000 AND "end" > 1000) ' + 'OR ("chrom" = \'chr1\' AND "start" < 6000 AND "end" > 5000))' ) assert output == expected @@ -367,8 +367,8 @@ def test_spatialsetpredicate_sql_all(self): expected = ( "SELECT * FROM variants WHERE " - '(("chromosome" = \'chr1\' AND "start_pos" < 2000 AND "end_pos" > 1000) ' - 'AND ("chromosome" = \'chr1\' AND "start_pos" < 1800 AND "end_pos" > 1500))' + '(("chrom" = \'chr1\' AND "start" < 2000 AND "end" > 1000) ' + 'AND ("chrom" = \'chr1\' AND "start" < 1800 AND "end" > 1500))' ) assert output == expected @@ -387,19 +387,19 @@ def test_giqlnearest_sql_standalone(self, tables_with_peaks_and_genes): expected = ( "SELECT * FROM (\n" " SELECT genes.*, " - "CASE WHEN 'chr1' != genes.\"chromosome\" THEN NULL " - 'WHEN 1000 < genes."end_pos" AND 2000 > genes."start_pos" THEN 0 ' - 'WHEN 2000 <= genes."start_pos" ' - 'THEN (genes."start_pos" - 2000) ' - 'ELSE (1000 - genes."end_pos") END AS distance\n' + "CASE WHEN 'chr1' != genes.\"chrom\" THEN NULL " + 'WHEN 1000 < genes."end" AND 2000 > genes."start" THEN 0 ' + 'WHEN 2000 <= genes."start" ' + 'THEN (genes."start" - 2000) ' + 'ELSE (1000 - genes."end") END AS distance\n' " FROM genes\n" - " WHERE 'chr1' = genes.\"chromosome\"\n" + " WHERE 'chr1' = genes.\"chrom\"\n" " ORDER BY ABS(" - "CASE WHEN 'chr1' != genes.\"chromosome\" THEN NULL " - 'WHEN 1000 < genes."end_pos" AND 2000 > genes."start_pos" THEN 0 ' - 'WHEN 2000 <= genes."start_pos" ' - 'THEN (genes."start_pos" - 2000) ' - 'ELSE (1000 - genes."end_pos") END)\n' + "CASE WHEN 'chr1' != genes.\"chrom\" THEN NULL " + 'WHEN 1000 < genes."end" AND 2000 > genes."start" THEN 0 ' + 'WHEN 2000 <= genes."start" ' + 'THEN (genes."start" - 2000) ' + 'ELSE (1000 - genes."end") END)\n' " LIMIT 3\n" " )" ) @@ -423,21 +423,21 @@ def test_giqlnearest_sql_correlated(self, tables_with_peaks_and_genes): expected = ( "SELECT * FROM peaks CROSS JOIN LATERAL (\n" " SELECT genes.*, " - 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL ' - 'WHEN peaks."start_pos" < genes."end_pos" ' - 'AND peaks."end_pos" > genes."start_pos" THEN 0 ' - 'WHEN peaks."end_pos" <= genes."start_pos" ' - 'THEN (genes."start_pos" - peaks."end_pos") 
' - 'ELSE (peaks."start_pos" - genes."end_pos") END AS distance\n' + 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL ' + 'WHEN peaks."start" < genes."end" ' + 'AND peaks."end" > genes."start" THEN 0 ' + 'WHEN peaks."end" <= genes."start" ' + 'THEN (genes."start" - peaks."end") ' + 'ELSE (peaks."start" - genes."end") END AS distance\n' " FROM genes\n" - ' WHERE peaks."chromosome" = genes."chromosome"\n' + ' WHERE peaks."chrom" = genes."chrom"\n' " ORDER BY ABS(" - 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL ' - 'WHEN peaks."start_pos" < genes."end_pos" ' - 'AND peaks."end_pos" > genes."start_pos" THEN 0 ' - 'WHEN peaks."end_pos" <= genes."start_pos" ' - 'THEN (genes."start_pos" - peaks."end_pos") ' - 'ELSE (peaks."start_pos" - genes."end_pos") END)\n' + 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL ' + 'WHEN peaks."start" < genes."end" ' + 'AND peaks."end" > genes."start" THEN 0 ' + 'WHEN peaks."end" <= genes."start" ' + 'THEN (genes."start" - peaks."end") ' + 'ELSE (peaks."start" - genes."end") END)\n' " LIMIT 3\n" " )" ) @@ -462,28 +462,28 @@ def test_giqlnearest_sql_with_max_distance(self, tables_with_peaks_and_genes): expected = ( "SELECT * FROM peaks CROSS JOIN LATERAL (\n" " SELECT genes.*, " - 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL ' - 'WHEN peaks."start_pos" < genes."end_pos" ' - 'AND peaks."end_pos" > genes."start_pos" THEN 0 ' - 'WHEN peaks."end_pos" <= genes."start_pos" ' - 'THEN (genes."start_pos" - peaks."end_pos") ' - 'ELSE (peaks."start_pos" - genes."end_pos") END AS distance\n' + 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL ' + 'WHEN peaks."start" < genes."end" ' + 'AND peaks."end" > genes."start" THEN 0 ' + 'WHEN peaks."end" <= genes."start" ' + 'THEN (genes."start" - peaks."end") ' + 'ELSE (peaks."start" - genes."end") END AS distance\n' " FROM genes\n" - ' WHERE peaks."chromosome" = genes."chromosome" ' + ' WHERE peaks."chrom" = genes."chrom" ' "AND (ABS(" - 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL ' - 'WHEN peaks."start_pos" < genes."end_pos" ' - 'AND peaks."end_pos" > genes."start_pos" THEN 0 ' - 'WHEN peaks."end_pos" <= genes."start_pos" ' - 'THEN (genes."start_pos" - peaks."end_pos") ' - 'ELSE (peaks."start_pos" - genes."end_pos") END)) <= 100000\n' + 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL ' + 'WHEN peaks."start" < genes."end" ' + 'AND peaks."end" > genes."start" THEN 0 ' + 'WHEN peaks."end" <= genes."start" ' + 'THEN (genes."start" - peaks."end") ' + 'ELSE (peaks."start" - genes."end") END)) <= 100000\n' " ORDER BY ABS(" - 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL ' - 'WHEN peaks."start_pos" < genes."end_pos" ' - 'AND peaks."end_pos" > genes."start_pos" THEN 0 ' - 'WHEN peaks."end_pos" <= genes."start_pos" ' - 'THEN (genes."start_pos" - peaks."end_pos") ' - 'ELSE (peaks."start_pos" - genes."end_pos") END)\n' + 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL ' + 'WHEN peaks."start" < genes."end" ' + 'AND peaks."end" > genes."start" THEN 0 ' + 'WHEN peaks."end" <= genes."start" ' + 'THEN (genes."start" - peaks."end") ' + 'ELSE (peaks."start" - genes."end") END)\n' " LIMIT 5\n" " )" ) @@ -508,36 +508,36 @@ def test_giqlnearest_sql_stranded(self, tables_with_peaks_and_genes): expected = ( "SELECT * FROM peaks CROSS JOIN LATERAL (\n" " SELECT genes.*, " - 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL ' + 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL ' 'WHEN peaks."strand" IS NULL OR genes."strand" IS NULL THEN NULL ' "WHEN 
peaks.\"strand\" = '.' OR peaks.\"strand\" = '?' THEN NULL " "WHEN genes.\"strand\" = '.' OR genes.\"strand\" = '?' THEN NULL " - 'WHEN peaks."start_pos" < genes."end_pos" ' - 'AND peaks."end_pos" > genes."start_pos" THEN 0 ' - 'WHEN peaks."end_pos" <= genes."start_pos" ' + 'WHEN peaks."start" < genes."end" ' + 'AND peaks."end" > genes."start" THEN 0 ' + 'WHEN peaks."end" <= genes."start" ' "THEN CASE WHEN peaks.\"strand\" = '-' " - 'THEN -(genes."start_pos" - peaks."end_pos") ' - 'ELSE (genes."start_pos" - peaks."end_pos") END ' + 'THEN -(genes."start" - peaks."end") ' + 'ELSE (genes."start" - peaks."end") END ' "ELSE CASE WHEN peaks.\"strand\" = '-' " - 'THEN -(peaks."start_pos" - genes."end_pos") ' - 'ELSE (peaks."start_pos" - genes."end_pos") END END AS distance\n' + 'THEN -(peaks."start" - genes."end") ' + 'ELSE (peaks."start" - genes."end") END END AS distance\n' " FROM genes\n" - ' WHERE peaks."chromosome" = genes."chromosome" ' + ' WHERE peaks."chrom" = genes."chrom" ' 'AND peaks."strand" = genes."strand"\n' " ORDER BY ABS(" - 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL ' + 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL ' 'WHEN peaks."strand" IS NULL OR genes."strand" IS NULL THEN NULL ' "WHEN peaks.\"strand\" = '.' OR peaks.\"strand\" = '?' THEN NULL " "WHEN genes.\"strand\" = '.' OR genes.\"strand\" = '?' THEN NULL " - 'WHEN peaks."start_pos" < genes."end_pos" ' - 'AND peaks."end_pos" > genes."start_pos" THEN 0 ' - 'WHEN peaks."end_pos" <= genes."start_pos" ' + 'WHEN peaks."start" < genes."end" ' + 'AND peaks."end" > genes."start" THEN 0 ' + 'WHEN peaks."end" <= genes."start" ' "THEN CASE WHEN peaks.\"strand\" = '-' " - 'THEN -(genes."start_pos" - peaks."end_pos") ' - 'ELSE (genes."start_pos" - peaks."end_pos") END ' + 'THEN -(genes."start" - peaks."end") ' + 'ELSE (genes."start" - peaks."end") END ' "ELSE CASE WHEN peaks.\"strand\" = '-' " - 'THEN -(peaks."start_pos" - genes."end_pos") ' - 'ELSE (peaks."start_pos" - genes."end_pos") END END)\n' + 'THEN -(peaks."start" - genes."end") ' + 'ELSE (peaks."start" - genes."end") END END)\n' " LIMIT 3\n" " )" ) @@ -562,21 +562,21 @@ def test_giqlnearest_sql_signed(self, tables_with_peaks_and_genes): expected = ( "SELECT * FROM peaks CROSS JOIN LATERAL (\n" " SELECT genes.*, " - 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL ' - 'WHEN peaks."start_pos" < genes."end_pos" ' - 'AND peaks."end_pos" > genes."start_pos" THEN 0 ' - 'WHEN peaks."end_pos" <= genes."start_pos" ' - 'THEN (genes."start_pos" - peaks."end_pos") ' - 'ELSE -(peaks."start_pos" - genes."end_pos") END AS distance\n' + 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL ' + 'WHEN peaks."start" < genes."end" ' + 'AND peaks."end" > genes."start" THEN 0 ' + 'WHEN peaks."end" <= genes."start" ' + 'THEN (genes."start" - peaks."end") ' + 'ELSE -(peaks."start" - genes."end") END AS distance\n' " FROM genes\n" - ' WHERE peaks."chromosome" = genes."chromosome"\n' + ' WHERE peaks."chrom" = genes."chrom"\n' " ORDER BY ABS(" - 'CASE WHEN peaks."chromosome" != genes."chromosome" THEN NULL ' - 'WHEN peaks."start_pos" < genes."end_pos" ' - 'AND peaks."end_pos" > genes."start_pos" THEN 0 ' - 'WHEN peaks."end_pos" <= genes."start_pos" ' - 'THEN (genes."start_pos" - peaks."end_pos") ' - 'ELSE -(peaks."start_pos" - genes."end_pos") END)\n' + 'CASE WHEN peaks."chrom" != genes."chrom" THEN NULL ' + 'WHEN peaks."start" < genes."end" ' + 'AND peaks."end" > genes."start" THEN 0 ' + 'WHEN peaks."end" <= genes."start" ' + 'THEN (genes."start" - peaks."end") ' + 
'ELSE -(peaks."start" - genes."end") END)\n' " LIMIT 3\n" " )" ) @@ -645,11 +645,11 @@ def test_giqldistance_sql_basic(self, tables_with_two_tables): output = generator.generate(ast) expected = ( - 'SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL ' - 'WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" ' - 'THEN 0 WHEN a."end_pos" <= b."start_pos" ' - 'THEN (b."start_pos" - a."end_pos") ' - 'ELSE (a."start_pos" - b."end_pos") END AS dist ' + 'SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL ' + 'WHEN a."start" < b."end" AND a."end" > b."start" ' + 'THEN 0 WHEN a."end" <= b."start" ' + 'THEN (b."start" - a."end") ' + 'ELSE (a."start" - b."end") END AS dist ' "FROM features_a AS a CROSS JOIN features_b AS b" ) assert output == expected @@ -670,19 +670,19 @@ def test_giqldistance_sql_stranded(self, tables_with_two_tables): output = generator.generate(ast) expected = ( - 'SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL ' + 'SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL ' 'WHEN a."strand" IS NULL OR b."strand" IS NULL THEN NULL ' "WHEN a.\"strand\" = '.' OR a.\"strand\" = '?' THEN NULL " "WHEN b.\"strand\" = '.' OR b.\"strand\" = '?' THEN NULL " - 'WHEN a."start_pos" < b."end_pos" ' - 'AND a."end_pos" > b."start_pos" THEN 0 ' - 'WHEN a."end_pos" <= b."start_pos" ' + 'WHEN a."start" < b."end" ' + 'AND a."end" > b."start" THEN 0 ' + 'WHEN a."end" <= b."start" ' "THEN CASE WHEN a.\"strand\" = '-' " - 'THEN -(b."start_pos" - a."end_pos") ' - 'ELSE (b."start_pos" - a."end_pos") END ' + 'THEN -(b."start" - a."end") ' + 'ELSE (b."start" - a."end") END ' "ELSE CASE WHEN a.\"strand\" = '-' " - 'THEN -(a."start_pos" - b."end_pos") ' - 'ELSE (a."start_pos" - b."end_pos") END END AS dist ' + 'THEN -(a."start" - b."end") ' + 'ELSE (a."start" - b."end") END END AS dist ' "FROM features_a AS a CROSS JOIN features_b AS b" ) assert output == expected @@ -703,11 +703,11 @@ def test_giqldistance_sql_signed(self, tables_with_two_tables): output = generator.generate(ast) expected = ( - 'SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL ' - 'WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" ' - 'THEN 0 WHEN a."end_pos" <= b."start_pos" ' - 'THEN (b."start_pos" - a."end_pos") ' - 'ELSE -(a."start_pos" - b."end_pos") END AS dist ' + 'SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL ' + 'WHEN a."start" < b."end" AND a."end" > b."start" ' + 'THEN 0 WHEN a."end" <= b."start" ' + 'THEN (b."start" - a."end") ' + 'ELSE -(a."start" - b."end") END AS dist ' "FROM features_a AS a CROSS JOIN features_b AS b" ) assert output == expected @@ -729,19 +729,19 @@ def test_giqldistance_sql_stranded_and_signed(self, tables_with_two_tables): output = generator.generate(ast) expected = ( - 'SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL ' + 'SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL ' 'WHEN a."strand" IS NULL OR b."strand" IS NULL THEN NULL ' "WHEN a.\"strand\" = '.' OR a.\"strand\" = '?' THEN NULL " "WHEN b.\"strand\" = '.' OR b.\"strand\" = '?' 
THEN NULL " - 'WHEN a."start_pos" < b."end_pos" ' - 'AND a."end_pos" > b."start_pos" THEN 0 ' - 'WHEN a."end_pos" <= b."start_pos" ' + 'WHEN a."start" < b."end" ' + 'AND a."end" > b."start" THEN 0 ' + 'WHEN a."end" <= b."start" ' "THEN CASE WHEN a.\"strand\" = '-' " - 'THEN -(b."start_pos" - a."end_pos") ' - 'ELSE (b."start_pos" - a."end_pos") END ' + 'THEN -(b."start" - a."end") ' + 'ELSE (b."start" - a."end") END ' "ELSE CASE WHEN a.\"strand\" = '-' " - 'THEN (a."start_pos" - b."end_pos") ' - 'ELSE -(a."start_pos" - b."end_pos") END END AS dist ' + 'THEN (a."start" - b."end") ' + 'ELSE -(a."start" - b."end") END END AS dist ' "FROM features_a AS a CROSS JOIN features_b AS b" ) assert output == expected @@ -754,7 +754,7 @@ def test_giqldistance_with_closed_intervals(self, tables_with_closed_intervals): """ # Add a second table with closed intervals for distance calculation tables_with_closed_intervals.register( - "bed_features_b", Table(interval_type="closed") + "bed_features_b", Table("bed_features_b", interval_type="closed") ) sql = ( @@ -767,12 +767,12 @@ def test_giqldistance_with_closed_intervals(self, tables_with_closed_intervals): output = generator.generate(ast) expected = ( - 'SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL ' - 'WHEN a."start_pos" < b."end_pos" ' - 'AND a."end_pos" > b."start_pos" THEN 0 ' - 'WHEN a."end_pos" <= b."start_pos" ' - 'THEN (b."start_pos" - a."end_pos" + 1) ' - 'ELSE (a."start_pos" - b."end_pos" + 1) END AS dist ' + 'SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL ' + 'WHEN a."start" < b."end" ' + 'AND a."end" > b."start" THEN 0 ' + 'WHEN a."end" <= b."start" ' + 'THEN (b."start" - a."end" + 1) ' + 'ELSE (a."start" - b."end" + 1) END AS dist ' "FROM bed_features AS a CROSS JOIN bed_features_b AS b" ) assert output == expected @@ -873,7 +873,7 @@ def test_giqlnearest_sql_closed_intervals(self): THEN Distance calculation includes +1 adjustment for bedtools compatibility. """ tables = Tables() - tables.register("genes_closed", Table(interval_type="closed")) + tables.register("genes_closed", Table("genes_closed", interval_type="closed")) sql = "SELECT * FROM NEAREST(genes_closed, reference='chr1:1000-2000', k=3)" ast = parse_one(sql, dialect=GIQLDialect) @@ -939,7 +939,7 @@ def test_giqlnearest_sql_outer_table_not_in_tables(self): THEN ValueError is raised listing the issue. 
""" tables = Tables() - tables.register("genes", Table()) + tables.register("genes", Table("genes")) nearest = GIQLNearest( this=exp.Table(this=exp.Identifier(this="genes")), @@ -1010,7 +1010,7 @@ def test_intersects_sql_unqualified_column(self): expected = ( "SELECT * FROM variants WHERE " - '("chromosome" = \'chr1\' AND "start_pos" < 2000 AND "end_pos" > 1000)' + '("chrom" = \'chr1\' AND "start" < 2000 AND "end" > 1000)' ) assert output == expected diff --git a/tests/test_distance_transpilation.py b/tests/test_distance_transpilation.py index 77d434c..7b79011 100644 --- a/tests/test_distance_transpilation.py +++ b/tests/test_distance_transpilation.py @@ -27,7 +27,7 @@ def test_distance_transpilation_duckdb(self): generator = BaseGIQLGenerator() output = generator.generate(ast) - expected = """SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" THEN 0 WHEN a."end_pos" <= b."start_pos" THEN (b."start_pos" - a."end_pos") ELSE (a."start_pos" - b."end_pos") END AS dist FROM features_a AS a CROSS JOIN features_b AS b""" + expected = """SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL WHEN a."start" < b."end" AND a."end" > b."start" THEN 0 WHEN a."end" <= b."start" THEN (b."start" - a."end") ELSE (a."start" - b."end") END AS dist FROM features_a AS a CROSS JOIN features_b AS b""" assert output == expected, f"Expected:\n{expected}\n\nGot:\n{output}" @@ -46,7 +46,7 @@ def test_distance_transpilation_sqlite(self): generator = BaseGIQLGenerator() output = generator.generate(ast) - expected = """SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" THEN 0 WHEN a."end_pos" <= b."start_pos" THEN (b."start_pos" - a."end_pos") ELSE (a."start_pos" - b."end_pos") END AS dist FROM features_a AS a, features_b AS b""" + expected = """SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL WHEN a."start" < b."end" AND a."end" > b."start" THEN 0 WHEN a."end" <= b."start" THEN (b."start" - a."end") ELSE (a."start" - b."end") END AS dist FROM features_a AS a, features_b AS b""" assert output == expected, f"Expected:\n{expected}\n\nGot:\n{output}" @@ -65,7 +65,7 @@ def test_distance_transpilation_postgres(self): generator = BaseGIQLGenerator() output = generator.generate(ast) - expected = """SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" THEN 0 WHEN a."end_pos" <= b."start_pos" THEN (b."start_pos" - a."end_pos") ELSE (a."start_pos" - b."end_pos") END AS dist FROM features_a AS a CROSS JOIN features_b AS b""" + expected = """SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL WHEN a."start" < b."end" AND a."end" > b."start" THEN 0 WHEN a."end" <= b."start" THEN (b."start" - a."end") ELSE (a."start" - b."end") END AS dist FROM features_a AS a CROSS JOIN features_b AS b""" assert output == expected, f"Expected:\n{expected}\n\nGot:\n{output}" @@ -88,10 +88,10 @@ def test_distance_transpilation_signed_duckdb(self): # Signed distance: upstream (B before A) returns negative, # downstream (B after A) returns positive expected = ( - 'SELECT CASE WHEN a."chromosome" != b."chromosome" THEN NULL ' - 'WHEN a."start_pos" < b."end_pos" AND a."end_pos" > b."start_pos" THEN 0 ' - 'WHEN a."end_pos" <= b."start_pos" THEN (b."start_pos" - a."end_pos") ' - 'ELSE -(a."start_pos" - b."end_pos") END AS dist ' + 'SELECT CASE WHEN a."chrom" != b."chrom" THEN NULL ' + 'WHEN a."start" < b."end" AND a."end" > b."start" THEN 0 ' + 
'WHEN a."end" <= b."start" THEN (b."start" - a."end") ' + 'ELSE -(a."start" - b."end") END AS dist ' "FROM features_a AS a CROSS JOIN features_b AS b" ) diff --git a/tests/test_distance_udf.py b/tests/test_distance_udf.py index 3048c33..ee8f624 100644 --- a/tests/test_distance_udf.py +++ b/tests/test_distance_udf.py @@ -26,9 +26,9 @@ def test_overlapping_intervals_return_zero(self): SELECT DISTANCE(a.interval, b.interval) as distance FROM - (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos) a + (SELECT 'chr1' as chrom, 100 as start, 200 as end) a CROSS JOIN - (SELECT 'chr1' as chromosome, 150 as start_pos, 250 as end_pos) b + (SELECT 'chr1' as chrom, 150 as start, 250 as end) b """ # Parse and generate SQL @@ -64,9 +64,9 @@ def test_non_overlapping_intervals_return_positive_distance(self): SELECT DISTANCE(a.interval, b.interval) as distance FROM - (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos) a + (SELECT 'chr1' as chrom, 100 as start, 200 as end) a CROSS JOIN - (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos) b + (SELECT 'chr1' as chrom, 300 as start, 400 as end) b """ ast = parse_one(sql, dialect=GIQLDialect) @@ -91,9 +91,9 @@ def test_different_chromosomes_return_null(self): SELECT DISTANCE(a.interval, b.interval) as distance FROM - (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos) a + (SELECT 'chr1' as chrom, 100 as start, 200 as end) a CROSS JOIN - (SELECT 'chr2' as chromosome, 150 as start_pos, 250 as end_pos) b + (SELECT 'chr2' as chrom, 150 as start, 250 as end) b """ ast = parse_one(sql, dialect=GIQLDialect) @@ -122,9 +122,9 @@ def test_adjacent_bookended_intervals_return_zero(self): SELECT DISTANCE(a.interval, b.interval) as distance FROM - (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos) a + (SELECT 'chr1' as chrom, 100 as start, 200 as end) a CROSS JOIN - (SELECT 'chr1' as chromosome, 200 as start_pos, 300 as end_pos) b + (SELECT 'chr1' as chrom, 200 as start, 300 as end) b """ ast = parse_one(sql, dialect=GIQLDialect) @@ -154,9 +154,9 @@ def test_zero_width_intervals_point_features(self): SELECT DISTANCE(a.interval, b.interval) as distance FROM - (SELECT 'chr1' as chromosome, 150 as start_pos, 150 as end_pos) a + (SELECT 'chr1' as chrom, 150 as start, 150 as end) a CROSS JOIN - (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos) b + (SELECT 'chr1' as chrom, 300 as start, 400 as end) b """ ast = parse_one(sql, dialect=GIQLDialect) @@ -185,9 +185,9 @@ def test_stranded_same_strand_plus(self): SELECT DISTANCE(a.interval, b.interval, stranded=true) as distance FROM - (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '+' as strand) a + (SELECT 'chr1' as chrom, 100 as start, 200 as end, '+' as strand) a CROSS JOIN - (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '+' as strand) b + (SELECT 'chr1' as chrom, 300 as start, 400 as end, '+' as strand) b """ ast = parse_one(sql, dialect=GIQLDialect) @@ -212,9 +212,9 @@ def test_stranded_same_strand_minus(self): SELECT DISTANCE(a.interval, b.interval, stranded=true) as distance FROM - (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '-' as strand) a + (SELECT 'chr1' as chrom, 100 as start, 200 as end, '-' as strand) a CROSS JOIN - (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '-' as strand) b + (SELECT 'chr1' as chrom, 300 as start, 400 as end, '-' as strand) b """ ast = parse_one(sql, dialect=GIQLDialect) @@ -239,9 +239,9 @@ def test_stranded_different_strands_calculates_distance(self): 
SELECT DISTANCE(a.interval, b.interval, stranded=true) as distance FROM - (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '+' as strand) a + (SELECT 'chr1' as chrom, 100 as start, 200 as end, '+' as strand) a CROSS JOIN - (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '-' as strand) b + (SELECT 'chr1' as chrom, 300 as start, 400 as end, '-' as strand) b """ ast = parse_one(sql, dialect=GIQLDialect) @@ -266,9 +266,9 @@ def test_stranded_different_strands_minus_first(self): SELECT DISTANCE(a.interval, b.interval, stranded=true) as distance FROM - (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '-' as strand) a + (SELECT 'chr1' as chrom, 100 as start, 200 as end, '-' as strand) a CROSS JOIN - (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '+' as strand) b + (SELECT 'chr1' as chrom, 300 as start, 400 as end, '+' as strand) b """ ast = parse_one(sql, dialect=GIQLDialect) @@ -293,9 +293,9 @@ def test_stranded_dot_strand_returns_null(self): SELECT DISTANCE(a.interval, b.interval, stranded=true) as distance FROM - (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '.' as strand) a + (SELECT 'chr1' as chrom, 100 as start, 200 as end, '.' as strand) a CROSS JOIN - (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '.' as strand) b + (SELECT 'chr1' as chrom, 300 as start, 400 as end, '.' as strand) b """ ast = parse_one(sql, dialect=GIQLDialect) @@ -320,9 +320,9 @@ def test_stranded_question_mark_strand_returns_null(self): SELECT DISTANCE(a.interval, b.interval, stranded=true) as distance FROM - (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '?' as strand) a + (SELECT 'chr1' as chrom, 100 as start, 200 as end, '?' as strand) a CROSS JOIN - (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '+' as strand) b + (SELECT 'chr1' as chrom, 300 as start, 400 as end, '+' as strand) b """ ast = parse_one(sql, dialect=GIQLDialect) @@ -347,9 +347,9 @@ def test_stranded_null_strand_returns_null(self): SELECT DISTANCE(a.interval, b.interval, stranded=true) as distance FROM - (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, NULL as strand) a + (SELECT 'chr1' as chrom, 100 as start, 200 as end, NULL as strand) a CROSS JOIN - (SELECT 'chr1' as chromosome, 300 as start_pos, 400 as end_pos, '+' as strand) b + (SELECT 'chr1' as chrom, 300 as start, 400 as end, '+' as strand) b """ ast = parse_one(sql, dialect=GIQLDialect) @@ -374,9 +374,9 @@ def test_stranded_overlapping_intervals_minus_strand(self): SELECT DISTANCE(a.interval, b.interval, stranded=true) as distance FROM - (SELECT 'chr1' as chromosome, 100 as start_pos, 200 as end_pos, '-' as strand) a + (SELECT 'chr1' as chrom, 100 as start, 200 as end, '-' as strand) a CROSS JOIN - (SELECT 'chr1' as chromosome, 150 as start_pos, 250 as end_pos, '-' as strand) b + (SELECT 'chr1' as chrom, 150 as start, 250 as end, '-' as strand) b """ ast = parse_one(sql, dialect=GIQLDialect) diff --git a/tests/test_nearest_transpilation.py b/tests/test_nearest_transpilation.py index b8c7d0e..de57c98 100644 --- a/tests/test_nearest_transpilation.py +++ b/tests/test_nearest_transpilation.py @@ -17,8 +17,8 @@ def tables_with_peaks_and_genes(): """Tables container with peaks and genes tables.""" tables = Tables() - tables.register("peaks", Table()) - tables.register("genes", Table()) + tables.register("peaks", Table("peaks")) + tables.register("genes", Table("genes")) return tables diff --git a/tests/test_transpile.py b/tests/test_transpile.py index 
7cef9d7..ea7ed8b 100644 --- a/tests/test_transpile.py +++ b/tests/test_transpile.py @@ -23,9 +23,9 @@ def test_transpile_intersects_literal(self): assert "SELECT" in sql assert "peaks" in sql - assert "chromosome" in sql - assert "start_pos" in sql - assert "end_pos" in sql + assert "chrom" in sql + assert "start" in sql + assert "end" in sql assert "chr1" in sql def test_transpile_contains_literal(self): @@ -82,25 +82,26 @@ def test_transpile_custom_columns(self): """ sql = transpile( "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", - tables={ - "peaks": Table( + tables=[ + Table( + "peaks", genomic_col="interval", - chrom_col="chrom", - start_col="start", - end_col="end", + chrom_col="chromosome", + start_col="start_pos", + end_col="end_pos", ) - }, + ], ) assert "SELECT" in sql assert "peaks" in sql - assert '"chrom"' in sql - assert '"start"' in sql - assert '"end"' in sql + assert '"chromosome"' in sql + assert '"start_pos"' in sql + assert '"end_pos"' in sql # Should NOT contain default column names - assert "chromosome" not in sql - assert "start_pos" not in sql - assert "end_pos" not in sql + assert '"chrom"' not in sql + assert '"start"' not in sql + assert '"end"' not in sql def test_transpile_no_strand_column(self): """ @@ -110,9 +111,7 @@ def test_transpile_no_strand_column(self): """ sql = transpile( "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", - tables={ - "peaks": Table(strand_col=None) - }, + tables=[Table("peaks", strand_col=None)], ) assert "SELECT" in sql @@ -133,10 +132,10 @@ def test_transpile_join_intersects(self): FROM peaks a JOIN genes b ON a.interval INTERSECTS b.region """, - tables={ - "peaks": Table(genomic_col="interval"), - "genes": Table(genomic_col="region"), - }, + tables=[ + Table("peaks", genomic_col="interval"), + Table("genes", genomic_col="region"), + ], ) assert "SELECT" in sql @@ -156,20 +155,22 @@ def test_transpile_different_schemas(self): FROM peaks a JOIN features b ON a.interval INTERSECTS b.location """, - tables={ - "peaks": Table( + tables=[ + Table( + "peaks", genomic_col="interval", chrom_col="chromosome", start_col="start_pos", end_col="end_pos", ), - "features": Table( + Table( + "features", genomic_col="location", chrom_col="seqname", start_col="begin", end_col="terminus", ), - }, + ], ) assert "SELECT" in sql From 5ca03c299dc78f9fb08fc96c01ac0fa0dd800a15 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 10 Feb 2026 01:27:15 -0500 Subject: [PATCH 05/12] Major documentation rewrite and reorg. 
--- docs/api/index.rst | 12 - docs/conf.py | 3 +- .../aggregation-operators.rst | 226 ++++----- .../distance-operators.rst | 278 ++++++----- docs/{operators => dialect}/index.rst | 17 +- docs/{operators => dialect}/quantifiers.rst | 158 +++---- .../spatial-operators.rst | 102 ++-- .../syntax-reference.rst | 60 +-- docs/guides/engine.rst | 195 ++++++++ docs/guides/index.rst | 18 +- docs/guides/multi-backend.rst | 367 --------------- docs/guides/performance.rst | 297 ++++-------- docs/guides/quickstart.rst | 175 +++++++ docs/guides/schema-mapping.rst | 436 +++++------------- docs/guides/transpilation.rst | 417 ----------------- docs/index.rst | 151 ++---- docs/quickstart.rst | 228 --------- docs/recipes/advanced-queries.rst | 431 ++++++++--------- docs/recipes/bedtools-migration.rst | 398 +++++++--------- docs/recipes/clustering-queries.rst | 424 ++++++++--------- docs/recipes/distance-queries.rst | 396 ++++++++-------- docs/recipes/index.rst | 37 +- docs/recipes/intersect-queries.rst | 306 ++++++------ docs/transpilation/api-reference.rst | 13 + docs/transpilation/execution.rst | 152 ++++++ docs/transpilation/index.rst | 210 +++++++++ src/giql/__init__.py | 7 - 27 files changed, 2204 insertions(+), 3310 deletions(-) delete mode 100644 docs/api/index.rst rename docs/{operators => dialect}/aggregation-operators.rst (63%) rename docs/{operators => dialect}/distance-operators.rst (55%) rename docs/{operators => dialect}/index.rst (89%) rename docs/{operators => dialect}/quantifiers.rst (61%) rename docs/{operators => dialect}/spatial-operators.rst (75%) rename docs/{reference => dialect}/syntax-reference.rst (78%) create mode 100644 docs/guides/engine.rst delete mode 100644 docs/guides/multi-backend.rst create mode 100644 docs/guides/quickstart.rst delete mode 100644 docs/guides/transpilation.rst delete mode 100644 docs/quickstart.rst create mode 100644 docs/transpilation/api-reference.rst create mode 100644 docs/transpilation/execution.rst create mode 100644 docs/transpilation/index.rst diff --git a/docs/api/index.rst b/docs/api/index.rst deleted file mode 100644 index a17dc9e..0000000 --- a/docs/api/index.rst +++ /dev/null @@ -1,12 +0,0 @@ -API Reference -============= - -This section documents the GIQL Python API. - -.. toctree:: - :maxdepth: 2 - -.. automodule:: giql - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/conf.py b/docs/conf.py index 1d38676..9a28ad8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -25,6 +25,7 @@ "sphinx.ext.viewcode", "sphinx.ext.intersphinx", "sphinx.ext.autosummary", + "sphinx_design", ] # Napoleon settings @@ -69,5 +70,5 @@ # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = "sphinx_rtd_theme" +html_theme = "sphinx_book_theme" # html_static_path = ['_static'] # Uncomment when you have custom static files diff --git a/docs/operators/aggregation-operators.rst b/docs/dialect/aggregation-operators.rst similarity index 63% rename from docs/operators/aggregation-operators.rst rename to docs/dialect/aggregation-operators.rst index 50d10da..cc3d5ec 100644 --- a/docs/operators/aggregation-operators.rst +++ b/docs/dialect/aggregation-operators.rst @@ -1,5 +1,5 @@ -Aggregation Operators -===================== +Aggregation +=========== Aggregation operators combine and cluster genomic intervals. 
These operators are essential for reducing complex interval data into summarized regions, such as @@ -7,7 +7,7 @@ merging overlapping peaks or identifying clusters of related features. .. contents:: :local: - :depth: 2 + :depth: 1 .. _cluster-operator: @@ -51,7 +51,7 @@ Parameters ~~~~~~~~~~ **interval** - A genomic column registered with the engine. + A genomic column. **distance** *(optional)* Maximum gap between intervals to consider them part of the same cluster. @@ -73,91 +73,81 @@ Examples Assign cluster IDs to overlapping intervals: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval) AS cluster_id - FROM features - ORDER BY chromosome, start_pos - """) + SELECT + *, + CLUSTER(interval) AS cluster_id + FROM features + ORDER BY chrom, start **Distance-Based Clustering:** Cluster intervals within 1000bp of each other: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval, 1000) AS cluster_id - FROM features - ORDER BY chromosome, start_pos - """) + SELECT + *, + CLUSTER(interval, 1000) AS cluster_id + FROM features + ORDER BY chrom, start **Strand-Specific Clustering:** Cluster intervals separately by strand: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval, stranded=true) AS cluster_id - FROM features - ORDER BY chromosome, strand, start_pos - """) + SELECT + *, + CLUSTER(interval, stranded=true) AS cluster_id + FROM features + ORDER BY chrom, strand, start **Analyze Cluster Statistics:** Count features per cluster: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - WITH clustered AS ( - SELECT - *, - CLUSTER(interval) AS cluster_id - FROM features - ) + WITH clustered AS ( SELECT - chromosome, - cluster_id, - COUNT(*) AS feature_count, - MIN(start_pos) AS cluster_start, - MAX(end_pos) AS cluster_end - FROM clustered - GROUP BY chromosome, cluster_id - ORDER BY chromosome, cluster_start - """) + *, + CLUSTER(interval) AS cluster_id + FROM features + ) + SELECT + chrom, + cluster_id, + COUNT(*) AS feature_count, + MIN(start) AS cluster_start, + MAX(end) AS cluster_end + FROM clustered + GROUP BY chrom, cluster_id + ORDER BY chrom, cluster_start **Filter by Cluster Size:** Find regions with multiple overlapping features: -.. code-block:: python - - cursor = engine.execute(""" - WITH clustered AS ( - SELECT - *, - CLUSTER(interval) AS cluster_id - FROM features - ), - cluster_sizes AS ( - SELECT cluster_id, COUNT(*) AS size - FROM clustered - GROUP BY cluster_id - ) - SELECT c.* - FROM clustered c - INNER JOIN cluster_sizes s ON c.cluster_id = s.cluster_id - WHERE s.size >= 3 - """) +.. code-block:: sql + + WITH clustered AS ( + SELECT + *, + CLUSTER(interval) AS cluster_id + FROM features + ), + cluster_sizes AS ( + SELECT cluster_id, COUNT(*) AS size + FROM clustered + GROUP BY cluster_id + ) + SELECT c.* + FROM clustered c + INNER JOIN cluster_sizes s ON c.cluster_id = s.cluster_id + WHERE s.size >= 3 Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ @@ -239,7 +229,7 @@ Parameters ~~~~~~~~~~ **interval** - A genomic column registered with the engine. + A genomic column. **distance** *(optional)* Maximum gap between intervals to merge. 
Default: ``0`` (only overlapping @@ -253,9 +243,9 @@ Return Value Returns merged interval coordinates: -- ``chromosome`` - Chromosome of the merged region -- ``start_pos`` - Start position of the merged region -- ``end_pos`` - End position of the merged region +- ``chrom`` - Chromosome of the merged region +- ``start`` - Start position of the merged region +- ``end`` - End position of the merged region - ``strand`` - Strand (if ``stranded=true``) Examples @@ -265,108 +255,92 @@ Examples Merge all overlapping intervals: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval) - FROM features - """) + SELECT MERGE(interval) + FROM features - # Returns: chromosome, start_pos, end_pos for each merged region + -- Returns: chrom, start, end for each merged region **Distance-Based Merge:** Merge intervals within 1000bp of each other: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval, 1000) - FROM features - """) + SELECT MERGE(interval, 1000) + FROM features **Strand-Specific Merge:** Merge intervals separately by strand: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval, stranded=true) - FROM features - """) + SELECT MERGE(interval, stranded=true) + FROM features **Merge with Feature Count:** Count how many features were merged into each region: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - COUNT(*) AS feature_count - FROM features - """) + SELECT + MERGE(interval), + COUNT(*) AS feature_count + FROM features **Merge with Aggregations:** Calculate statistics for merged regions: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - COUNT(*) AS feature_count, - AVG(score) AS avg_score, - MAX(score) AS max_score - FROM features - """) + SELECT + MERGE(interval), + COUNT(*) AS feature_count, + AVG(score) AS avg_score, + MAX(score) AS max_score + FROM features **Collect Merged Feature Names:** List the names of features that were merged: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - STRING_AGG(name, ',') AS feature_names - FROM features - """) + SELECT + MERGE(interval), + STRING_AGG(name, ',') AS feature_names + FROM features **Merge by Chromosome:** Process each chromosome separately (explicit grouping): -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - chromosome, - MERGE(interval), - COUNT(*) AS feature_count - FROM features - GROUP BY chromosome - ORDER BY chromosome - """) + SELECT + chrom, + MERGE(interval), + COUNT(*) AS feature_count + FROM features + GROUP BY chrom + ORDER BY chrom **Calculate Total Coverage:** Calculate the total base pairs covered after merging: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - WITH merged AS ( - SELECT MERGE(interval) AS merged_pos - FROM features - ) - SELECT SUM(end_pos - start_pos) AS total_coverage - FROM merged - """) + WITH merged AS ( + SELECT MERGE(interval) AS merged_pos + FROM features + ) + SELECT SUM(end - start) AS total_coverage + FROM merged Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/operators/distance-operators.rst b/docs/dialect/distance-operators.rst similarity index 55% rename from docs/operators/distance-operators.rst rename to docs/dialect/distance-operators.rst index 7ceccf3..216bdcb 100644 --- a/docs/operators/distance-operators.rst +++ b/docs/dialect/distance-operators.rst @@ -1,5 +1,5 @@ -Distance and Proximity Operators -================================ +Distance and Proximity +====================== Distance and proximity operators calculate genomic distances and find nearest features. These operators are essential for proximity analysis, such as finding genes near @@ -7,7 +7,7 @@ regulatory elements or variants near transcription start sites. .. contents:: :local: - :depth: 2 + :depth: 1 .. _distance-operator: @@ -37,7 +37,7 @@ Parameters ~~~~~~~~~~ **interval_a** - A genomic column registered with the engine. + A genomic column. **interval_b** Another genomic column to measure distance to. @@ -56,52 +56,46 @@ Examples Calculate distance between peaks and genes: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - p.name AS peak, - g.name AS gene, - DISTANCE(p.interval, g.interval) AS distance - FROM peaks p - CROSS JOIN genes g - WHERE p.chromosome = g.chromosome - ORDER BY p.name, distance - """) + SELECT + p.name AS peak, + g.name AS gene, + DISTANCE(p.interval, g.interval) AS distance + FROM peaks p + CROSS JOIN genes g + WHERE p.chrom = g.chrom + ORDER BY p.name, distance **Filter by Distance:** Find features within 10kb of each other: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.name, b.name, DISTANCE(a.interval, b.interval) AS dist - FROM features_a a - CROSS JOIN features_b b - WHERE a.chromosome = b.chromosome - AND DISTANCE(a.interval, b.interval) <= 10000 - """) + SELECT a.name, b.name, DISTANCE(a.interval, b.interval) AS dist + FROM features_a a + CROSS JOIN features_b b + WHERE a.chrom = b.chrom + AND DISTANCE(a.interval, b.interval) <= 10000 **Identify Overlapping vs. Proximal:** Distinguish between overlapping and nearby features: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - p.name, - g.name, - CASE - WHEN DISTANCE(p.interval, g.interval) = 0 THEN 'overlapping' - WHEN DISTANCE(p.interval, g.interval) <= 1000 THEN 'proximal' - ELSE 'distant' - END AS relationship - FROM peaks p - CROSS JOIN genes g - WHERE p.chromosome = g.chromosome - """) +.. 
code-block:: sql + + SELECT + p.name, + g.name, + CASE + WHEN DISTANCE(p.interval, g.interval) = 0 THEN 'overlapping' + WHEN DISTANCE(p.interval, g.interval) <= 1000 THEN 'proximal' + ELSE 'distant' + END AS relationship + FROM peaks p + CROSS JOIN genes g + WHERE p.chrom = g.chrom Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ @@ -126,7 +120,7 @@ Backend Compatibility Performance Notes ~~~~~~~~~~~~~~~~~ -- Always include ``WHERE a.chromosome = b.chromosome`` to avoid unnecessary +- Always include ``WHERE a.chrom = b.chrom`` to avoid unnecessary cross-chromosome comparisons - For large datasets, consider pre-filtering by region before calculating distances - Create indexes on chromosome and position columns for better performance @@ -219,136 +213,124 @@ Examples Find the 3 nearest genes for each peak: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest - ORDER BY peaks.name, nearest.distance - """) + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest + ORDER BY peaks.name, nearest.distance **Standalone Query:** Find 5 nearest genes to a specific genomic location: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT gene_name, distance - FROM NEAREST(genes, reference='chr1:1000000-1001000', k=5) - ORDER BY distance - """) + SELECT gene_name, distance + FROM NEAREST(genes, reference='chr1:1000000-1001000', k=5) + ORDER BY distance **Distance-Constrained Search:** Find nearest features within 100kb: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=5, - max_distance=100000 - ) AS nearest - ORDER BY peaks.name, nearest.distance - """) +.. code-block:: sql + + SELECT + peaks.name, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=100000 + ) AS nearest + ORDER BY peaks.name, nearest.distance **Strand-Specific Nearest Neighbors:** Find nearest same-strand features: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name, - nearest.name AS gene, - nearest.strand, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=3, - stranded=true - ) AS nearest - ORDER BY peaks.name, nearest.distance - """) +.. code-block:: sql + + SELECT + peaks.name, + nearest.name AS gene, + nearest.strand, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=3, + stranded=true + ) AS nearest + ORDER BY peaks.name, nearest.distance **Directional (Upstream/Downstream) Queries:** Find upstream features using signed distances: -.. 
code-block:: python - - # Upstream features have negative distances - cursor = engine.execute(""" - SELECT - peaks.name, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=10, - signed=true - ) AS nearest - WHERE nearest.distance < 0 - ORDER BY peaks.name, nearest.distance DESC - """) - - # Downstream features have positive distances - cursor = engine.execute(""" - SELECT - peaks.name, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=10, - signed=true - ) AS nearest - WHERE nearest.distance > 0 - ORDER BY peaks.name, nearest.distance - """) +.. code-block:: sql + + -- Upstream features have negative distances + SELECT + peaks.name, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=10, + signed=true + ) AS nearest + WHERE nearest.distance < 0 + ORDER BY peaks.name, nearest.distance DESC + +.. code-block:: sql + + -- Downstream features have positive distances + SELECT + peaks.name, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=10, + signed=true + ) AS nearest + WHERE nearest.distance > 0 + ORDER BY peaks.name, nearest.distance **Combined Parameters:** Find nearby same-strand features within distance constraints: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=5, - max_distance=50000, - stranded=true, - signed=true - ) AS nearest - WHERE nearest.distance BETWEEN -10000 AND 10000 - ORDER BY peaks.name, ABS(nearest.distance) - """) +.. code-block:: sql + + SELECT + peaks.name, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=50000, + stranded=true, + signed=true + ) AS nearest + WHERE nearest.distance BETWEEN -10000 AND 10000 + ORDER BY peaks.name, ABS(nearest.distance) Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ @@ -376,15 +358,13 @@ Performance Notes - **Chromosome pre-filtering**: NEAREST automatically filters by chromosome for efficiency - **Use max_distance**: Specifying a maximum distance reduces the search space significantly - **Limit k**: Only request as many neighbors as you actually need -- **Create indexes**: Add indexes on ``(chromosome, start_pos, end_pos)`` for better performance +- **Create indexes**: Add indexes on ``(chrom, start, "end")`` for better performance -.. code-block:: python +.. code-block:: sql - # Create indexes for better NEAREST performance - engine.conn.execute(""" - CREATE INDEX idx_genes_position - ON genes (chromosome, start_pos, end_pos) - """) + -- Create indexes for better NEAREST performance + CREATE INDEX idx_genes_position + ON genes (chrom, start, "end") Related Operators ~~~~~~~~~~~~~~~~~ diff --git a/docs/operators/index.rst b/docs/dialect/index.rst similarity index 89% rename from docs/operators/index.rst rename to docs/dialect/index.rst index ce24f17..48e7bb2 100644 --- a/docs/operators/index.rst +++ b/docs/dialect/index.rst @@ -1,15 +1,13 @@ -GIQL Operators -============== +Operators +========= GIQL extends SQL with operators specifically designed for genomic interval queries. 
These operators enable powerful spatial reasoning over genomic coordinates without requiring complex SQL expressions. -Operators are organized by functionality: - -.. contents:: - :local: - :depth: 1 +Operators are organized by functionality. All operators work across supported +database backends (DuckDB, SQLite, with PostgreSQL planned). Each operator page +includes a compatibility table showing backend support status. Spatial Relationship Operators ------------------------------ @@ -98,11 +96,6 @@ Apply operators to multiple ranges simultaneously. See :doc:`quantifiers` for detailed documentation. -Operator Compatibility ----------------------- - -All operators work across supported database backends (DuckDB, SQLite, with PostgreSQL planned). -Each operator page includes a compatibility table showing backend support status. .. toctree:: :maxdepth: 2 diff --git a/docs/operators/quantifiers.rst b/docs/dialect/quantifiers.rst similarity index 61% rename from docs/operators/quantifiers.rst rename to docs/dialect/quantifiers.rst index cffb71d..b10a38b 100644 --- a/docs/operators/quantifiers.rst +++ b/docs/dialect/quantifiers.rst @@ -7,7 +7,7 @@ specified ranges in a single query. .. contents:: :local: - :depth: 2 + :depth: 1 .. _any-quantifier: @@ -47,7 +47,7 @@ Parameters ~~~~~~~~~~ **interval** - A genomic column registered with the engine. + A genomic column. **ranges** A comma-separated list of genomic range literals. @@ -65,60 +65,52 @@ Examples Find variants in any of several regions of interest: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS ANY( - 'chr1:1000-2000', - 'chr1:5000-6000', - 'chr2:1000-3000' - ) - """) + SELECT * FROM variants + WHERE interval INTERSECTS ANY( + 'chr1:1000-2000', + 'chr1:5000-6000', + 'chr2:1000-3000' + ) **Check Against Gene Promoters:** Find features overlapping any of a set of promoter regions: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM peaks - WHERE interval INTERSECTS ANY( - 'chr1:11869-12869', -- Gene A promoter - 'chr1:29554-30554', -- Gene B promoter - 'chr1:69091-70091' -- Gene C promoter - ) - """) + SELECT * FROM peaks + WHERE interval INTERSECTS ANY( + 'chr1:11869-12869', -- Gene A promoter + 'chr1:29554-30554', -- Gene B promoter + 'chr1:69091-70091' -- Gene C promoter + ) **Combine with Other Filters:** Filter by multiple regions and additional criteria: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000') - AND quality >= 30 - AND filter = 'PASS' - """) + SELECT * FROM variants + WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000') + AND quality >= 30 + AND filter = 'PASS' **Multi-Chromosome Query:** Query across different chromosomes efficiently: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM features - WHERE interval INTERSECTS ANY( - 'chr1:100000-200000', - 'chr2:100000-200000', - 'chr3:100000-200000', - 'chrX:100000-200000' - ) - """) + SELECT * FROM features + WHERE interval INTERSECTS ANY( + 'chr1:100000-200000', + 'chr2:100000-200000', + 'chr3:100000-200000', + 'chrX:100000-200000' + ) Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ @@ -190,7 +182,7 @@ Parameters ~~~~~~~~~~ **interval** - A genomic column registered with the engine. + A genomic column. **ranges** A comma-separated list of genomic range literals. 
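Both quantifiers expand to ordinary boolean predicates at transpile time: each
range literal becomes one coordinate comparison, and the comparisons are joined
with ``OR`` for ``ANY`` or ``AND`` for ``ALL``. A minimal sketch of the
generated SQL for the default ``chrom``/``start``/``end`` schema, based on the
generator tests in this series (exact quoting and whitespace may differ):

.. code-block:: sql

   -- WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000')
   -- transpiles to roughly:
   SELECT * FROM variants
   WHERE ("chrom" = 'chr1' AND "start" < 2000 AND "end" > 1000)
      OR ("chrom" = 'chr1' AND "start" < 6000 AND "end" > 5000)

   -- ALL(...) produces the same predicates joined with AND instead of OR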
@@ -208,49 +200,43 @@ Examples Find genes that contain all specified SNP positions: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM genes - WHERE interval CONTAINS ALL( - 'chr1:1500', - 'chr1:1600', - 'chr1:1700' - ) - """) + SELECT * FROM genes + WHERE interval CONTAINS ALL( + 'chr1:1500', + 'chr1:1600', + 'chr1:1700' + ) **Ensure Complete Coverage:** Find intervals that span a set of required positions: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM features - WHERE interval CONTAINS ALL( - 'chr1:10000', - 'chr1:15000', - 'chr1:20000' - ) - """) + SELECT * FROM features + WHERE interval CONTAINS ALL( + 'chr1:10000', + 'chr1:15000', + 'chr1:20000' + ) **Find Overlapping Regions:** Find features that overlap with all specified windows (useful for finding features in the intersection of multiple regions): -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM features - WHERE interval INTERSECTS ALL( - 'chr1:1000-2000', - 'chr1:1500-2500' - ) - """) + SELECT * FROM features + WHERE interval INTERSECTS ALL( + 'chr1:1000-2000', + 'chr1:1500-2500' + ) - # This finds features that overlap BOTH ranges - # (i.e., features in the intersection: chr1:1500-2000) + -- This finds features that overlap BOTH ranges + -- (i.e., features in the intersection: chr1:1500-2000) Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ @@ -285,22 +271,6 @@ Related - :ref:`ANY ` - Match any range (logical OR) - :ref:`CONTAINS ` - Base containment operator -Choosing Between ANY and ALL ----------------------------- - -Use **ANY** when you want to find features that match at least one of several criteria: - -.. code-block:: python - - # Find variants in gene A OR gene B OR gene C - WHERE interval INTERSECTS ANY('gene_a_region', 'gene_b_region', 'gene_c_region') - -Use **ALL** when you want to find features that satisfy all criteria simultaneously: - -.. code-block:: python - - # Find features that contain ALL of these positions - WHERE interval CONTAINS ALL('pos1', 'pos2', 'pos3') Common Patterns --------------- @@ -309,24 +279,20 @@ Common Patterns Find features that don't overlap any blacklisted region: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM peaks - WHERE NOT interval INTERSECTS ANY( - 'chr1:1000000-2000000', -- Centromere - 'chr1:5000000-5500000' -- Known artifact region - ) - """) + SELECT * FROM peaks + WHERE NOT interval INTERSECTS ANY( + 'chr1:1000000-2000000', -- Centromere + 'chr1:5000000-5500000' -- Known artifact region + ) **Combining ANY and ALL:** Complex queries can combine both quantifiers: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT * FROM features - WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') - AND interval CONTAINS ALL('chr1:1100', 'chr1:1200') - """) + SELECT * FROM features + WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') + AND interval CONTAINS ALL('chr1:1100', 'chr1:1200') diff --git a/docs/operators/spatial-operators.rst b/docs/dialect/spatial-operators.rst similarity index 75% rename from docs/operators/spatial-operators.rst rename to docs/dialect/spatial-operators.rst index 6b48001..fa1c7be 100644 --- a/docs/operators/spatial-operators.rst +++ b/docs/dialect/spatial-operators.rst @@ -1,5 +1,5 @@ -Spatial Relationship Operators -============================== +Spatial Relationships +===================== Spatial relationship operators test positional relationships between genomic ranges. These are the core operators for determining whether genomic intervals overlap, @@ -7,7 +7,7 @@ contain, or are contained within other intervals. .. contents:: :local: - :depth: 2 + :depth: 1 .. _intersects-operator: @@ -46,7 +46,7 @@ Parameters ~~~~~~~~~~ **interval** - A genomic column registered with the engine via ``register_table_schema()``. + A genomic column from a registered table. **literal_range** A string literal specifying a genomic range in the format ``'chromosome:start-end'``. @@ -66,50 +66,42 @@ Examples Find all variants that overlap a specific genomic region: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' **Column-to-Column Joins:** Find variants that overlap with any gene: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT v.*, g.name AS gene_name - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - """) + SELECT v.*, g.name AS gene_name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval **With WHERE Clause:** Find overlapping features with additional filtering: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT v.*, g.name - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - WHERE v.quality >= 30 - AND g.biotype = 'protein_coding' - """) + SELECT v.*, g.name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE v.quality >= 30 + AND g.biotype = 'protein_coding' **Left Outer Join:** Find all variants, with gene information where available: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT v.*, g.name AS gene_name - FROM variants v - LEFT JOIN genes g ON v.interval INTERSECTS g.interval - """) + SELECT v.*, g.name AS gene_name + FROM variants v + LEFT JOIN genes g ON v.interval INTERSECTS g.interval Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ @@ -134,7 +126,7 @@ Backend Compatibility Performance Notes ~~~~~~~~~~~~~~~~~ -- Create indexes on ``(chromosome, start_pos, end_pos)`` for better join performance +- Create indexes on ``(chrom, start, "end")`` for better join performance - When joining large tables, consider filtering by chromosome first - The generated SQL uses efficient range comparison predicates @@ -183,7 +175,7 @@ Parameters ~~~~~~~~~~ **interval** - A genomic column registered with the engine. + A genomic column. **literal_range** A string literal specifying a genomic point or range. 
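Like ``INTERSECTS``, ``CONTAINS`` transpiles to plain coordinate comparisons.
A sketch of the generated predicate for a column-to-column containment test
between two tables aliased ``a`` and ``b``, using the default
``chrom``/``start``/``end`` schema (taken from the generator tests in this
series; formatting may differ):

.. code-block:: sql

   -- a.interval CONTAINS b.interval transpiles to roughly:
   a."chrom" = b."chrom"
   AND a."start" <= b."start"
   AND a."end" >= b."end"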
@@ -203,36 +195,30 @@ Examples Find genes that contain a specific position: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM genes - WHERE interval CONTAINS 'chr1:1500' - """) + SELECT * FROM genes + WHERE interval CONTAINS 'chr1:1500' **Range Containment:** Find large features that fully contain smaller features: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT g.name AS gene_name, e.name AS exon_name - FROM genes g - INNER JOIN exons e ON g.interval CONTAINS e.interval - """) + SELECT g.name AS gene_name, e.name AS exon_name + FROM genes g + INNER JOIN exons e ON g.interval CONTAINS e.interval **Filtering Fully Contained Variants:** Find variants that are completely within gene boundaries: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT v.* - FROM variants v - INNER JOIN genes g ON g.interval CONTAINS v.interval - """) + SELECT v.* + FROM variants v + INNER JOIN genes g ON g.interval CONTAINS v.interval Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ @@ -295,7 +281,7 @@ Parameters ~~~~~~~~~~ **interval** - A genomic column registered with the engine. + A genomic column. **literal_range** A string literal specifying the containing range. @@ -315,24 +301,20 @@ Examples Find all features within a specific genomic window: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM features - WHERE interval WITHIN 'chr1:1000000-2000000' - """) + SELECT * FROM features + WHERE interval WITHIN 'chr1:1000000-2000000' **Find Nested Features:** Find exons that are completely within their parent gene: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT e.*, g.name AS gene_name - FROM exons e - INNER JOIN genes g ON e.interval WITHIN g.interval - """) + SELECT e.*, g.name AS gene_name + FROM exons e + INNER JOIN genes g ON e.interval WITHIN g.interval Backend Compatibility ~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/reference/syntax-reference.rst b/docs/dialect/syntax-reference.rst similarity index 78% rename from docs/reference/syntax-reference.rst rename to docs/dialect/syntax-reference.rst index 48cfb14..0082e26 100644 --- a/docs/reference/syntax-reference.rst +++ b/docs/dialect/syntax-reference.rst @@ -5,7 +5,7 @@ Quick reference for GIQL syntax and operators. .. contents:: :local: - :depth: 2 + :depth: 1 Genomic Range Literals ---------------------- @@ -238,7 +238,7 @@ Exclusion (NOT IN) SELECT a.* FROM table_a a LEFT JOIN table_b b ON a.interval INTERSECTS b.interval - WHERE b.chromosome IS NULL + WHERE b.chrom IS NULL Count Overlaps ~~~~~~~~~~~~~~ @@ -248,7 +248,7 @@ Count Overlaps SELECT a.*, COUNT(b.name) AS overlap_count FROM table_a a LEFT JOIN table_b b ON a.interval INTERSECTS b.interval - GROUP BY a.chromosome, a.start_pos, a.end_pos, ... + GROUP BY a.chrom, a.start, a."end", ... K-Nearest Neighbors ~~~~~~~~~~~~~~~~~~~ @@ -266,7 +266,7 @@ Clustering SELECT *, CLUSTER(interval) AS cluster_id FROM table - ORDER BY chromosome, start_pos + ORDER BY chrom, start Merging ~~~~~~~ @@ -275,55 +275,3 @@ Merging SELECT MERGE(interval), COUNT(*) AS count FROM table - -Engine Methods --------------- - -execute() -~~~~~~~~~ - -Execute a GIQL query and return a cursor. - -.. code-block:: python - - cursor = engine.execute("SELECT * FROM table WHERE interval INTERSECTS 'chr1:1000-2000'") - -transpile() -~~~~~~~~~~~ - -Convert GIQL to SQL without executing. - -.. 
code-block:: python - - sql = engine.transpile("SELECT * FROM table WHERE interval INTERSECTS 'chr1:1000-2000'") - -register_table_schema() -~~~~~~~~~~~~~~~~~~~~~~~ - -Register a table's schema for genomic operations. - -.. code-block:: python - - engine.register_table_schema( - "table_name", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - }, - genomic_column="interval", - chromosome_column="chromosome", # optional, default: "chromosome" - start_column="start_pos", # optional, default: "start_pos" - end_column="end_pos", # optional, default: "end_pos" - ) - -load_csv() -~~~~~~~~~~ - -Load a CSV file into a table. - -.. code-block:: python - - engine.load_csv("table_name", "file.csv") - engine.load_csv("table_name", "file.tsv", delimiter="\t") diff --git a/docs/guides/engine.rst b/docs/guides/engine.rst new file mode 100644 index 0000000..71269be --- /dev/null +++ b/docs/guides/engine.rst @@ -0,0 +1,195 @@ +Execution engines +================= + +GIQL transpiles genomic queries to SQL that can be executed on any database +backend. This guide covers backend-specific considerations and tips. + +.. contents:: + :local: + :depth: 1 + +Supported Backends +------------------ + +GIQL generates SQL that works across database systems: + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Backend + - Status + - Best For + * - DuckDB + - Full Support + - Analytics, large datasets, in-memory processing + * - SQLite + - Full Support + - Lightweight, embedded, portable databases + * - PostgreSQL + - Planned + - Production deployments, shared databases + +Using with DuckDB +----------------- + +DuckDB is recommended for most genomic analysis use cases. It provides excellent +performance for analytical queries and handles large genomic datasets efficiently. + +.. code-block:: python + + import duckdb + from giql import transpile + + sql = transpile( + """ + SELECT * FROM features + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["features"], + ) + + conn = duckdb.connect() + conn.execute("CREATE TABLE features AS SELECT * FROM read_csv('features.bed', delim='\t')") + result = conn.execute(sql).fetchdf() + +**Advantages:** + +- Fast analytical query performance +- Efficient columnar storage +- Good support for large datasets +- Rich SQL feature set +- In-memory and persistent options + +Using with SQLite +----------------- + +SQLite is a lightweight, embedded database suitable for smaller datasets or +when portability is important. + +.. code-block:: python + + import sqlite3 + from giql import transpile + + sql = transpile( + """ + SELECT * FROM features + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["features"], + ) + + conn = sqlite3.connect("data.db") + cursor = conn.execute(sql) + for row in cursor: + print(row) + +**Advantages:** + +- Zero configuration +- Single-file database +- Widely compatible +- Small memory footprint + +Writing Portable Queries +------------------------ + +Query Compatibility +~~~~~~~~~~~~~~~~~~~ + +GIQL queries are portable across backends. The same GIQL query produces SQL +that works on any supported database: + +.. 
code-block:: python + + from giql import transpile + + query = """ + SELECT a.*, b.name AS gene + FROM variants a + JOIN genes b ON a.interval INTERSECTS b.interval + WHERE a.quality >= 30 + """ + + # Same GIQL query works for any backend + sql = transpile(query, tables=["variants", "genes"]) + +Backend-Specific Features +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some SQL features may only be available on certain backends: + +.. list-table:: + :header-rows: 1 + :widths: 40 20 20 20 + + * - Feature + - DuckDB + - SQLite + - Notes + * - Window functions + - Yes + - Yes + - Full support + * - CTEs (WITH clause) + - Yes + - Yes + - Full support + * - LATERAL joins + - Yes + - Limited + - Used by NEAREST + * - STRING_AGG + - Yes + - GROUP_CONCAT + - Different function names + +Performance Comparison +---------------------- + +Backend Performance Characteristics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 30 35 35 + + * - Operation + - DuckDB + - SQLite + * - Large table scans + - Excellent (columnar) + - Good + * - Complex joins + - Excellent + - Good + * - Aggregations + - Excellent + - Good + * - Small queries + - Good + - Excellent + * - Memory usage + - Higher + - Lower + * - Startup time + - Faster + - Fast + +Choosing the Right Backend +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Choose DuckDB when:** + +- Working with large datasets (millions of features) +- Running complex analytical queries +- Performing heavy aggregations +- Memory is not constrained + +**Choose SQLite when:** + +- Working with smaller datasets +- Need maximum portability +- Memory is constrained +- Simple query patterns diff --git a/docs/guides/index.rst b/docs/guides/index.rst index c3265be..b7644d1 100644 --- a/docs/guides/index.rst +++ b/docs/guides/index.rst @@ -6,27 +6,21 @@ and best practices for using GIQL effectively. .. toctree:: :maxdepth: 2 + :hidden: schema-mapping - multi-backend + engine performance - transpilation - -Guide Overview --------------- :doc:`schema-mapping` Learn how to configure GIQL to work with your genomic data, including - registering table schemas and mapping logical genomic columns. + table configuration and mapping logical genomic columns. -:doc:`multi-backend` - Understand GIQL's multi-database support and how to work with different - backends like DuckDB, SQLite, and PostgreSQL. +:doc:`engine` + Understand how to use GIQL's transpiled SQL with different + execution engines like DuckDB, SQLite, and PostgreSQL. :doc:`performance` Optimize your GIQL queries for better performance with indexing strategies, query patterns, and backend-specific tips. -:doc:`transpilation` - Understand how GIQL translates queries to SQL, debug query generation, - and integrate transpiled SQL with external tools. diff --git a/docs/guides/multi-backend.rst b/docs/guides/multi-backend.rst deleted file mode 100644 index ecc3799..0000000 --- a/docs/guides/multi-backend.rst +++ /dev/null @@ -1,367 +0,0 @@ -Multi-Backend Guide -=================== - -GIQL supports multiple database backends, allowing you to run the same genomic -queries against different database systems. This guide covers backend selection, -configuration, and backend-specific considerations. - -.. contents:: - :local: - :depth: 2 - -Supported Backends ------------------- - -GIQL currently supports the following database backends: - -.. 
list-table:: - :header-rows: 1 - :widths: 20 20 60 - - * - Backend - - Status - - Best For - * - DuckDB - - Full Support - - Analytics, large datasets, in-memory processing - * - SQLite - - Full Support - - Lightweight, embedded, portable databases - * - PostgreSQL - - Planned - - Production deployments, shared databases - -Selecting a Backend -------------------- - -DuckDB (Recommended) -~~~~~~~~~~~~~~~~~~~~ - -DuckDB is the recommended backend for most use cases. It provides excellent -performance for analytical queries and handles large genomic datasets efficiently. - -.. code-block:: python - - from giql import GIQLEngine - - # In-memory DuckDB (default) - with GIQLEngine(target_dialect="duckdb") as engine: - engine.load_csv("features", "features.bed") - # ... register schemas and query - - # Persistent DuckDB database - with GIQLEngine(target_dialect="duckdb", db_path="my_data.duckdb") as engine: - # Data persists between sessions - pass - -**Advantages:** - -- Fast analytical query performance -- Efficient columnar storage -- Good support for large datasets -- Rich SQL feature set -- In-memory and persistent options - -**Best for:** - -- Interactive analysis -- Large BED/VCF files -- Complex aggregations -- One-time analysis pipelines - -SQLite -~~~~~~ - -SQLite is a lightweight, embedded database suitable for smaller datasets or -when portability is important. - -.. code-block:: python - - # In-memory SQLite - with GIQLEngine(target_dialect="sqlite") as engine: - pass - - # Persistent SQLite database - with GIQLEngine(target_dialect="sqlite", db_path="my_data.db") as engine: - pass - -**Advantages:** - -- Zero configuration -- Single-file database -- Widely compatible -- Small memory footprint - -**Best for:** - -- Small to medium datasets -- Portable analysis -- Embedded applications -- Simple workflows - -Backend Configuration ---------------------- - -In-Memory vs Persistent -~~~~~~~~~~~~~~~~~~~~~~~ - -Both DuckDB and SQLite support in-memory and persistent modes: - -.. code-block:: python - - # In-memory (data lost when engine closes) - with GIQLEngine(target_dialect="duckdb") as engine: - engine.load_csv("features", "features.bed") - # Data exists only during this session - - # Persistent (data saved to disk) - with GIQLEngine(target_dialect="duckdb", db_path="analysis.duckdb") as engine: - engine.load_csv("features", "features.bed") - # Data persists after engine closes - - # Reopen persistent database - with GIQLEngine(target_dialect="duckdb", db_path="analysis.duckdb") as engine: - # Previous data is available - cursor = engine.execute("SELECT * FROM features LIMIT 5") - -Connection Options -~~~~~~~~~~~~~~~~~~ - -Pass additional connection options to the underlying database: - -.. code-block:: python - - # DuckDB with custom settings - with GIQLEngine( - target_dialect="duckdb", - db_path="analysis.duckdb", - read_only=False, - ) as engine: - pass - -Writing Portable Queries ------------------------- - -Query Compatibility -~~~~~~~~~~~~~~~~~~~ - -GIQL queries are portable across backends. The same query works on any -supported database: - -.. code-block:: python - - query = """ - SELECT a.*, b.name AS gene - FROM variants a - JOIN genes b ON a.interval INTERSECTS b.interval - WHERE a.quality >= 30 - """ - - # Works on DuckDB - with GIQLEngine(target_dialect="duckdb") as engine: - # ... setup ... - cursor = engine.execute(query) - - # Same query works on SQLite - with GIQLEngine(target_dialect="sqlite") as engine: - # ... setup ... 
- cursor = engine.execute(query) - -SQL Dialect Differences -~~~~~~~~~~~~~~~~~~~~~~~ - -While GIQL queries are portable, the generated SQL differs between backends. -Use ``transpile()`` to see the backend-specific SQL: - -.. code-block:: python - - query = "SELECT * FROM features WHERE interval INTERSECTS 'chr1:1000-2000'" - - # DuckDB SQL - with GIQLEngine(target_dialect="duckdb") as engine: - engine.register_table_schema("features", {...}, genomic_column="interval") - print(engine.transpile(query)) - - # SQLite SQL (may differ slightly) - with GIQLEngine(target_dialect="sqlite") as engine: - engine.register_table_schema("features", {...}, genomic_column="interval") - print(engine.transpile(query)) - -Backend-Specific Features -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Some SQL features may only be available on certain backends: - -.. list-table:: - :header-rows: 1 - :widths: 40 20 20 20 - - * - Feature - - DuckDB - - SQLite - - Notes - * - Window functions - - Yes - - Yes - - Full support - * - CTEs (WITH clause) - - Yes - - Yes - - Full support - * - LATERAL joins - - Yes - - Limited - - Used by NEAREST - * - STRING_AGG - - Yes - - GROUP_CONCAT - - Different function names - -Migrating Between Backends --------------------------- - -Exporting Data -~~~~~~~~~~~~~~ - -Export data from one backend for import into another: - -.. code-block:: python - - # Export from DuckDB - with GIQLEngine(target_dialect="duckdb", db_path="source.duckdb") as engine: - cursor = engine.execute("SELECT * FROM features") - import pandas as pd - df = pd.DataFrame(cursor.fetchall(), - columns=[desc[0] for desc in cursor.description]) - df.to_csv("features_export.csv", index=False) - - # Import to SQLite - with GIQLEngine(target_dialect="sqlite", db_path="target.db") as engine: - engine.load_csv("features", "features_export.csv") - engine.register_table_schema("features", {...}, genomic_column="interval") - -Schema Compatibility -~~~~~~~~~~~~~~~~~~~~ - -Ensure schema definitions work across backends: - -.. code-block:: python - - # Use portable type names - schema = { - "chromosome": "VARCHAR", # Works on all backends - "start_pos": "BIGINT", # Maps to appropriate integer type - "end_pos": "BIGINT", - "name": "VARCHAR", - "score": "FLOAT", # Maps to appropriate float type - } - - # Same schema works on both backends - for dialect in ["duckdb", "sqlite"]: - with GIQLEngine(target_dialect=dialect) as engine: - engine.load_csv("features", "features.csv") - engine.register_table_schema("features", schema, genomic_column="interval") - -Performance Comparison ----------------------- - -Backend Performance Characteristics -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
list-table:: - :header-rows: 1 - :widths: 30 35 35 - - * - Operation - - DuckDB - - SQLite - * - Large table scans - - Excellent (columnar) - - Good - * - Complex joins - - Excellent - - Good - * - Aggregations - - Excellent - - Good - * - Small queries - - Good - - Excellent - * - Memory usage - - Higher - - Lower - * - Startup time - - Faster - - Fast - -Choosing the Right Backend -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -**Choose DuckDB when:** - -- Working with large datasets (millions of features) -- Running complex analytical queries -- Performing heavy aggregations -- Memory is not constrained - -**Choose SQLite when:** - -- Working with smaller datasets -- Need maximum portability -- Memory is constrained -- Simple query patterns - -Using External Connections --------------------------- - -Connecting to Existing Databases -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Connect to databases created outside of GIQL: - -.. code-block:: python - - # Connect to existing DuckDB database - with GIQLEngine(target_dialect="duckdb", db_path="existing.duckdb") as engine: - # Register schemas for existing tables - engine.register_table_schema( - "my_existing_table", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - }, - genomic_column="interval", - ) - - # Query existing data with GIQL operators - cursor = engine.execute(""" - SELECT * FROM my_existing_table - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - -Using Transpiled SQL Externally -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Generate SQL for use with external database connections: - -.. code-block:: python - - import duckdb - - # Get transpiled SQL from GIQL - with GIQLEngine(target_dialect="duckdb") as engine: - engine.register_table_schema("features", {...}, genomic_column="interval") - sql = engine.transpile(""" - SELECT * FROM features - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - # Execute with external connection - conn = duckdb.connect("my_database.duckdb") - result = conn.execute(sql).fetchall() - conn.close() - -This is useful when integrating GIQL with existing database workflows or -when you need more control over the database connection. diff --git a/docs/guides/performance.rst b/docs/guides/performance.rst index c0c4e51..019416e 100644 --- a/docs/guides/performance.rst +++ b/docs/guides/performance.rst @@ -6,7 +6,7 @@ indexing, query patterns, and backend-specific optimizations. .. contents:: :local: - :depth: 2 + :depth: 1 Understanding Query Performance ------------------------------- @@ -14,11 +14,11 @@ Understanding Query Performance How GIQL Queries Execute ~~~~~~~~~~~~~~~~~~~~~~~~ -When you execute a GIQL query: +When you use GIQL: 1. GIQL parses the query and identifies genomic operators -2. Operators are expanded into standard SQL predicates -3. The SQL is sent to the database backend +2. Operators are expanded into SQL predicates +3. You execute the SQL on your database backend 4. The database executes the query using its optimizer Performance depends on both the generated SQL and how the database executes it. @@ -39,19 +39,11 @@ Creating Indexes Create indexes on genomic columns for faster queries: -.. code-block:: python - - # DuckDB - engine.conn.execute(""" - CREATE INDEX idx_features_position - ON features (chromosome, start_pos, end_pos) - """) +.. 
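code-block:: python

+
+   # A minimal sketch, assuming DuckDB and a ``features`` table with the
+   # default column names; a ``sqlite3`` connection accepts the same DDL.
+   import duckdb
+
+   conn = duckdb.connect("features.duckdb")
+   conn.execute(
+       'CREATE INDEX idx_features_position ON features (chrom, start, "end")'
+   )
+
+The DDL itself, on either backend:
+
+.. 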
code-block:: sql - # SQLite - engine.conn.execute(""" - CREATE INDEX idx_features_position - ON features (chromosome, start_pos, end_pos) - """) + -- DuckDB or SQLite + CREATE INDEX idx_features_position + ON features (chrom, start, "end") Recommended Index Patterns ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -60,21 +52,21 @@ Recommended Index Patterns .. code-block:: sql - CREATE INDEX idx_table_position ON table_name (chromosome, start_pos, end_pos) + CREATE INDEX idx_table_position ON table_name (chrom, start, "end") **For join queries:** .. code-block:: sql -- Index both tables involved in joins - CREATE INDEX idx_variants_position ON variants (chromosome, start_pos, end_pos) - CREATE INDEX idx_genes_position ON genes (chromosome, start_pos, end_pos) + CREATE INDEX idx_variants_position ON variants (chrom, start, "end") + CREATE INDEX idx_genes_position ON genes (chrom, start, "end") **For strand-specific queries:** .. code-block:: sql - CREATE INDEX idx_features_strand ON features (chromosome, strand, start_pos, end_pos) + CREATE INDEX idx_features_strand ON features (chrom, strand, start, "end") When to Create Indexes ~~~~~~~~~~~~~~~~~~~~~~ @@ -100,88 +92,55 @@ Pre-filter by Chromosome Always include chromosome filtering when joining tables: -.. code-block:: python +.. code-block:: sql - # Good: Explicit chromosome filter - cursor = engine.execute(""" - SELECT a.*, b.name - FROM features_a a - JOIN features_b b ON a.interval INTERSECTS b.interval - WHERE a.chromosome = 'chr1' - """) - - # Also good: Cross-chromosome join with implicit filtering - # GIQL handles this, but explicit is clearer - cursor = engine.execute(""" - SELECT a.*, b.name - FROM features_a a - JOIN features_b b ON a.interval INTERSECTS b.interval - AND a.chromosome = b.chromosome - """) + -- Good: Explicit chromosome filter + SELECT a.*, b.name + FROM features_a a + JOIN features_b b ON a.interval INTERSECTS b.interval + WHERE a.chrom = 'chr1' Use Selective Filters Early ~~~~~~~~~~~~~~~~~~~~~~~~~~~ Apply selective filters before joins: -.. code-block:: python +.. code-block:: sql - # Good: Filter before joining - cursor = engine.execute(""" - WITH filtered_variants AS ( - SELECT * FROM variants - WHERE quality >= 30 AND filter = 'PASS' - ) - SELECT f.*, g.name - FROM filtered_variants f - JOIN genes g ON f.interval INTERSECTS g.interval - """) - - # Less efficient: Filter after joining - cursor = engine.execute(""" - SELECT v.*, g.name - FROM variants v - JOIN genes g ON v.interval INTERSECTS g.interval - WHERE v.quality >= 30 AND v.filter = 'PASS' - """) + -- Good: Filter before joining + WITH filtered_variants AS ( + SELECT * FROM variants + WHERE quality >= 30 AND filter = 'PASS' + ) + SELECT f.*, g.name + FROM filtered_variants f + JOIN genes g ON f.interval INTERSECTS g.interval Limit Result Sets ~~~~~~~~~~~~~~~~~ Use LIMIT for exploratory queries: -.. code-block:: python +.. code-block:: sql - # Good: Limit results during exploration - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000000-2000000' - LIMIT 100 - """) + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000000-2000000' + LIMIT 100 Use DISTINCT Wisely ~~~~~~~~~~~~~~~~~~~ DISTINCT can be expensive. Only use when necessary: -.. code-block:: python +.. 
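code-block:: sql

+
+   -- Only use DISTINCT when you actually need unique rows
+   SELECT DISTINCT a.*
+   FROM features_a a
+   JOIN features_b b ON a.interval INTERSECTS b.interval
+
+.. 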
code-block:: sql - # Only use DISTINCT when you actually need unique rows - cursor = engine.execute(""" - SELECT DISTINCT a.* - FROM features_a a - JOIN features_b b ON a.interval INTERSECTS b.interval - """) - - # If you just need to check existence, use EXISTS instead - cursor = engine.execute(""" - SELECT a.* - FROM features_a a - WHERE EXISTS ( - SELECT 1 FROM features_b b - WHERE a.interval INTERSECTS b.interval - ) - """) + -- If you just need to check existence, use EXISTS instead + SELECT a.* + FROM features_a a + WHERE EXISTS ( + SELECT 1 FROM features_b b + WHERE a.interval INTERSECTS b.interval + ) NEAREST Query Optimization -------------------------- @@ -193,35 +152,32 @@ The NEAREST operator can be expensive for large datasets. Optimize with: **1. Use max_distance to limit search space:** -.. code-block:: python +.. code-block:: sql - # Good: Constrained search - cursor = engine.execute(""" - SELECT peaks.name, nearest.name, nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=5, - max_distance=100000 -- Only search within 100kb - ) AS nearest - """) + SELECT peaks.name, nearest.name, nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=100000 -- Only search within 100kb + ) AS nearest **2. Request only the k you need:** -.. code-block:: python +.. code-block:: sql - # Good: Request exactly what you need + -- Good: Request exactly what you need NEAREST(genes, reference=peaks.interval, k=3) - # Wasteful: Request more than needed + -- Wasteful: Request more than needed NEAREST(genes, reference=peaks.interval, k=100) **3. Index the target table:** .. code-block:: sql - CREATE INDEX idx_genes_position ON genes (chromosome, start_pos, end_pos) + CREATE INDEX idx_genes_position ON genes (chrom, start, "end") Merge and Cluster Optimization ------------------------------ @@ -231,34 +187,28 @@ Efficient Clustering For large datasets, consider pre-sorting: -.. code-block:: python +.. code-block:: sql - # Pre-sort data for clustering - cursor = engine.execute(""" - WITH sorted AS ( - SELECT * FROM features - ORDER BY chromosome, start_pos - ) - SELECT *, CLUSTER(interval) AS cluster_id - FROM sorted - """) + WITH sorted AS ( + SELECT * FROM features + ORDER BY chrom, start + ) + SELECT *, CLUSTER(interval) AS cluster_id + FROM sorted Efficient Merging ~~~~~~~~~~~~~~~~~ Filter before merging to reduce data volume: -.. code-block:: python +.. code-block:: sql - # Good: Filter first, then merge - cursor = engine.execute(""" - WITH filtered AS ( - SELECT * FROM features - WHERE score >= 10 - ) - SELECT MERGE(interval), COUNT(*) AS count - FROM filtered - """) + WITH filtered AS ( + SELECT * FROM features + WHERE score >= 10 + ) + SELECT MERGE(interval), COUNT(*) AS count + FROM filtered Analyzing Query Performance --------------------------- @@ -266,43 +216,24 @@ Analyzing Query Performance Using EXPLAIN ~~~~~~~~~~~~~ -Analyze query execution plans: +Analyze query execution plans by running EXPLAIN on the transpiled SQL: .. 
code-block:: python - # Get the transpiled SQL - sql = engine.transpile(""" + from giql import transpile + + sql = transpile( + """ SELECT a.*, b.name FROM variants a JOIN genes b ON a.interval INTERSECTS b.interval - """) - - # Analyze the execution plan - cursor = engine.execute(f"EXPLAIN {sql}") - for row in cursor: - print(row) + """, + tables=["variants", "genes"], + ) + # Run EXPLAIN on your database connection + # conn.execute(f"EXPLAIN {sql}") # DuckDB also supports EXPLAIN ANALYZE for actual timing - cursor = engine.execute(f"EXPLAIN ANALYZE {sql}") - -Timing Queries -~~~~~~~~~~~~~~ - -Measure query execution time: - -.. code-block:: python - - import time - - start = time.time() - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000000-2000000' - """) - results = cursor.fetchall() - elapsed = time.time() - start - - print(f"Query returned {len(results)} rows in {elapsed:.2f} seconds") Backend-Specific Tips --------------------- @@ -314,21 +245,12 @@ DuckDB Optimizations DuckDB is columnar, so queries that select few columns are faster: -.. code-block:: python - - # Faster: Select only needed columns - cursor = engine.execute(""" - SELECT chromosome, start_pos, end_pos, name - FROM features - WHERE interval INTERSECTS 'chr1:1000-2000' - """) +.. code-block:: sql - # Slower: Select all columns - cursor = engine.execute(""" - SELECT * - FROM features - WHERE interval INTERSECTS 'chr1:1000-2000' - """) + -- Faster: Select only needed columns + SELECT chrom, start, "end", name + FROM features + WHERE interval INTERSECTS 'chr1:1000-2000' **Parallel execution:** @@ -344,50 +266,13 @@ SQLite Optimizations -- Include commonly selected columns in the index CREATE INDEX idx_features_covering - ON features (chromosome, start_pos, end_pos, name, score) + ON features (chrom, start, "end", name, score) **Analyze tables:** -.. code-block:: python - - # Help SQLite's query planner - engine.conn.execute("ANALYZE features") - -Memory Management ------------------ - -Streaming Results -~~~~~~~~~~~~~~~~~ - -For large result sets, iterate instead of fetching all: - -.. code-block:: python - - # Good: Stream results - cursor = engine.execute("SELECT * FROM large_table") - for row in cursor: - process(row) - - # Memory-intensive: Fetch all at once - cursor = engine.execute("SELECT * FROM large_table") - all_rows = cursor.fetchall() # Loads everything into memory - -Batch Processing -~~~~~~~~~~~~~~~~ - -Process large datasets in batches: - -.. code-block:: python - - chromosomes = ['chr1', 'chr2', 'chr3', ...] # All chromosomes +.. code-block:: sql - for chrom in chromosomes: - cursor = engine.execute(f""" - SELECT * FROM features - WHERE chromosome = '{chrom}' - AND interval INTERSECTS '{chrom}:1-1000000' - """) - process_chromosome(cursor) + ANALYZE features Performance Checklist --------------------- @@ -396,13 +281,13 @@ Before running large queries, check: .. 
code-block:: text - □ Indexes created on genomic columns - □ Chromosome filtering included in joins - □ Selective filters applied early - □ LIMIT used for exploration - □ Only necessary columns selected - □ NEAREST queries use max_distance - □ Results streamed instead of fetched all at once + - Indexes created on genomic columns + - Chromosome filtering included in joins + - Selective filters applied early + - LIMIT used for exploration + - Only necessary columns selected + - NEAREST queries use max_distance + - Results streamed instead of fetched all at once Quick Wins ~~~~~~~~~~ diff --git a/docs/guides/quickstart.rst b/docs/guides/quickstart.rst new file mode 100644 index 0000000..ef7c3ae --- /dev/null +++ b/docs/guides/quickstart.rst @@ -0,0 +1,175 @@ +Quick Start +=========== + +GIQL provides a familiar SQL syntax for bioinformatics workflows, allowing +you to express complex genomic range operations without writing intricate +SQL expressions. GIQL queries read naturally, making your analysis code +easier to review and share. GIQL operators follow established conventions +around genomic spatial relationships, so the semantics are familiar and +predictable. + +- **Spatial operators**: INTERSECTS, CONTAINS, WITHIN for range relationships +- **Distance operators**: DISTANCE, NEAREST for proximity queries +- **Aggregation operators**: CLUSTER, MERGE for combining intervals +- **Set quantifiers**: ANY, ALL for multi-range queries +- **Range parsing**: Understands genomic range strings and coordinate systems +- **Transpilation**: Converts GIQL to standard SQL-92 compatible output for execution on any backend + +Installation +------------ + +Install GIQL using pip: + +.. code-block:: bash + + pip install giql + +Basic Usage +----------- + +Table Configuration +~~~~~~~~~~~~~~~~~~~ + +GIQL works with genomic data stored in tables with separate columns for chromosome, +start position, and end position. The default column names are: + +* **chrom**: Chromosome identifier (e.g., 'chr1', 'chr2', 'chrX') +* **start**: Start position of the genomic interval (0-based, inclusive) +* **end**: End position of the genomic interval (0-based, exclusive, half-open) +* **strand** (optional): Strand orientation ('+', '-', or '.') + +If your table uses the default column names, you can pass just the table name +as a string. For custom column names, use a ``Table`` object: + +.. code-block:: python + + from giql import Table, transpile + + # Default column names (chrom, start, end, strand) + sql = transpile(query, tables=["peaks"]) + + # Custom column names + sql = transpile( + query, + tables=[ + Table( + "variants", + genomic_col="interval", + chrom_col="chromosome", + start_col="start_pos", + end_col="end_pos", + ) + ], + ) + +After configuration, you can use the genomic pseudo-column (default: ``interval``) +in your GIQL queries, and the transpiler will automatically expand it to the +physical column comparisons. + +Query with DuckDB +~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import duckdb + from giql import transpile + + sql = transpile( + """ + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["variants"], + ) + + conn = duckdb.connect() + conn.execute("CREATE TABLE variants AS SELECT * FROM read_csv('variants.csv')") + df = conn.execute(sql).fetchdf() + +Query with SQLite +~~~~~~~~~~~~~~~~~ + +.. 
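code-block:: python

+
+   # A sketch of the transpilation step alone; the expanded SQL in the
+   # trailing comment is illustrative of the pattern, not exact output.
+   from giql import transpile
+
+   print(transpile(
+       "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'",
+       tables=["variants"],
+   ))
+   # SELECT * FROM variants
+   # WHERE "chrom" = 'chr1' AND "start" < 2000 AND "end" > 1000
+
+To run the same query end to end:
+
+.. 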
code-block:: python + + import sqlite3 + from giql import transpile + + sql = transpile( + """ + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["variants"], + ) + + conn = sqlite3.connect("data.db") + cursor = conn.execute(sql) + for row in cursor: + print(row) + +Spatial Operators +----------------- + +INTERSECTS +~~~~~~~~~~ + +Check if genomic ranges overlap: + +.. code-block:: sql + + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + +CONTAINS +~~~~~~~~ + +Check if a range contains a point or another range: + +.. code-block:: sql + + SELECT * FROM variants + WHERE interval CONTAINS 'chr1:1500' + +WITHIN +~~~~~~ + +Check if a range is within another range: + +.. code-block:: sql + + SELECT * FROM variants + WHERE interval WITHIN 'chr1:1000-5000' + +Set Quantifiers +--------------- + +ANY +~~~ + +Match any of the specified ranges: + +.. code-block:: sql + + SELECT * FROM variants + WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') + +ALL +~~~ + +Match all of the specified ranges: + +.. code-block:: sql + + SELECT * FROM variants + WHERE interval CONTAINS ALL('chr1:1500', 'chr1:1600') + +Column-to-Column Joins +---------------------- + +Join tables on genomic position: + +.. code-block:: sql + + SELECT v.*, g.name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval diff --git a/docs/guides/schema-mapping.rst b/docs/guides/schema-mapping.rst index f515695..43c580c 100644 --- a/docs/guides/schema-mapping.rst +++ b/docs/guides/schema-mapping.rst @@ -1,20 +1,20 @@ -Schema Mapping Guide -==================== +Schema Mapping +============== This guide explains how to configure GIQL to work with your genomic data by -registering table schemas and mapping logical genomic columns. +defining table configurations that map logical genomic columns to physical columns. .. contents:: :local: - :depth: 2 + :depth: 1 Understanding Schema Mapping ---------------------------- GIQL needs to know how your genomic data is structured in order to translate -genomic operators into SQL. This is done through schema registration, which -maps a logical "genomic column" (used in your queries) to the physical columns -in your database tables. +genomic operators into SQL. This is done through ``Table`` objects, which +map a logical "genomic column" (used in your queries) to the physical columns +in your files, data frames, or database tables. The Core Concept ~~~~~~~~~~~~~~~~ @@ -30,188 +30,126 @@ Behind the scenes, GIQL expands this to actual column comparisons: .. code-block:: sql SELECT * FROM variants - WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000 + WHERE "chrom" = 'chr1' AND "start" < 2000 AND "end" > 1000 -Schema registration tells GIQL which physical columns (``chromosome``, -``start_pos``, ``end_pos``) correspond to the logical ``interval`` column. +The ``Table`` configuration tells GIQL which physical columns (``chrom``, +``start``, ``end``) correspond to the logical ``interval`` column. -Registering Table Schemas -------------------------- +Configuring Tables +------------------ -Basic Registration -~~~~~~~~~~~~~~~~~~ +Basic Configuration +~~~~~~~~~~~~~~~~~~~ -Register a table schema using ``register_table_schema()``: +For tables that use the default column names (``chrom``, ``start``, ``end``, +``strand``), pass the table name as a string: .. 
code-block:: python - from giql import GIQLEngine - - with GIQLEngine(target_dialect="duckdb") as engine: - # Load data - engine.load_csv("variants", "variants.csv") - - # Register schema - engine.register_table_schema( - "variants", # Table name - { - "id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - "quality": "FLOAT", - }, - genomic_column="interval", # Logical column name for queries - ) - - # Now you can use 'interval' in queries - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - -Required Columns -~~~~~~~~~~~~~~~~ - -For schema registration, your table must have columns that map to: + from giql import transpile -- **chromosome**: The chromosome/contig identifier (e.g., 'chr1', 'chrX') -- **start_pos**: The start position of the genomic interval (0-based, inclusive) -- **end_pos**: The end position of the genomic interval (0-based, exclusive) - -GIQL looks for these column names by default. If your columns have different -names, see :ref:`custom-column-names`. - -Optional Strand Column -~~~~~~~~~~~~~~~~~~~~~~ + sql = transpile( + """ + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["variants"], + ) -If your data includes strand information, include it in the schema: +Default Columns +~~~~~~~~~~~~~~~ -.. code-block:: python +GIQL uses these default column names: - engine.register_table_schema( - "features", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "strand": "VARCHAR", # '+', '-', or '.' - "name": "VARCHAR", - }, - genomic_column="interval", - ) +- **chrom**: The chromosome/contig identifier (e.g., 'chr1', 'chrX') +- **start**: The start position of the genomic interval (0-based, inclusive) +- **end**: The end position of the genomic interval (0-based, exclusive) +- **strand**: Strand orientation ('+', '-', or '.'), optional -The strand column enables strand-specific operations in operators like -CLUSTER and NEAREST. +The default genomic pseudo-column name is ``interval``. .. _custom-column-names: Custom Column Names ~~~~~~~~~~~~~~~~~~~ -If your table uses different column names for genomic coordinates, specify -the mapping explicitly: +If your table uses different column names, create a ``Table`` object with +the mapping: .. code-block:: python - engine.register_table_schema( - "my_table", - { - "chrom": "VARCHAR", # Your chromosome column - "chromStart": "BIGINT", # Your start column (UCSC-style) - "chromEnd": "BIGINT", # Your end column - "name": "VARCHAR", - }, - genomic_column="interval", - chromosome_column="chrom", # Map to your column name - start_column="chromStart", # Map to your column name - end_column="chromEnd", # Map to your column name + from giql import Table, transpile + + sql = transpile( + """ + SELECT * FROM my_table + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=[ + Table( + "my_table", + chrom_col="chrom", # Your chromosome column + start_col="chromStart", # Your start column (UCSC-style) + end_col="chromEnd", # Your end column + ) + ], ) Multiple Tables --------------- -Register Multiple Tables -~~~~~~~~~~~~~~~~~~~~~~~~ +Configuring Multiple Tables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Register all tables that will participate in genomic queries: +Pass all tables that participate in genomic queries: .. 
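code-block:: python

+
+   # Sketch: list every table whose genomic pseudo-column appears in the
+   # query; a table omitted from ``tables`` is left unexpanded, which
+   # typically surfaces as an "unknown column" error at execution time.
+   from giql import transpile
+
+   sql = transpile(
+       "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'",
+       tables=["variants"],  # every queried table goes here
+   )
+
+For a join across two tables:
+
.. 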
code-block:: python - with GIQLEngine(target_dialect="duckdb") as engine: - # Load data files - engine.load_csv("variants", "variants.bed") - engine.load_csv("genes", "genes.bed") - engine.load_csv("regulatory", "regulatory.bed") - - # Define common schema - bed_schema = { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - "score": "FLOAT", - "strand": "VARCHAR", - } - - # Register each table - for table in ["variants", "genes", "regulatory"]: - engine.register_table_schema( - table, - bed_schema, - genomic_column="interval", - ) + from giql import transpile - # Now you can join tables using genomic operators - cursor = engine.execute(""" - SELECT v.*, g.name AS gene_name - FROM variants v - JOIN genes g ON v.interval INTERSECTS g.interval - """) + # Tables with default column names + sql = transpile( + """ + SELECT v.*, g.name AS gene_name + FROM variants v + JOIN genes g ON v.interval INTERSECTS g.interval + """, + tables=["variants", "genes"], + ) Different Schemas Per Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Tables can have different schemas and even different genomic column names: +Tables can have different column names and even different genomic column names. +Mix strings (for default columns) with ``Table`` objects (for custom columns): .. code-block:: python - # Variants table with VCF-style columns - engine.register_table_schema( - "variants", - { - "CHROM": "VARCHAR", - "POS": "BIGINT", - "END": "BIGINT", - "ID": "VARCHAR", - "QUAL": "FLOAT", - }, - genomic_column="var_interval", - chromosome_column="CHROM", - start_column="POS", - end_column="END", - ) - - # Genes table with BED-style columns - engine.register_table_schema( - "genes", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "gene_name": "VARCHAR", - "strand": "VARCHAR", - }, - genomic_column="gene_interval", - ) + from giql import Table, transpile - # Query using different genomic column names - cursor = engine.execute(""" + sql = transpile( + """ SELECT v.ID, g.gene_name FROM variants v JOIN genes g ON v.var_interval INTERSECTS g.gene_interval - """) + """, + tables=[ + # VCF-style columns + Table( + "variants", + genomic_col="var_interval", + chrom_col="CHROM", + start_col="POS", + end_col="END", + strand_col=None, + ), + # BED-style columns (defaults) + Table( + "genes", + genomic_col="gene_interval", + ), + ], + ) Coordinate Systems ------------------ @@ -219,7 +157,7 @@ Coordinate Systems Understanding BED Coordinates ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -GIQL uses the BED coordinate convention: +GIQL uses the BED coordinate convention by default: - **0-based start**: The first base of a chromosome is position 0 - **Half-open intervals**: Start is inclusive, end is exclusive @@ -227,35 +165,25 @@ GIQL uses the BED coordinate convention: Example: An interval ``chr1:100-200`` covers bases 100 through 199 (100 bases total). -Converting from 1-Based Coordinates -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Working with 1-Based Coordinates +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -If your data uses 1-based coordinates (like VCF or GFF), convert when loading: +If your data uses 1-based coordinates (like VCF or GFF), configure the +``Table`` accordingly: .. 
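code-block:: python

+
+   # For reference, the manual normalization this replaces: a 1-based,
+   # fully-closed interval maps to 0-based half-open coordinates.
+   start1, end1 = 1000, 1000       # 1-based, closed (e.g. a SNP)
+   start, end = start1 - 1, end1   # 0-based, half-open: [999, 1000)
+
+Rather than converting by hand, declare the source convention and let the
+transpiler account for it:
+
.. 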
code-block:: python - import pandas as pd - - # Load 1-based data - df = pd.read_csv("variants.vcf", sep="\t") - - # Convert to 0-based - df['start_pos'] = df['POS'] - 1 # Convert 1-based to 0-based - df['end_pos'] = df['POS'] # For SNPs, end = start + 1 + from giql import Table, transpile - # Load into engine - engine.conn.execute("CREATE TABLE variants AS SELECT * FROM df") - - # Register schema - engine.register_table_schema( - "variants", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - # ... other columns - }, - genomic_column="interval", + sql = transpile( + query, + tables=[ + Table( + "variants", + coordinate_system="1based", + interval_type="closed", + ) + ], ) Working with Point Features @@ -267,144 +195,9 @@ For point features (like SNPs), create an interval of length 1: # For a SNP at position 1000 (1-based) # 0-based interval: [999, 1000) - start_pos = 999 - end_pos = 1000 - -Data Types ----------- - -Recommended Column Types -~~~~~~~~~~~~~~~~~~~~~~~~ - -For optimal performance, use appropriate data types: - -.. list-table:: - :header-rows: 1 - :widths: 25 25 50 - - * - Column - - Recommended Type - - Notes - * - chromosome - - VARCHAR - - String type for chromosome names - * - start_pos - - BIGINT - - 64-bit integer for large genomes - * - end_pos - - BIGINT - - 64-bit integer for large genomes - * - strand - - VARCHAR(1) or CHAR(1) - - Single character: '+', '-', '.' - * - score - - FLOAT or DOUBLE - - Numeric scores - * - name - - VARCHAR - - Feature identifiers - -Type Compatibility -~~~~~~~~~~~~~~~~~~ - -GIQL schemas use SQL type names. Common mappings: - -.. list-table:: - :header-rows: 1 - :widths: 30 35 35 - - * - GIQL Schema Type - - DuckDB Type - - SQLite Type - * - INTEGER - - INTEGER - - INTEGER - * - BIGINT - - BIGINT - - INTEGER - * - VARCHAR - - VARCHAR - - TEXT - * - FLOAT - - FLOAT - - REAL - * - DOUBLE - - DOUBLE - - REAL - -Loading Data ------------- - -From CSV Files -~~~~~~~~~~~~~~ - -Load CSV files directly: - -.. code-block:: python - - engine.load_csv("features", "features.csv") - - # With custom options - engine.load_csv( - "features", - "features.tsv", - delimiter="\t", - header=True, - ) - -From Pandas DataFrames -~~~~~~~~~~~~~~~~~~~~~~ - -Load data from pandas: - -.. code-block:: python - - import pandas as pd - - df = pd.read_csv("features.bed", sep="\t", header=None, - names=["chromosome", "start_pos", "end_pos", "name"]) - - # Register the DataFrame as a table - engine.conn.execute("CREATE TABLE features AS SELECT * FROM df") - - # Then register the schema - engine.register_table_schema( - "features", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - }, - genomic_column="interval", - ) + start = 999 + end = 1000 -From Existing Database Tables -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If tables already exist in your database, just register their schemas: - -.. 
code-block:: python - - # Connect to existing database - with GIQLEngine(target_dialect="duckdb", db_path="my_database.duckdb") as engine: - # Register schemas for existing tables - engine.register_table_schema( - "existing_table", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - }, - genomic_column="interval", - ) - - # Query existing data - cursor = engine.execute(""" - SELECT * FROM existing_table - WHERE interval INTERSECTS 'chr1:1000-2000' - """) Troubleshooting --------------- @@ -414,32 +207,17 @@ Common Issues **"Unknown column" errors:** -- Ensure the table schema is registered before querying -- Check that the genomic column name in your query matches the registered name -- Verify column names in the schema match actual table columns +- Ensure the table is included in the ``tables`` parameter +- Check that the genomic column name in your query matches the configured name +- Verify column names in the ``Table`` object match actual table columns **Incorrect results:** - Verify your coordinate system (0-based vs 1-based) -- Check that start_pos < end_pos for all intervals +- Check that start < end for all intervals - Ensure chromosome names match between tables (e.g., 'chr1' vs '1') **Performance issues:** - See the :doc:`performance` guide for optimization tips -- Consider adding indexes on genomic columns - -Verifying Schema Registration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Check that schemas are registered correctly: - -.. code-block:: python - - # After registration, test with a simple query - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - print(sql) - # Should show expanded SQL with chromosome, start_pos, end_pos comparisons +- Consider adding indexes on genomic columns \ No newline at end of file diff --git a/docs/guides/transpilation.rst b/docs/guides/transpilation.rst deleted file mode 100644 index bd4c24a..0000000 --- a/docs/guides/transpilation.rst +++ /dev/null @@ -1,417 +0,0 @@ -Transpilation Guide -=================== - -GIQL works by transpiling genomic queries into standard SQL. This guide explains -how transpilation works, how to debug query generation, and how to use transpiled -SQL with external tools. - -.. contents:: - :local: - :depth: 2 - -How Transpilation Works ------------------------ - -The Transpilation Process -~~~~~~~~~~~~~~~~~~~~~~~~~ - -When you write a GIQL query: - -.. code-block:: sql - - SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000' - -GIQL performs these steps: - -1. **Parse**: Parse the SQL to identify GIQL-specific operators -2. **Expand**: Replace genomic operators with standard SQL predicates -3. **Generate**: Produce SQL for the target database dialect - -The result is standard SQL: - -.. code-block:: sql - - SELECT * FROM variants - WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000 - -Operator Expansion -~~~~~~~~~~~~~~~~~~ - -Each GIQL operator expands to specific SQL patterns: - -**INTERSECTS** expands to range overlap checks: - -.. code-block:: sql - - -- GIQL - a.interval INTERSECTS b.interval - - -- SQL (same chromosome, overlapping ranges) - a.chromosome = b.chromosome - AND a.start_pos < b.end_pos - AND a.end_pos > b.start_pos - -**CONTAINS** expands to containment checks: - -.. code-block:: sql - - -- GIQL - a.interval CONTAINS b.interval - - -- SQL - a.chromosome = b.chromosome - AND a.start_pos <= b.start_pos - AND a.end_pos >= b.end_pos - -**DISTANCE** expands to gap calculations: - -.. 
code-block:: sql - - -- GIQL - DISTANCE(a.interval, b.interval) - - -- SQL (simplified) - CASE - WHEN a.chromosome != b.chromosome THEN NULL - WHEN a.end_pos <= b.start_pos THEN b.start_pos - a.end_pos - WHEN b.end_pos <= a.start_pos THEN a.start_pos - b.end_pos - ELSE 0 - END - -Using the Transpile Method --------------------------- - -Basic Transpilation -~~~~~~~~~~~~~~~~~~~ - -Use ``transpile()`` to see generated SQL without executing: - -.. code-block:: python - - from giql import GIQLEngine - - with GIQLEngine(target_dialect="duckdb") as engine: - engine.register_table_schema( - "variants", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - }, - genomic_column="interval", - ) - - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - print(sql) - # Output: SELECT * FROM variants - # WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000 - -Transpiling Complex Queries -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Transpilation works with all GIQL features: - -.. code-block:: python - - # Join query - sql = engine.transpile(""" - SELECT v.*, g.name AS gene_name - FROM variants v - JOIN genes g ON v.interval INTERSECTS g.interval - WHERE v.quality >= 30 - """) - print(sql) - - # NEAREST query - sql = engine.transpile(""" - SELECT peaks.name, nearest.name, nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=5) AS nearest - """) - print(sql) - - # Aggregation query - sql = engine.transpile(""" - SELECT MERGE(interval), COUNT(*) AS count - FROM features - """) - print(sql) - -Debugging with Transpilation ----------------------------- - -Understanding Query Expansion -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Use transpilation to understand what GIQL does: - -.. code-block:: python - - # See how ANY quantifier expands - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000') - """) - print(sql) - # Shows the OR conditions for each range - - # See how join conditions expand - sql = engine.transpile(""" - SELECT a.*, b.name - FROM features_a a - JOIN features_b b ON a.interval INTERSECTS b.interval - """) - print(sql) - # Shows the full range comparison predicates - -Verbose Mode -~~~~~~~~~~~~ - -Enable verbose mode for detailed transpilation information: - -.. code-block:: python - - with GIQLEngine(target_dialect="duckdb", verbose=True) as engine: - engine.register_table_schema("variants", {...}, genomic_column="interval") - - # Transpilation will print detailed information - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - # Execution also shows transpilation details - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - -Troubleshooting Transpilation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -**Query not expanding correctly:** - -.. code-block:: python - - # Check that schema is registered - sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'") - if "interval INTERSECTS" in sql: - print("Schema not registered for 'variants' table") - -**Wrong column names in output:** - -.. 
code-block:: python - - # Verify column mapping - engine.register_table_schema( - "variants", - {...}, - genomic_column="interval", - chromosome_column="chrom", # Check these match your table - start_column="start", - end_column="end", - ) - -Comparing Dialects ------------------- - -Same Query, Different SQL -~~~~~~~~~~~~~~~~~~~~~~~~~ - -See how the same query translates for different backends: - -.. code-block:: python - - query = """ - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """ - - schema = { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - } - - # DuckDB - with GIQLEngine(target_dialect="duckdb") as engine: - engine.register_table_schema("variants", schema, genomic_column="interval") - print("DuckDB SQL:") - print(engine.transpile(query)) - print() - - # SQLite - with GIQLEngine(target_dialect="sqlite") as engine: - engine.register_table_schema("variants", schema, genomic_column="interval") - print("SQLite SQL:") - print(engine.transpile(query)) - -Dialect-Specific Differences -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Some queries may generate different SQL for different dialects: - -- String functions may use different names -- Type casting syntax may vary -- Window function support may differ - -GIQL handles these differences automatically, but understanding them helps -when debugging or integrating with external tools. - -Using Transpiled SQL Externally -------------------------------- - -With External Database Connections -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Use transpiled SQL with your own database connections: - -.. code-block:: python - - import duckdb - - # Generate SQL using GIQL - with GIQLEngine(target_dialect="duckdb") as giql_engine: - giql_engine.register_table_schema("variants", {...}, genomic_column="interval") - sql = giql_engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - # Execute with external connection - conn = duckdb.connect("my_database.duckdb") - result = conn.execute(sql).fetchall() - conn.close() - -With ORMs and Query Builders -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Integrate transpiled SQL with SQLAlchemy or other ORMs: - -.. code-block:: python - - from sqlalchemy import create_engine, text - - # Generate SQL - with GIQLEngine(target_dialect="duckdb") as giql_engine: - giql_engine.register_table_schema("variants", {...}, genomic_column="interval") - sql = giql_engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - # Execute with SQLAlchemy - sa_engine = create_engine("duckdb:///my_database.duckdb") - with sa_engine.connect() as conn: - result = conn.execute(text(sql)) - for row in result: - print(row) - -Building SQL Pipelines -~~~~~~~~~~~~~~~~~~~~~~ - -Use transpilation in data pipelines: - -.. 
code-block:: python - - def build_intersection_query(table_a, table_b, region): - """Generate SQL for intersection query.""" - with GIQLEngine(target_dialect="duckdb") as engine: - engine.register_table_schema(table_a, {...}, genomic_column="interval") - engine.register_table_schema(table_b, {...}, genomic_column="interval") - - return engine.transpile(f""" - SELECT a.*, b.name - FROM {table_a} a - JOIN {table_b} b ON a.interval INTERSECTS b.interval - WHERE a.interval INTERSECTS '{region}' - """) - - # Use in pipeline - sql = build_intersection_query("variants", "genes", "chr1:1000000-2000000") - # Execute sql with your preferred method - -Saving Queries -~~~~~~~~~~~~~~ - -Save transpiled SQL for documentation or reuse: - -.. code-block:: python - - # Generate and save SQL - with GIQLEngine(target_dialect="duckdb") as engine: - engine.register_table_schema("variants", {...}, genomic_column="interval") - - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - with open("query.sql", "w") as f: - f.write(sql) - - # Later, execute saved SQL - with open("query.sql") as f: - sql = f.read() - - conn = duckdb.connect("database.duckdb") - result = conn.execute(sql).fetchall() - -Advanced Transpilation ----------------------- - -Parameterized Queries -~~~~~~~~~~~~~~~~~~~~~ - -Build queries with parameters: - -.. code-block:: python - - def query_region(engine, chrom, start, end): - """Query a parameterized region.""" - region = f"{chrom}:{start}-{end}" - return engine.execute(f""" - SELECT * FROM variants - WHERE interval INTERSECTS '{region}' - """) - - # Use with different regions - cursor = query_region(engine, "chr1", 1000000, 2000000) - cursor = query_region(engine, "chr2", 5000000, 6000000) - -Dynamic Query Building -~~~~~~~~~~~~~~~~~~~~~~ - -Build queries programmatically: - -.. code-block:: python - - def build_multi_table_query(tables, target_region): - """Build a query that unions results from multiple tables.""" - union_parts = [] - for table in tables: - union_parts.append(f""" - SELECT *, '{table}' AS source FROM {table} - WHERE interval INTERSECTS '{target_region}' - """) - - query = " UNION ALL ".join(union_parts) - return engine.transpile(query) - -Inspecting the AST -~~~~~~~~~~~~~~~~~~ - -For advanced debugging, you can inspect the parsed query: - -.. code-block:: python - - # GIQL uses sqlglot internally - # The transpiled SQL shows the final result - sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'") - - # For deep debugging, examine the generated SQL structure - print(sql) diff --git a/docs/index.rst b/docs/index.rst index 9918a00..b595529 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,140 +1,49 @@ -GIQL - Genomic Interval Query Language +Genomic Interval Query Language (GIQL) ====================================== -**GIQL** is a SQL dialect for genomic range queries with multi-database support. +**GIQL** is an extended SQL dialect that allows you to declaratively express genomic interval operations. -Genomic analysis often requires repetitive, complex SQL patterns to express simple operations like finding overlapping intervals or merging features. GIQL extends SQL with dedicated operators for these common tasks, so you can declaratively express *what* you want to compute without getting lost in SQL boilerplate. GIQL queries read naturally, even without SQL expertise - this clarity makes your analysis code easier to review and share. 
Best of all, GIQL queries work across DuckDB, SQLite, PostgreSQL, and other databases, so you're never locked into a specific engine and can choose the tool that fits your use case. Finally, GIQL operators follow established conventions from tools like bedtools, so the semantics are familiar and predictable.
+Dialect
+-------
+GIQL extends the SQL query language with dedicated constructs for common
+genomic interval tasks, allowing you to declare *what* you want to compute
+rather than *how*. Whether you're filtering variants by genomic region,
+finding overlapping features, or calculating distances between intervals,
+GIQL makes these operations intuitive and portable.
 
 .. toctree::
-   :maxdepth: 2
-   :caption: Getting Started
+   :maxdepth: 1
+   :caption: Dialect
 
-   quickstart
+   dialect/index
+   dialect/syntax-reference
 
-.. toctree::
-   :maxdepth: 2
-   :caption: Operator Reference
-
-   operators/index
+Transpilation
+-------------
+The ``giql`` package *transpiles* queries written in GIQL to regular SQL
+for use in existing database systems and analytics engines.
 
 .. toctree::
-   :maxdepth: 2
-   :caption: Guides
+   :maxdepth: 1
+   :caption: Transpilation
 
-   guides/index
+   transpilation/index
+   transpilation/execution
+   transpilation/api-reference
 
-.. toctree::
-   :maxdepth: 2
-   :caption: Recipes
 
-   recipes/index
+Learn more
+----------
+See the following guides to learn how to use GIQL effectively:
 
 .. toctree::
-   :maxdepth: 2
-   :caption: Reference
-
-   reference/syntax-reference
-   api/index
-
-Quick Start
-----------
-
-Install GIQL:
-
-.. code-block:: bash
-
-   pip install giql
-
-Basic usage:
-
-.. code-block:: python
-
-   from giql import GIQLEngine
-
-   # Create engine with DuckDB backend
-   with GIQLEngine(target_dialect="duckdb") as engine:
-       # Load genomic data
-       engine.load_csv("variants", "variants.csv")
-       engine.register_table_schema(
-           "variants",
-           {
-               "id": "INTEGER",
-               "chromosome": "VARCHAR",
-               "start_pos": "BIGINT",
-               "end_pos": "BIGINT",
-           },
-           genomic_column="interval",
-       )
-
-       # Query with genomic operators (returns cursor for streaming)
-       cursor = engine.execute("""
-           SELECT * FROM variants
-           WHERE interval INTERSECTS 'chr1:1000-2000'
-       """)
-
-       # Process results
-       for row in cursor:
-           print(row)
-
-       # Or just transpile to SQL without executing
-       sql = engine.transpile("""
-           SELECT * FROM variants
-           WHERE interval INTERSECTS 'chr1:1000-2000'
-       """)
-       print(sql)  # See the generated SQL
-
-Features
--------
+   :maxdepth: 1
+   :caption: Guides and Recipes
 
-* **SQL-based**: Familiar SQL syntax with genomic extensions
-* **Multi-backend**: Works with DuckDB, SQLite, and more
-* **Spatial operators**: INTERSECTS, CONTAINS, WITHIN, DISTANCE, NEAREST
-* **Aggregation operators**: CLUSTER, MERGE for combining intervals
-* **Set quantifiers**: ANY, ALL for multi-range queries
-* **Column-to-column joins**: Join tables on genomic position
-* **Transpilation**: Convert GIQL to standard SQL for debugging or external use
-
-Operators at a Glance
---------------------
-
-**Spatial Relationships:**
-
-.. code-block:: sql
-
-   -- Find overlapping features
-   WHERE interval INTERSECTS 'chr1:1000-2000'
-
-   -- Find containing/contained features
-   WHERE gene.interval CONTAINS variant.interval
-
-**Distance and Proximity:**
-
-.. code-block:: sql
-
-   -- Calculate distance between intervals
-   SELECT DISTANCE(a.interval, b.interval) AS dist
-
-   -- Find k-nearest neighbors
-   FROM peaks CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=5)
-
-**Aggregation:**
-
-.. 
code-block:: sql - - -- Cluster overlapping intervals - SELECT *, CLUSTER(interval) AS cluster_id FROM features - - -- Merge overlapping intervals - SELECT MERGE(interval) FROM features - -**Set Quantifiers:** - -.. code-block:: sql - - -- Match any of multiple regions - WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr2:5000-6000') + guides/quickstart + guides/index + recipes/index -See :doc:`operators/index` for complete operator documentation. Indices and tables ================== diff --git a/docs/quickstart.rst b/docs/quickstart.rst deleted file mode 100644 index 9560c34..0000000 --- a/docs/quickstart.rst +++ /dev/null @@ -1,228 +0,0 @@ -Quick Start -=========== - -Installation ------------- - -Install GIQL using pip: - -.. code-block:: bash - - pip install giql - -Or with optional dependencies: - -.. code-block:: bash - - pip install giql[duckdb] # For DuckDB support - -Basic Usage ------------ - -Expected Schema -~~~~~~~~~~~~~~~ - -GIQL works with genomic data stored in tables with separate columns for chromosome, -start position, and end position. The typical schema includes: - -* **chromosome**: Chromosome identifier (e.g., 'chr1', 'chr2', 'chrX') -* **start_pos**: Start position of the genomic interval (0-based, inclusive) -* **end_pos**: End position of the genomic interval (0-based, exclusive, half-open) -* **strand** (optional): Strand orientation ('+', '-', or '.') - -You must register the table schema with GIQL, mapping the logical genomic column -(used in queries) to the physical columns in your table: - -.. code-block:: python - - engine.register_table_schema( - "table_name", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "strand": "VARCHAR", # Optional - # ... other columns ... - }, - genomic_column="interval", # Logical name used in queries - ) - -After registration, you can use ``interval`` in your GIQL queries, and the engine -will automatically map it to the ``chromosome``, ``start_pos``, and ``end_pos`` -columns. - -Query with DuckDB -~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - from giql import GIQLEngine - - with GIQLEngine(target_dialect="duckdb") as engine: - # Load CSV file into database - engine.load_csv("variants", "variants.csv") - - # Register schema mapping - engine.register_table_schema( - "variants", - { - "id": "INTEGER", - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - }, - genomic_column="interval", - ) - - # Query using the logical 'interval' column (returns cursor for streaming) - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - # Process results lazily - for row in cursor: - print(row) - - # Or materialize to pandas DataFrame - import pandas as pd - cursor = engine.execute("SELECT ...") - df = pd.DataFrame(cursor.fetchall(), columns=[desc[0] for desc in cursor.description]) - -Query with SQLite -~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - from giql import GIQLEngine - - with GIQLEngine(target_dialect="sqlite", db_path="data.db") as engine: - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - # Iterate results - for row in cursor: - print(row) - -Spatial Operators ------------------ - -INTERSECTS -~~~~~~~~~~ - -Check if genomic ranges overlap: - -.. code-block:: sql - - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - -CONTAINS -~~~~~~~~ - -Check if a range contains a point or another range: - -.. 
code-block:: sql - - SELECT * FROM variants - WHERE interval CONTAINS 'chr1:1500' - -WITHIN -~~~~~~ - -Check if a range is within another range: - -.. code-block:: sql - - SELECT * FROM variants - WHERE interval WITHIN 'chr1:1000-5000' - -Set Quantifiers ---------------- - -ANY -~~~ - -Match any of the specified ranges: - -.. code-block:: sql - - SELECT * FROM variants - WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') - -ALL -~~~ - -Match all of the specified ranges: - -.. code-block:: sql - - SELECT * FROM variants - WHERE interval CONTAINS ALL('chr1:1500', 'chr1:1600') - -Column-to-Column Joins ----------------------- - -Join tables on genomic position: - -.. code-block:: sql - - SELECT v.*, g.name - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - -Transpiling to SQL ------------------- - -The ``transpile()`` method converts GIQL queries to standard SQL without executing them. -This is useful for debugging, understanding the generated SQL, or integrating with external tools: - -.. code-block:: python - - from giql import GIQLEngine - - with GIQLEngine(target_dialect="duckdb") as engine: - # Register table schema - engine.register_table_schema( - "variants", - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - }, - genomic_column="interval", - ) - - # Transpile GIQL to SQL - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - - print(sql) - # Output: SELECT * FROM variants WHERE chromosome = 'chr1' AND start_pos < 2000 AND end_pos > 1000 - -Different target dialects generate different SQL: - -.. code-block:: python - - # DuckDB dialect - with GIQLEngine(target_dialect="duckdb") as engine: - sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'") - # Generates DuckDB-optimized SQL - - # SQLite dialect - with GIQLEngine(target_dialect="sqlite") as engine: - sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'") - # Generates SQLite-compatible SQL - -The transpiled SQL can be executed directly on your database or used with other tools. -Use ``verbose=True`` when creating the engine to see detailed transpilation information: - -.. code-block:: python - - with GIQLEngine(target_dialect="duckdb", verbose=True) as engine: - sql = engine.transpile("SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'") - # Prints detailed information about the transpilation process diff --git a/docs/recipes/advanced-queries.rst b/docs/recipes/advanced-queries.rst index 2aaf944..62147f6 100644 --- a/docs/recipes/advanced-queries.rst +++ b/docs/recipes/advanced-queries.rst @@ -16,16 +16,14 @@ Match Any of Multiple Regions Find features overlapping any of several regions of interest: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS ANY( - 'chr1:1000000-2000000', - 'chr1:5000000-6000000', - 'chr2:1000000-3000000' - ) - """) + SELECT * FROM variants + WHERE interval INTERSECTS ANY( + 'chr1:1000000-2000000', + 'chr1:5000000-6000000', + 'chr2:1000000-3000000' + ) **Use case:** Query multiple regions of interest in a single statement. @@ -34,16 +32,14 @@ Match All of Multiple Points Find features containing all specified positions: -.. code-block:: python +.. 
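code-block:: sql
+
+   -- Editor's sketch (not part of the original recipe): ALL(...) acts as a
+   -- conjunction, so the same filter can be spelled as explicit ANDs.
+   SELECT * FROM genes
+   WHERE interval CONTAINS 'chr1:1500'
+     AND interval CONTAINS 'chr1:1600'
+     AND interval CONTAINS 'chr1:1700'
+
+The ``ALL`` quantifier expresses the same predicate more compactly:
+
+.. 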
code-block:: sql - cursor = engine.execute(""" - SELECT * FROM genes - WHERE interval CONTAINS ALL( - 'chr1:1500', - 'chr1:1600', - 'chr1:1700' - ) - """) + SELECT * FROM genes + WHERE interval CONTAINS ALL( + 'chr1:1500', + 'chr1:1600', + 'chr1:1700' + ) **Use case:** Find genes spanning a set of SNP positions. @@ -52,16 +48,14 @@ Exclude Multiple Regions Find features that don't overlap any blacklisted region: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM peaks - WHERE NOT interval INTERSECTS ANY( - 'chr1:120000000-125000000', -- Centromere region - 'chr1:140000000-142000000', -- Known artifact - 'chrM:1-16569' -- Mitochondrial - ) - """) + SELECT * FROM peaks + WHERE NOT interval INTERSECTS ANY( + 'chr1:120000000-125000000', -- Centromere region + 'chr1:140000000-142000000', -- Known artifact + 'chrM:1-16569' -- Mitochondrial + ) **Use case:** Filter out features in problematic genomic regions. @@ -70,13 +64,11 @@ Combine ANY and ALL Complex multi-range logic: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM features - WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') - AND interval CONTAINS ALL('chr1:1100', 'chr1:1200') - """) + SELECT * FROM features + WHERE interval INTERSECTS ANY('chr1:1000-2000', 'chr1:5000-6000') + AND interval CONTAINS ALL('chr1:1100', 'chr1:1200') **Use case:** Find features matching complex spatial criteria. @@ -88,18 +80,16 @@ Multi-Attribute Filtering Combine spatial and attribute filters: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT v.*, g.name AS gene_name, g.biotype - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - WHERE v.quality >= 30 - AND v.filter = 'PASS' - AND v.allele_frequency > 0.01 - AND g.biotype = 'protein_coding' - ORDER BY v.chromosome, v.start_pos - """) + SELECT v.*, g.name AS gene_name, g.biotype + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE v.quality >= 30 + AND v.filter = 'PASS' + AND v.allele_frequency > 0.01 + AND g.biotype = 'protein_coding' + ORDER BY v.chrom, v.start **Use case:** Extract high-quality variants in protein-coding genes. @@ -108,18 +98,16 @@ Target Gene Lists Filter to specific genes of interest: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT v.*, g.name AS gene_name - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - WHERE g.name IN ( - 'BRCA1', 'BRCA2', 'TP53', 'EGFR', 'KRAS', - 'BRAF', 'PIK3CA', 'PTEN', 'APC', 'ATM' - ) - ORDER BY g.name, v.start_pos - """) + SELECT v.*, g.name AS gene_name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE g.name IN ( + 'BRCA1', 'BRCA2', 'TP53', 'EGFR', 'KRAS', + 'BRAF', 'PIK3CA', 'PTEN', 'APC', 'ATM' + ) + ORDER BY g.name, v.start **Use case:** Extract variants in clinically actionable genes. @@ -128,22 +116,20 @@ Conditional Logic Apply different criteria based on feature type: -.. code-block:: python - - cursor = engine.execute(""" - SELECT v.*, g.name, g.biotype, - CASE - WHEN g.biotype = 'protein_coding' THEN 'coding' - WHEN g.biotype LIKE '%RNA%' THEN 'noncoding_RNA' - ELSE 'other' - END AS gene_category - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - WHERE CASE - WHEN g.biotype = 'protein_coding' THEN v.quality >= 30 - ELSE v.quality >= 20 - END - """) +.. 
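code-block:: sql
+
+   -- Editor's sketch (added for contrast; not in the original recipe): without
+   -- CASE, each quality rule needs its own query arm.
+   SELECT v.*, g.name, g.biotype
+   FROM variants v
+   INNER JOIN genes g ON v.interval INTERSECTS g.interval
+   WHERE g.biotype = 'protein_coding' AND v.quality >= 30
+   UNION ALL
+   SELECT v.*, g.name, g.biotype
+   FROM variants v
+   INNER JOIN genes g ON v.interval INTERSECTS g.interval
+   WHERE g.biotype != 'protein_coding' AND v.quality >= 20
+
+A single ``CASE`` expression folds the branches into one pass:
+
+.. 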
code-block:: sql + + SELECT v.*, g.name, g.biotype, + CASE + WHEN g.biotype = 'protein_coding' THEN 'coding' + WHEN g.biotype LIKE '%RNA%' THEN 'noncoding_RNA' + ELSE 'other' + END AS gene_category + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE CASE + WHEN g.biotype = 'protein_coding' THEN v.quality >= 30 + ELSE v.quality >= 20 + END **Use case:** Apply different quality thresholds based on genomic context. @@ -155,19 +141,17 @@ Per-Chromosome Statistics Calculate summary statistics by chromosome: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - a.chromosome, - COUNT(DISTINCT a.name) AS total_features, - COUNT(b.name) AS total_overlaps, - COUNT(DISTINCT CASE WHEN b.name IS NOT NULL THEN a.name END) AS features_with_overlap - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - GROUP BY a.chromosome - ORDER BY a.chromosome - """) + SELECT + a.chrom, + COUNT(DISTINCT a.name) AS total_features, + COUNT(b.name) AS total_overlaps, + COUNT(DISTINCT CASE WHEN b.name IS NOT NULL THEN a.name END) AS features_with_overlap + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chrom + ORDER BY a.chrom **Use case:** Compare feature distribution across chromosomes. @@ -176,19 +160,17 @@ Overlap Statistics Calculate overlap metrics: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - a.chromosome, - COUNT(*) AS overlap_count, - AVG(LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS avg_overlap_bp, - SUM(LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS total_overlap_bp - FROM features_a a - INNER JOIN features_b b ON a.interval INTERSECTS b.interval - GROUP BY a.chromosome - ORDER BY a.chromosome - """) + SELECT + a.chrom, + COUNT(*) AS overlap_count, + AVG(LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS avg_overlap_bp, + SUM(LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS total_overlap_bp + FROM features_a a + INNER JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chrom + ORDER BY a.chrom **Use case:** Quantify overlap patterns across the genome. @@ -197,19 +179,17 @@ Feature Size Distribution Analyze feature sizes by category: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - biotype, - COUNT(*) AS count, - AVG(end_pos - start_pos) AS avg_length, - MIN(end_pos - start_pos) AS min_length, - MAX(end_pos - start_pos) AS max_length - FROM genes - GROUP BY biotype - ORDER BY count DESC - """) + SELECT + biotype, + COUNT(*) AS count, + AVG(end - start) AS avg_length, + MIN(end - start) AS min_length, + MAX(end - start) AS max_length + FROM genes + GROUP BY biotype + ORDER BY count DESC **Use case:** Compare size distributions across feature types. @@ -221,14 +201,12 @@ Three-Way Intersection Find features overlapping in all three tables: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT DISTINCT a.* - FROM features_a a - INNER JOIN features_b b ON a.interval INTERSECTS b.interval - INNER JOIN features_c c ON a.interval INTERSECTS c.interval - """) + SELECT DISTINCT a.* + FROM features_a a + INNER JOIN features_b b ON a.interval INTERSECTS b.interval + INNER JOIN features_c c ON a.interval INTERSECTS c.interval **Use case:** Find consensus regions across multiple datasets. @@ -237,19 +215,17 @@ Hierarchical Annotations Join multiple annotation levels: -.. code-block:: python +.. 
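code-block:: python
+.. code-block:: sql
+
+   -- Editor's sketch (not in the original recipe): a single INTERSECTS join
+   -- annotates each variant with its overlapping exon only.
+   SELECT v.name AS variant, e.name AS exon
+   FROM variants v
+   INNER JOIN exons e ON v.interval INTERSECTS e.interval
+
+Chaining ``WITHIN`` joins extends the annotation up the feature hierarchy:
+
+.. 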
code-block:: sql
-
-    cursor = engine.execute("""
-        SELECT
-            v.name AS variant,
-            e.name AS exon,
-            t.name AS transcript,
-            g.name AS gene
-        FROM variants v
-        INNER JOIN exons e ON v.interval INTERSECTS e.interval
-        INNER JOIN transcripts t ON e.interval WITHIN t.interval
-        INNER JOIN genes g ON t.interval WITHIN g.interval
-    """)
+   SELECT
+       v.name AS variant,
+       e.name AS exon,
+       t.name AS transcript,
+       g.name AS gene
+   FROM variants v
+   INNER JOIN exons e ON v.interval INTERSECTS e.interval
+   INNER JOIN transcripts t ON e.interval WITHIN t.interval
+   INNER JOIN genes g ON t.interval WITHIN g.interval

**Use case:** Build hierarchical annotations for variants.

@@ -258,26 +234,24 @@ Union with Deduplication

Combine features from multiple sources:

-.. code-block:: python
-
-    cursor = engine.execute("""
-        WITH all_peaks AS (
-            SELECT *, 'chip_seq' AS source FROM chip_peaks
-            UNION ALL
-            SELECT *, 'atac_seq' AS source FROM atac_peaks
-            UNION ALL
-            SELECT *, 'dnase_seq' AS source FROM dnase_peaks
-        )
-        SELECT
-            chromosome,
-            start_pos,
-            end_pos,
-            STRING_AGG(DISTINCT source, ',') AS sources,
-            COUNT(DISTINCT source) AS source_count
-        FROM all_peaks
-        GROUP BY chromosome, start_pos, end_pos
-        HAVING COUNT(DISTINCT source) >= 2
-    """)
+.. code-block:: sql
+
+   WITH all_peaks AS (
+       SELECT *, 'chip_seq' AS source FROM chip_peaks
+       UNION ALL
+       SELECT *, 'atac_seq' AS source FROM atac_peaks
+       UNION ALL
+       SELECT *, 'dnase_seq' AS source FROM dnase_peaks
+   )
+   SELECT
+       chrom,
+       start,
+       end,
+       STRING_AGG(DISTINCT source, ',') AS sources,
+       COUNT(DISTINCT source) AS source_count
+   FROM all_peaks
+   GROUP BY chrom, start, end
+   HAVING COUNT(DISTINCT source) >= 2

**Use case:** Find regulatory regions supported by multiple assays.

@@ -289,15 +263,13 @@ Filtered Subquery

Use subqueries to pre-filter data:

-.. code-block:: python
+.. code-block:: sql

-    cursor = engine.execute("""
-        SELECT v.*
-        FROM variants v
-        WHERE v.interval INTERSECTS ANY(
-            SELECT position FROM genes WHERE biotype = 'protein_coding'
-        )
-    """)
+   SELECT v.*
+   FROM variants v
+   WHERE v.interval INTERSECTS ANY(
+       SELECT interval FROM genes WHERE biotype = 'protein_coding'
+   )

**Use case:** Intersect with dynamically filtered reference data.

@@ -310,35 +282,33 @@ Chained CTEs

Build complex analyses with Common Table Expressions:

-.. code-block:: python
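+.. code-block:: sql
+
+   -- Editor's sketch (simplified from the recipe that follows): one CTE
+   -- isolates the quality filter before annotation.
+   WITH hq_variants AS (
+       SELECT * FROM variants WHERE quality >= 30 AND filter = 'PASS'
+   )
+   SELECT v.*, g.name AS gene_name
+   FROM hq_variants v
+   LEFT JOIN genes g ON v.interval INTERSECTS g.interval
+
+Stacking further CTEs turns this into a full pipeline:
+
+.. 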
code-block:: sql - cursor = engine.execute(""" - WITH - -- Step 1: Find high-quality variants - hq_variants AS ( - SELECT * FROM variants - WHERE quality >= 30 AND filter = 'PASS' - ), - -- Step 2: Annotate with genes - annotated AS ( - SELECT v.*, g.name AS gene_name, g.biotype - FROM hq_variants v - LEFT JOIN genes g ON v.interval INTERSECTS g.interval - ), - -- Step 3: Summarize by gene - gene_summary AS ( - SELECT - gene_name, - biotype, - COUNT(*) AS variant_count - FROM annotated - WHERE gene_name IS NOT NULL - GROUP BY gene_name, biotype - ) - SELECT * FROM gene_summary - ORDER BY variant_count DESC - LIMIT 20 - """) + WITH + -- Step 1: Find high-quality variants + hq_variants AS ( + SELECT * FROM variants + WHERE quality >= 30 AND filter = 'PASS' + ), + -- Step 2: Annotate with genes + annotated AS ( + SELECT v.*, g.name AS gene_name, g.biotype + FROM hq_variants v + LEFT JOIN genes g ON v.interval INTERSECTS g.interval + ), + -- Step 3: Summarize by gene + gene_summary AS ( + SELECT + gene_name, + biotype, + COUNT(*) AS variant_count + FROM annotated + WHERE gene_name IS NOT NULL + GROUP BY gene_name, biotype + ) + SELECT * FROM gene_summary + ORDER BY variant_count DESC + LIMIT 20 **Use case:** Build multi-step analysis pipelines in a single query. @@ -350,22 +320,20 @@ Rank Overlaps Rank features by their overlap characteristics: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - a.name, - a.chromosome, - a.start_pos, - overlap_count, - RANK() OVER (ORDER BY overlap_count DESC) AS rank - FROM ( - SELECT a.*, COUNT(b.name) AS overlap_count - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand - ) a - """) + SELECT + a.name, + a.chrom, + a.start, + overlap_count, + RANK() OVER (ORDER BY overlap_count DESC) AS rank + FROM ( + SELECT a.*, COUNT(b.name) AS overlap_count + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chrom, a.start, a.end, a.name, a.score, a.strand + ) a **Use case:** Identify features with the most overlaps. @@ -374,21 +342,19 @@ Running Totals Calculate cumulative coverage: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - chromosome, - start_pos, - end_pos, - end_pos - start_pos AS length, - SUM(end_pos - start_pos) OVER ( - PARTITION BY chromosome - ORDER BY start_pos - ) AS cumulative_bp - FROM features - ORDER BY chromosome, start_pos - """) + SELECT + chrom, + start, + end, + end - start AS length, + SUM(end - start) OVER ( + PARTITION BY chrom + ORDER BY start + ) AS cumulative_bp + FROM features + ORDER BY chrom, start **Use case:** Track cumulative coverage along each chromosome. @@ -398,35 +364,20 @@ Debugging and Optimization View Generated SQL ~~~~~~~~~~~~~~~~~~ -Use transpile() to see the SQL GIQL generates: +Use ``transpile()`` to see the SQL GIQL generates: .. code-block:: python - sql = engine.transpile(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) + from giql import transpile + + sql = transpile( + "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'", + tables=["variants"], + ) print(sql) - # See the actual SQL that will be executed **Use case:** Debug queries or understand GIQL's translation. -Verbose Mode -~~~~~~~~~~~~ - -Enable detailed logging: - -.. 
code-block:: python - - with GIQLEngine(target_dialect="duckdb", verbose=True) as engine: - # All queries will print transpilation details - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000-2000' - """) - -**Use case:** Diagnose query translation issues. - Explain Query Plan ~~~~~~~~~~~~~~~~~~ @@ -434,16 +385,18 @@ Analyze query execution: .. code-block:: python - # First transpile to get the SQL - sql = engine.transpile(""" + from giql import transpile + + sql = transpile( + """ SELECT v.*, g.name FROM variants v JOIN genes g ON v.interval INTERSECTS g.interval - """) + """, + tables=["variants", "genes"], + ) # Then use database-native EXPLAIN - cursor = engine.execute(f"EXPLAIN {sql}") - for row in cursor: - print(row) + # e.g., conn.execute(f"EXPLAIN {sql}") **Use case:** Optimize slow queries by examining execution plans. diff --git a/docs/recipes/bedtools-migration.rst b/docs/recipes/bedtools-migration.rst index 74c27bd..4a00011 100644 --- a/docs/recipes/bedtools-migration.rst +++ b/docs/recipes/bedtools-migration.rst @@ -20,19 +20,19 @@ Quick Reference Table - GIQL Equivalent - Recipe * - ``intersect -a A -b B`` - - ``SELECT DISTINCT a.* FROM a, b WHERE a.pos INTERSECTS b.pos`` + - ``SELECT DISTINCT a.* FROM a, b WHERE a.interval INTERSECTS b.interval`` - :ref:`intersect-basic` * - ``intersect -a A -b B -wa`` - - ``SELECT a.* FROM a, b WHERE a.pos INTERSECTS b.pos`` + - ``SELECT a.* FROM a, b WHERE a.interval INTERSECTS b.interval`` - :ref:`intersect-wa` * - ``intersect -a A -b B -wb`` - - ``SELECT b.* FROM a, b WHERE a.pos INTERSECTS b.pos`` + - ``SELECT b.* FROM a, b WHERE a.interval INTERSECTS b.interval`` - :ref:`intersect-wb` * - ``intersect -a A -b B -wa -wb`` - - ``SELECT a.*, b.* FROM a, b WHERE a.pos INTERSECTS b.pos`` + - ``SELECT a.*, b.* FROM a, b WHERE a.interval INTERSECTS b.interval`` - :ref:`intersect-wawb` * - ``intersect -a A -b B -v`` - - ``SELECT a.* FROM a LEFT JOIN b ... WHERE b.chr IS NULL`` + - ``SELECT a.* FROM a LEFT JOIN b ... WHERE b.chrom IS NULL`` - :ref:`intersect-v` * - ``intersect -a A -b B -u`` - ``SELECT DISTINCT a.* FROM a JOIN b ...`` @@ -47,10 +47,10 @@ Quick Reference Table - ``SELECT a.*, b.* FROM a LEFT JOIN b ...`` - :ref:`intersect-loj` * - ``closest -a A -b B -k N`` - - ``CROSS JOIN LATERAL NEAREST(b, reference=a.pos, k=N)`` + - ``CROSS JOIN LATERAL NEAREST(b, reference=a.interval, k=N)`` - :ref:`closest-k` * - ``closest -a A -b B -d`` - - ``SELECT ..., DISTANCE(a.pos, b.pos) ...`` + - ``SELECT ..., DISTANCE(a.interval, b.interval) ...`` - :ref:`closest-d` * - ``cluster -i A`` - ``SELECT *, CLUSTER(interval) AS cluster_id FROM a`` @@ -84,13 +84,11 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT DISTINCT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT DISTINCT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval .. _intersect-wa: @@ -105,13 +103,11 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval .. _intersect-wb: @@ -126,13 +122,11 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. 
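code-block:: sql
+
+   -- Editor's sketch (equivalent form, not in the original guide): the same
+   -- overlap written with an explicit INNER JOIN.
+   SELECT b.*
+   FROM features_a a
+   INNER JOIN features_b b ON a.interval INTERSECTS b.interval
+
+Or, in the implicit-join style used throughout this guide:
+
+.. 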
code-block:: sql - cursor = engine.execute(""" - SELECT b.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT b.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval .. _intersect-wawb: @@ -147,13 +141,11 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.*, b.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT a.*, b.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval .. _intersect-v: @@ -168,14 +160,12 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - WHERE b.chromosome IS NULL - """) + SELECT a.* + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + WHERE b.chrom IS NULL .. _intersect-u: @@ -190,13 +180,11 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT DISTINCT a.* - FROM features_a a - INNER JOIN features_b b ON a.interval INTERSECTS b.interval - """) + SELECT DISTINCT a.* + FROM features_a a + INNER JOIN features_b b ON a.interval INTERSECTS b.interval .. _intersect-c: @@ -211,14 +199,12 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.*, COUNT(b.name) AS overlap_count - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand - """) + SELECT a.*, COUNT(b.name) AS overlap_count + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chrom, a.start, a.end, a.name, a.score, a.strand .. _intersect-wo: @@ -233,16 +219,14 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - a.*, - b.*, - (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT + a.*, + b.*, + (LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS overlap_bp + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval .. _intersect-wao: @@ -257,19 +241,17 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - a.*, - b.*, - CASE - WHEN b.chromosome IS NULL THEN 0 - ELSE LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos) - END AS overlap_bp - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - """) + SELECT + a.*, + b.*, + CASE + WHEN b.chrom IS NULL THEN 0 + ELSE LEAST(a.end, b.end) - GREATEST(a.start, b.start) + END AS overlap_bp + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval .. _intersect-loj: @@ -284,13 +266,11 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. 
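code-block:: sql
+
+   -- Editor's sketch (hypothetical convenience, not in the original guide):
+   -- bedtools -loj prints "." and -1 for non-overlapping rows; COALESCE can
+   -- mimic those placeholders.
+   SELECT
+       a.*,
+       COALESCE(b.chrom, '.') AS b_chrom,
+       COALESCE(b.start, -1) AS b_start,
+       COALESCE(b.end, -1) AS b_end
+   FROM features_a a
+   LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
+
+The plain left join keeps SQL ``NULL`` values instead:
+
+.. 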
code-block:: sql - cursor = engine.execute(""" - SELECT a.*, b.* - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - """) + SELECT a.*, b.* + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval ``-s``: Same strand overlaps only ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -303,14 +283,12 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - AND a.strand = b.strand - """) + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand = b.strand ``-S``: Opposite strand overlaps only ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -323,16 +301,14 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - AND a.strand != b.strand - AND a.strand IN ('+', '-') - AND b.strand IN ('+', '-') - """) + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand != b.strand + AND a.strand IN ('+', '-') + AND b.strand IN ('+', '-') ``-f``: Minimum overlap fraction of A ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -345,16 +321,14 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - AND ( - LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos) - ) >= 0.5 * (a.end_pos - a.start_pos) - """) + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND ( + LEAST(a.end, b.end) - GREATEST(a.start, b.start) + ) >= 0.5 * (a.end - a.start) ``-r``: Reciprocal overlap ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -367,23 +341,21 @@ Default: Report overlaps between A and B **GIQL:** -.. code-block:: python - - cursor = engine.execute(""" - WITH overlap_calcs AS ( - SELECT - a.*, - (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp, - (a.end_pos - a.start_pos) AS a_length, - (b.end_pos - b.start_pos) AS b_length - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - ) - SELECT chromosome, start_pos, end_pos, name, score, strand - FROM overlap_calcs - WHERE overlap_bp >= 0.5 * a_length - AND overlap_bp >= 0.5 * b_length - """) +.. code-block:: sql + + WITH overlap_calcs AS ( + SELECT + a.*, + (LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS overlap_bp, + (a.end - a.start) AS a_length, + (b.end - b.start) AS b_length + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + ) + SELECT chrom, start, end, name, score, strand + FROM overlap_calcs + WHERE overlap_bp >= 0.5 * a_length + AND overlap_bp >= 0.5 * b_length bedtools closest ---------------- @@ -401,17 +373,15 @@ bedtools closest **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest - ORDER BY peaks.name, nearest.distance - """) + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest + ORDER BY peaks.name, nearest.distance .. 
_closest-d: @@ -426,31 +396,27 @@ bedtools closest **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - a.name AS peak, - b.name AS gene, - DISTANCE(a.interval, b.interval) AS distance - FROM peaks a - CROSS JOIN genes b - WHERE a.chromosome = b.chromosome - ORDER BY a.name, distance - """) + SELECT + a.name AS peak, + b.name AS gene, + DISTANCE(a.interval, b.interval) AS distance + FROM peaks a + CROSS JOIN genes b + WHERE a.chrom = b.chrom + ORDER BY a.name, distance Or using NEAREST for just the closest: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest - """) + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest ``-s``: Same strand only ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -463,22 +429,20 @@ Or using NEAREST for just the closest: **GIQL:** -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=3, - stranded=true - ) AS nearest - ORDER BY peaks.name, nearest.distance - """) +.. code-block:: sql + + SELECT + peaks.name, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=3, + stranded=true + ) AS nearest + ORDER BY peaks.name, nearest.distance bedtools cluster ---------------- @@ -496,15 +460,13 @@ Basic clustering **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval) AS cluster_id - FROM features - ORDER BY chromosome, start_pos - """) + SELECT + *, + CLUSTER(interval) AS cluster_id + FROM features + ORDER BY chrom, start .. _cluster-d: @@ -519,15 +481,13 @@ Basic clustering **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval, 1000) AS cluster_id - FROM features - ORDER BY chromosome, start_pos - """) + SELECT + *, + CLUSTER(interval, 1000) AS cluster_id + FROM features + ORDER BY chrom, start ``-s``: Strand-specific clustering ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -540,15 +500,13 @@ Basic clustering **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval, stranded=true) AS cluster_id - FROM features - ORDER BY chromosome, strand, start_pos - """) + SELECT + *, + CLUSTER(interval, stranded=true) AS cluster_id + FROM features + ORDER BY chrom, strand, start bedtools merge -------------- @@ -566,12 +524,10 @@ Basic merge **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval) - FROM features - """) + SELECT MERGE(interval) + FROM features .. _merge-d: @@ -586,12 +542,10 @@ Basic merge **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval, 1000) - FROM features - """) + SELECT MERGE(interval, 1000) + FROM features ``-s``: Strand-specific merge ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -604,12 +558,10 @@ Basic merge **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval, stranded=true) - FROM features - """) + SELECT MERGE(interval, stranded=true) + FROM features .. 
_merge-count: @@ -624,14 +576,12 @@ Basic merge **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - COUNT(*) AS feature_count - FROM features - """) + SELECT + MERGE(interval), + COUNT(*) AS feature_count + FROM features ``-c -o mean``: Average score ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -644,14 +594,12 @@ Basic merge **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - AVG(score) AS avg_score - FROM features - """) + SELECT + MERGE(interval), + AVG(score) AS avg_score + FROM features ``-c -o collapse``: Collect names ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -664,14 +612,12 @@ Basic merge **GIQL:** -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - STRING_AGG(name, ',') AS feature_names - FROM features - """) + SELECT + MERGE(interval), + STRING_AGG(name, ',') AS feature_names + FROM features Key Differences from Bedtools ----------------------------- diff --git a/docs/recipes/clustering-queries.rst b/docs/recipes/clustering-queries.rst index 6ff1487..3dbd682 100644 --- a/docs/recipes/clustering-queries.rst +++ b/docs/recipes/clustering-queries.rst @@ -16,15 +16,13 @@ Assign Cluster IDs Assign unique cluster IDs to groups of overlapping intervals: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval) AS cluster_id - FROM features - ORDER BY chromosome, start_pos - """) + SELECT + *, + CLUSTER(interval) AS cluster_id + FROM features + ORDER BY chrom, start **Use case:** Group overlapping peaks or annotations for downstream analysis. @@ -33,21 +31,19 @@ View Cluster Assignments See which features belong to which cluster: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - cluster_id, - chromosome, - name, - start_pos, - end_pos - FROM ( - SELECT *, CLUSTER(interval) AS cluster_id - FROM features - ) - ORDER BY cluster_id, start_pos - """) + SELECT + cluster_id, + chrom, + name, + start, + end + FROM ( + SELECT *, CLUSTER(interval) AS cluster_id + FROM features + ) + ORDER BY cluster_id, start **Use case:** Inspect clustering results to understand feature groupings. @@ -59,15 +55,13 @@ Cluster with Gap Tolerance Cluster intervals that are within a specified distance of each other: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval, 1000) AS cluster_id - FROM features - ORDER BY chromosome, start_pos - """) + SELECT + *, + CLUSTER(interval, 1000) AS cluster_id + FROM features + ORDER BY chrom, start **Use case:** Group nearby features even if they don't directly overlap (e.g., cluster peaks within 1kb of each other). @@ -77,22 +71,16 @@ Variable Distance Thresholds Experiment with different clustering distances: -.. code-block:: python +.. 
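code-block:: python
+.. code-block:: sql
+
+   -- Editor's sketch (not in the original recipe): check the resulting
+   -- cluster sizes at one candidate distance before settling on a threshold.
+   WITH c AS (
+       SELECT *, CLUSTER(interval, 500) AS cluster_id FROM features
+   )
+   SELECT cluster_id, COUNT(*) AS members
+   FROM c
+   GROUP BY cluster_id
+   ORDER BY members DESC
+
+Typical thresholds to compare side by side:
+
+.. 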
code-block:: sql - # Tight clustering (overlapping only) - cursor = engine.execute(""" - SELECT *, CLUSTER(interval, 0) AS tight_cluster FROM features - """) + -- Tight clustering (overlapping only) + SELECT *, CLUSTER(interval, 0) AS tight_cluster FROM features - # Medium clustering (within 500bp) - cursor = engine.execute(""" - SELECT *, CLUSTER(interval, 500) AS medium_cluster FROM features - """) + -- Medium clustering (within 500bp) + SELECT *, CLUSTER(interval, 500) AS medium_cluster FROM features - # Loose clustering (within 5kb) - cursor = engine.execute(""" - SELECT *, CLUSTER(interval, 5000) AS loose_cluster FROM features - """) + -- Loose clustering (within 5kb) + SELECT *, CLUSTER(interval, 5000) AS loose_cluster FROM features **Use case:** Compare clustering at different resolutions for sensitivity analysis. @@ -104,15 +92,13 @@ Cluster by Strand Cluster intervals separately for each strand: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval, stranded=true) AS cluster_id - FROM features - ORDER BY chromosome, strand, start_pos - """) + SELECT + *, + CLUSTER(interval, stranded=true) AS cluster_id + FROM features + ORDER BY chrom, strand, start **Use case:** Maintain strand separation when clustering transcripts or strand-specific regulatory elements. @@ -122,15 +108,13 @@ Strand-Specific with Distance Combine strand awareness with distance tolerance: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - *, - CLUSTER(interval, 1000, stranded=true) AS cluster_id - FROM features - ORDER BY chromosome, strand, start_pos - """) + SELECT + *, + CLUSTER(interval, 1000, stranded=true) AS cluster_id + FROM features + ORDER BY chrom, strand, start **Use case:** Cluster nearby same-strand features while keeping opposite strands separate. @@ -143,23 +127,21 @@ Count Features per Cluster Calculate how many features are in each cluster: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - WITH clustered AS ( - SELECT *, CLUSTER(interval) AS cluster_id - FROM features - ) - SELECT - cluster_id, - chromosome, - COUNT(*) AS feature_count, - MIN(start_pos) AS cluster_start, - MAX(end_pos) AS cluster_end - FROM clustered - GROUP BY cluster_id, chromosome - ORDER BY chromosome, cluster_start - """) + WITH clustered AS ( + SELECT *, CLUSTER(interval) AS cluster_id + FROM features + ) + SELECT + cluster_id, + chrom, + COUNT(*) AS feature_count, + MIN(start) AS cluster_start, + MAX(end) AS cluster_end + FROM clustered + GROUP BY cluster_id, chrom + ORDER BY chrom, cluster_start **Use case:** Identify cluster sizes and boundaries. @@ -168,24 +150,22 @@ Filter by Cluster Size Find clusters with a minimum number of features: -.. code-block:: python - - cursor = engine.execute(""" - WITH clustered AS ( - SELECT *, CLUSTER(interval) AS cluster_id - FROM features - ), - cluster_sizes AS ( - SELECT cluster_id, COUNT(*) AS size - FROM clustered - GROUP BY cluster_id - ) - SELECT c.* - FROM clustered c - JOIN cluster_sizes s ON c.cluster_id = s.cluster_id - WHERE s.size >= 3 - ORDER BY c.cluster_id, c.start_pos - """) +.. 
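code-block:: sql
+
+   -- Editor's sketch (added for contrast): GROUP BY ... HAVING finds the
+   -- qualifying clusters but keeps only the grouped columns.
+   WITH clustered AS (
+       SELECT *, CLUSTER(interval) AS cluster_id FROM features
+   )
+   SELECT cluster_id, COUNT(*) AS size
+   FROM clustered
+   GROUP BY cluster_id
+   HAVING COUNT(*) >= 3
+
+To keep every column of the member features, stage the counts in a second CTE:
+
+.. 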
code-block:: sql + + WITH clustered AS ( + SELECT *, CLUSTER(interval) AS cluster_id + FROM features + ), + cluster_sizes AS ( + SELECT cluster_id, COUNT(*) AS size + FROM clustered + GROUP BY cluster_id + ) + SELECT c.* + FROM clustered c + JOIN cluster_sizes s ON c.cluster_id = s.cluster_id + WHERE s.size >= 3 + ORDER BY c.cluster_id, c.start **Use case:** Focus on regions with multiple overlapping features (hotspots). @@ -194,26 +174,24 @@ Cluster Summary Statistics Calculate statistics for each cluster: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - WITH clustered AS ( - SELECT *, CLUSTER(interval) AS cluster_id - FROM features - ) - SELECT - cluster_id, - chromosome, - COUNT(*) AS feature_count, - MIN(start_pos) AS cluster_start, - MAX(end_pos) AS cluster_end, - MAX(end_pos) - MIN(start_pos) AS cluster_span, - AVG(score) AS avg_score, - MAX(score) AS max_score - FROM clustered - GROUP BY cluster_id, chromosome - ORDER BY feature_count DESC - """) + WITH clustered AS ( + SELECT *, CLUSTER(interval) AS cluster_id + FROM features + ) + SELECT + cluster_id, + chrom, + COUNT(*) AS feature_count, + MIN(start) AS cluster_start, + MAX(end) AS cluster_end, + MAX(end) - MIN(start) AS cluster_span, + AVG(score) AS avg_score, + MAX(score) AS max_score + FROM clustered + GROUP BY cluster_id, chrom + ORDER BY feature_count DESC **Use case:** Rank clusters by size, span, or aggregate scores. @@ -225,12 +203,10 @@ Merge Overlapping Intervals Combine overlapping intervals into unified regions: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval) - FROM features - """) + SELECT MERGE(interval) + FROM features **Use case:** Create non-overlapping consensus regions from redundant annotations. @@ -239,12 +215,10 @@ Merge with Distance Merge intervals within a specified distance: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval, 1000) - FROM features - """) + SELECT MERGE(interval, 1000) + FROM features **Use case:** Create broader regions by joining nearby features. @@ -253,12 +227,10 @@ Strand-Specific Merge Merge intervals separately by strand: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT MERGE(interval, stranded=true) - FROM features - """) + SELECT MERGE(interval, stranded=true) + FROM features **Use case:** Create strand-aware consensus regions. @@ -270,14 +242,12 @@ Count Merged Features Track how many features were merged into each region: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - COUNT(*) AS feature_count - FROM features - """) + SELECT + MERGE(interval), + COUNT(*) AS feature_count + FROM features **Use case:** Understand the complexity of each merged region. @@ -286,17 +256,15 @@ Aggregate Scores Calculate statistics for merged regions: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - COUNT(*) AS feature_count, - AVG(score) AS avg_score, - MAX(score) AS max_score, - SUM(score) AS total_score - FROM features - """) + SELECT + MERGE(interval), + COUNT(*) AS feature_count, + AVG(score) AS avg_score, + MAX(score) AS max_score, + SUM(score) AS total_score + FROM features **Use case:** Summarize signal intensity across merged regions. @@ -305,14 +273,12 @@ Collect Feature Names List the names of features that were merged: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT - MERGE(interval), - STRING_AGG(name, ',') AS merged_features - FROM features - """) + SELECT + MERGE(interval), + STRING_AGG(name, ',') AS merged_features + FROM features **Use case:** Track provenance of merged regions. @@ -324,16 +290,14 @@ Total Base Pair Coverage Calculate total genomic coverage after merging: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - WITH merged AS ( - SELECT MERGE(interval) - FROM features - ) - SELECT SUM(end_pos - start_pos) AS total_coverage_bp - FROM merged - """) + WITH merged AS ( + SELECT MERGE(interval) + FROM features + ) + SELECT SUM(end - start) AS total_coverage_bp + FROM merged **Use case:** Calculate the total genome fraction covered by features. @@ -342,21 +306,19 @@ Coverage per Chromosome Calculate coverage for each chromosome: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - WITH merged AS ( - SELECT MERGE(interval) - FROM features - ) - SELECT - chromosome, - COUNT(*) AS region_count, - SUM(end_pos - start_pos) AS coverage_bp - FROM merged - GROUP BY chromosome - ORDER BY chromosome - """) + WITH merged AS ( + SELECT MERGE(interval) + FROM features + ) + SELECT + chrom, + COUNT(*) AS region_count, + SUM(end - start) AS coverage_bp + FROM merged + GROUP BY chrom + ORDER BY chrom **Use case:** Compare feature density across chromosomes. @@ -365,29 +327,27 @@ Coverage Reduction Compare raw vs merged coverage: -.. code-block:: python - - cursor = engine.execute(""" - WITH raw_stats AS ( - SELECT - COUNT(*) AS raw_count, - SUM(end_pos - start_pos) AS raw_bp - FROM features - ), - merged_stats AS ( - SELECT - COUNT(*) AS merged_count, - SUM(end_pos - start_pos) AS merged_bp - FROM (SELECT MERGE(interval) FROM features) - ) +.. code-block:: sql + + WITH raw_stats AS ( + SELECT + COUNT(*) AS raw_count, + SUM(end - start) AS raw_bp + FROM features + ), + merged_stats AS ( SELECT - raw_count, - merged_count, - raw_bp, - merged_bp, - ROUND(100.0 * merged_bp / raw_bp, 2) AS coverage_retained_pct - FROM raw_stats, merged_stats - """) + COUNT(*) AS merged_count, + SUM(end - start) AS merged_bp + FROM (SELECT MERGE(interval) FROM features) + ) + SELECT + raw_count, + merged_count, + raw_bp, + merged_bp, + ROUND(100.0 * merged_bp / raw_bp, 2) AS coverage_retained_pct + FROM raw_stats, merged_stats **Use case:** Quantify the redundancy in your feature set. @@ -399,24 +359,22 @@ Cluster Then Merge First cluster features, then analyze each cluster: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - WITH clustered AS ( - SELECT *, CLUSTER(interval) AS cluster_id - FROM features - ) - SELECT - cluster_id, - MIN(chromosome) AS chromosome, - MIN(start_pos) AS start_pos, - MAX(end_pos) AS end_pos, - COUNT(*) AS feature_count, - STRING_AGG(name, ',') AS features - FROM clustered - GROUP BY cluster_id - ORDER BY chromosome, start_pos - """) + WITH clustered AS ( + SELECT *, CLUSTER(interval) AS cluster_id + FROM features + ) + SELECT + cluster_id, + MIN(chrom) AS chrom, + MIN(start) AS start, + MAX(end) AS end, + COUNT(*) AS feature_count, + STRING_AGG(name, ',') AS features + FROM clustered + GROUP BY cluster_id + ORDER BY chrom, start **Use case:** Alternative to MERGE that preserves cluster identifiers. @@ -425,26 +383,24 @@ Hierarchical Clustering Apply multiple clustering levels: -.. 
code-block:: python - - cursor = engine.execute(""" - WITH level1 AS ( - SELECT *, CLUSTER(interval, 0) AS cluster_l1 - FROM features - ), - level2 AS ( - SELECT *, CLUSTER(interval, 1000) AS cluster_l2 - FROM level1 - ) - SELECT - cluster_l1, - cluster_l2, - chromosome, - name, - start_pos, - end_pos - FROM level2 - ORDER BY cluster_l2, cluster_l1, start_pos - """) +.. code-block:: sql + + WITH level1 AS ( + SELECT *, CLUSTER(interval, 0) AS cluster_l1 + FROM features + ), + level2 AS ( + SELECT *, CLUSTER(interval, 1000) AS cluster_l2 + FROM level1 + ) + SELECT + cluster_l1, + cluster_l2, + chrom, + name, + start, + end + FROM level2 + ORDER BY cluster_l2, cluster_l1, start **Use case:** Analyze feature relationships at multiple scales. diff --git a/docs/recipes/distance-queries.rst b/docs/recipes/distance-queries.rst index 41f9ede..c71a4ee 100644 --- a/docs/recipes/distance-queries.rst +++ b/docs/recipes/distance-queries.rst @@ -16,24 +16,22 @@ Distance Between Feature Pairs Calculate the distance between features in two tables: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - a.name AS feature_a, - b.name AS feature_b, - DISTANCE(a.interval, b.interval) AS distance - FROM features_a a - CROSS JOIN features_b b - WHERE a.chromosome = b.chromosome - ORDER BY a.name, distance - """) + SELECT + a.name AS feature_a, + b.name AS feature_b, + DISTANCE(a.interval, b.interval) AS distance + FROM features_a a + CROSS JOIN features_b b + WHERE a.chrom = b.chrom + ORDER BY a.name, distance **Use case:** Generate a distance matrix between regulatory elements and genes. .. note:: - Always include ``WHERE a.chromosome = b.chromosome`` to avoid comparing + Always include ``WHERE a.chrom = b.chrom`` to avoid comparing features on different chromosomes (which returns NULL for distance). Identify Overlapping vs Proximal @@ -41,23 +39,21 @@ Identify Overlapping vs Proximal Classify relationships based on distance: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - p.name AS peak, - g.name AS gene, - DISTANCE(p.interval, g.interval) AS dist, - CASE - WHEN DISTANCE(p.interval, g.interval) = 0 THEN 'overlapping' - WHEN DISTANCE(p.interval, g.interval) <= 1000 THEN 'proximal (<1kb)' - WHEN DISTANCE(p.interval, g.interval) <= 10000 THEN 'nearby (<10kb)' - ELSE 'distant' - END AS relationship - FROM peaks p - CROSS JOIN genes g - WHERE p.chromosome = g.chromosome - """) +.. code-block:: sql + + SELECT + p.name AS peak, + g.name AS gene, + DISTANCE(p.interval, g.interval) AS dist, + CASE + WHEN DISTANCE(p.interval, g.interval) = 0 THEN 'overlapping' + WHEN DISTANCE(p.interval, g.interval) <= 1000 THEN 'proximal (<1kb)' + WHEN DISTANCE(p.interval, g.interval) <= 10000 THEN 'nearby (<10kb)' + ELSE 'distant' + END AS relationship + FROM peaks p + CROSS JOIN genes g + WHERE p.chrom = g.chrom **Use case:** Categorize peak-gene relationships for enhancer analysis. @@ -66,19 +62,17 @@ Filter by Maximum Distance Find feature pairs within a distance threshold: -.. code-block:: python +.. 
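code-block:: python
+.. code-block:: sql
+
+   -- Editor's sketch (alternative form, not in the original recipe): NEAREST
+   -- can bound the search per feature; k caps how many matches come back.
+   SELECT a.name AS a_name, nearest.name AS b_name, nearest.distance
+   FROM features_a a
+   CROSS JOIN LATERAL NEAREST(
+       features_b,
+       reference=a.interval,
+       k=10,
+       max_distance=50000
+   ) AS nearest
+
+To enumerate every qualifying pair instead, filter on ``DISTANCE``:
+
+.. 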
code-block:: sql - cursor = engine.execute(""" - SELECT - a.name, - b.name, - DISTANCE(a.interval, b.interval) AS dist - FROM features_a a - CROSS JOIN features_b b - WHERE a.chromosome = b.chromosome - AND DISTANCE(a.interval, b.interval) <= 50000 - ORDER BY dist - """) + SELECT + a.name, + b.name, + DISTANCE(a.interval, b.interval) AS dist + FROM features_a a + CROSS JOIN features_b b + WHERE a.chrom = b.chrom + AND DISTANCE(a.interval, b.interval) <= 50000 + ORDER BY dist **Use case:** Find regulatory elements within 50kb of genes. @@ -90,17 +84,15 @@ Find K Nearest Features For each peak, find the 3 nearest genes: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest - ORDER BY peaks.name, nearest.distance - """) + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest + ORDER BY peaks.name, nearest.distance **Use case:** Annotate ChIP-seq peaks with nearby genes. @@ -109,13 +101,11 @@ Nearest Feature to a Specific Location Find the 5 nearest genes to a specific genomic coordinate: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT name, distance - FROM NEAREST(genes, reference='chr1:1000000-1001000', k=5) - ORDER BY distance - """) + SELECT name, distance + FROM NEAREST(genes, reference='chr1:1000000-1001000', k=5) + ORDER BY distance **Use case:** Explore the genomic neighborhood of a position of interest. @@ -124,22 +114,20 @@ Nearest with Distance Constraint Find nearest features within a maximum distance: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=5, - max_distance=100000 - ) AS nearest - ORDER BY peaks.name, nearest.distance - """) +.. code-block:: sql + + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=100000 + ) AS nearest + ORDER BY peaks.name, nearest.distance **Use case:** Find regulatory targets within 100kb, ignoring distant genes. @@ -151,23 +139,21 @@ Same-Strand Nearest Neighbors Find nearest features on the same strand only: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.strand, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=3, - stranded=true - ) AS nearest - ORDER BY peaks.name, nearest.distance - """) +.. code-block:: sql + + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.strand, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=3, + stranded=true + ) AS nearest + ORDER BY peaks.name, nearest.distance **Use case:** Find same-strand genes for strand-specific regulatory analysis. @@ -179,23 +165,21 @@ Upstream Features Find features upstream (5') of reference positions using signed distances: -.. 
code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=10, - signed=true - ) AS nearest - WHERE nearest.distance < 0 - ORDER BY peaks.name, nearest.distance DESC - """) +.. code-block:: sql + + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=10, + signed=true + ) AS nearest + WHERE nearest.distance < 0 + ORDER BY peaks.name, nearest.distance DESC **Use case:** Find genes upstream of regulatory elements. @@ -209,23 +193,21 @@ Downstream Features Find features downstream (3') of reference positions: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=10, - signed=true - ) AS nearest - WHERE nearest.distance > 0 - ORDER BY peaks.name, nearest.distance - """) +.. code-block:: sql + + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=10, + signed=true + ) AS nearest + WHERE nearest.distance > 0 + ORDER BY peaks.name, nearest.distance **Use case:** Identify downstream targets of promoter elements. @@ -234,23 +216,21 @@ Promoter-Proximal Analysis Find features within a specific distance window around the reference: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=10, - signed=true - ) AS nearest - WHERE nearest.distance BETWEEN -2000 AND 500 - ORDER BY peaks.name, ABS(nearest.distance) - """) +.. code-block:: sql + + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=10, + signed=true + ) AS nearest + WHERE nearest.distance BETWEEN -2000 AND 500 + ORDER BY peaks.name, ABS(nearest.distance) **Use case:** Find genes with peaks in their promoter regions (-2kb to +500bp from TSS). @@ -262,25 +242,23 @@ Strand-Specific with Distance Constraint Find nearby same-strand features: -.. code-block:: python - - cursor = engine.execute(""" - SELECT - peaks.name AS peak, - nearest.name AS gene, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST( - genes, - reference=peaks.interval, - k=5, - max_distance=50000, - stranded=true, - signed=true - ) AS nearest - WHERE nearest.distance BETWEEN -10000 AND 10000 - ORDER BY peaks.name, ABS(nearest.distance) - """) +.. code-block:: sql + + SELECT + peaks.name AS peak, + nearest.name AS gene, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST( + genes, + reference=peaks.interval, + k=5, + max_distance=50000, + stranded=true, + signed=true + ) AS nearest + WHERE nearest.distance BETWEEN -10000 AND 10000 + ORDER BY peaks.name, ABS(nearest.distance) **Use case:** Find same-strand genes within ±10kb for promoter-enhancer analysis. @@ -292,23 +270,21 @@ Average Distance to Nearest Gene Calculate the average distance from peaks to their nearest gene: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - WITH nearest_genes AS ( - SELECT - peaks.name AS peak, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest - ) + WITH nearest_genes AS ( SELECT - COUNT(*) AS peak_count, - AVG(distance) AS avg_distance, - MIN(distance) AS min_distance, - MAX(distance) AS max_distance - FROM nearest_genes - """) + peaks.name AS peak, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest + ) + SELECT + COUNT(*) AS peak_count, + AVG(distance) AS avg_distance, + MIN(distance) AS min_distance, + MAX(distance) AS max_distance + FROM nearest_genes **Use case:** Characterize the genomic distribution of peaks relative to genes. @@ -317,25 +293,23 @@ Distance Distribution by Chromosome Analyze distance patterns per chromosome: -.. code-block:: python - - cursor = engine.execute(""" - WITH nearest_genes AS ( - SELECT - peaks.chromosome, - peaks.name AS peak, - nearest.distance - FROM peaks - CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest - ) +.. code-block:: sql + + WITH nearest_genes AS ( SELECT - chromosome, - COUNT(*) AS peak_count, - AVG(distance) AS avg_distance - FROM nearest_genes - GROUP BY chromosome - ORDER BY chromosome - """) + peaks.chrom, + peaks.name AS peak, + nearest.distance + FROM peaks + CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest + ) + SELECT + chrom, + COUNT(*) AS peak_count, + AVG(distance) AS avg_distance + FROM nearest_genes + GROUP BY chrom + ORDER BY chrom **Use case:** Compare regulatory element distribution across chromosomes. @@ -347,26 +321,24 @@ Expand Search Window Find features within an expanded window around each feature: -.. code-block:: python - - cursor = engine.execute(""" - WITH expanded AS ( - SELECT - name, - chromosome, - start_pos - 5000 AS search_start, - end_pos + 5000 AS search_end - FROM peaks - ) +.. code-block:: sql + + WITH expanded AS ( SELECT - e.name AS peak, - b.* - FROM expanded e - JOIN features_b b - ON b.chromosome = e.chromosome - AND b.start_pos < e.search_end - AND b.end_pos > e.search_start - """) + name, + chrom, + start - 5000 AS search_start, + end + 5000 AS search_end + FROM peaks + ) + SELECT + e.name AS peak, + b.* + FROM expanded e + JOIN features_b b + ON b.chrom = e.chrom + AND b.start < e.search_end + AND b.end > e.search_start **Use case:** Find all features within 5kb flanking regions. diff --git a/docs/recipes/index.rst b/docs/recipes/index.rst index f5d7a2c..5597846 100644 --- a/docs/recipes/index.rst +++ b/docs/recipes/index.rst @@ -11,34 +11,21 @@ using GIQL. Each recipe focuses on a specific use case with ready-to-use query p Getting Started with Recipes ---------------------------- -All recipes assume you have set up a GIQL engine and registered your table schemas: +All recipes show GIQL queries that you can transpile and execute on your database. +Setup: .. 
code-block:: python - from giql import GIQLEngine - - with GIQLEngine(target_dialect="duckdb") as engine: - # Load your data - engine.load_csv("features_a", "file_a.bed") - engine.load_csv("features_b", "file_b.bed") - - # Register schemas with genomic column mapping - for table in ["features_a", "features_b"]: - engine.register_table_schema( - table, - { - "chromosome": "VARCHAR", - "start_pos": "BIGINT", - "end_pos": "BIGINT", - "name": "VARCHAR", - "score": "FLOAT", - "strand": "VARCHAR", - }, - genomic_column="interval", - ) - - # Now run queries from the recipes below - cursor = engine.execute("...") + from giql import transpile + + # Transpile any GIQL query to SQL + sql = transpile( + "... GIQL query from the recipes below ...", + tables=["features_a", "features_b"], + ) + + # Then execute the SQL on your database connection + # e.g., conn.execute(sql) Recipe Categories ----------------- diff --git a/docs/recipes/intersect-queries.rst b/docs/recipes/intersect-queries.rst index fee0324..ef7c022 100644 --- a/docs/recipes/intersect-queries.rst +++ b/docs/recipes/intersect-queries.rst @@ -16,13 +16,11 @@ Basic Overlap Query Find all features in table A that overlap with any feature in table B: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT DISTINCT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT DISTINCT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval **Use case:** Identify variants that fall within gene regions. @@ -32,13 +30,11 @@ Get All Overlap Pairs Return every pair of overlapping features (may produce duplicates if one feature overlaps multiple others): -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.*, b.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT a.*, b.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval **Use case:** Generate a full overlap matrix for downstream analysis. @@ -47,12 +43,10 @@ Query Against a Specific Region Find features overlapping a literal genomic range: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT * FROM variants - WHERE interval INTERSECTS 'chr1:1000000-2000000' - """) + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000000-2000000' **Use case:** Extract all data for a specific chromosomal region. @@ -64,14 +58,12 @@ Excluding Overlaps Find features in A that do NOT overlap with any feature in B: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - WHERE b.chromosome IS NULL - """) + SELECT a.* + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + WHERE b.chrom IS NULL **Use case:** Find regulatory regions that don't overlap with known genes, or identify variants outside of exonic regions. @@ -81,13 +73,11 @@ Features with Any Overlap (Unique) Return each feature from A only once, regardless of how many B features it overlaps: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT DISTINCT a.* - FROM features_a a - INNER JOIN features_b b ON a.interval INTERSECTS b.interval - """) + SELECT DISTINCT a.* + FROM features_a a + INNER JOIN features_b b ON a.interval INTERSECTS b.interval **Use case:** Get a deduplicated list of features that have at least one overlap. 
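
An equivalent formulation uses ``EXISTS``, which avoids producing the
duplicate rows in the first place (editor's sketch; assumes your GIQL
version supports genomic operators inside subqueries):

.. code-block:: sql

   SELECT a.*
   FROM features_a a
   WHERE EXISTS (
       SELECT 1 FROM features_b b
       WHERE a.interval INTERSECTS b.interval
   )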
@@ -99,14 +89,12 @@ Count Overlapping Features Count how many B features each A feature overlaps: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.*, COUNT(b.name) AS overlap_count - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand - """) + SELECT a.*, COUNT(b.name) AS overlap_count + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chrom, a.start, a.end, a.name, a.score, a.strand **Use case:** Calculate how many enhancers each gene overlaps with, or count variants per feature. @@ -116,15 +104,13 @@ Filter by Overlap Count Find features that overlap at least N other features: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a - INNER JOIN features_b b ON a.interval INTERSECTS b.interval - GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand - HAVING COUNT(*) >= 3 - """) + SELECT a.* + FROM features_a a + INNER JOIN features_b b ON a.interval INTERSECTS b.interval + GROUP BY a.chrom, a.start, a.end, a.name, a.score, a.strand + HAVING COUNT(*) >= 3 **Use case:** Identify hotspot regions with high feature density. @@ -136,14 +122,12 @@ Same-Strand Overlaps Find overlapping features on the same strand: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.*, b.name AS b_name - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - AND a.strand = b.strand - """) + SELECT a.*, b.name AS b_name + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand = b.strand **Use case:** Find sense-strand overlaps for transcript analysis. @@ -152,16 +136,14 @@ Opposite-Strand Overlaps Find overlapping features on opposite strands: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.*, b.name AS b_name - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - AND a.strand != b.strand - AND a.strand IN ('+', '-') - AND b.strand IN ('+', '-') - """) + SELECT a.*, b.name AS b_name + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand != b.strand + AND a.strand IN ('+', '-') + AND b.strand IN ('+', '-') **Use case:** Identify antisense overlaps or convergent transcription. @@ -173,16 +155,14 @@ Minimum Overlap Fraction of A Find overlaps where at least 50% of feature A is covered: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - AND ( - LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos) - ) >= 0.5 * (a.end_pos - a.start_pos) - """) + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND ( + LEAST(a.end, b.end) - GREATEST(a.start, b.start) + ) >= 0.5 * (a.end - a.start) **Use case:** Ensure substantial overlap rather than just touching edges. @@ -191,16 +171,14 @@ Minimum Overlap Fraction of B Find overlaps where at least 50% of feature B is covered: -.. code-block:: python +.. 
code-block:: sql - cursor = engine.execute(""" - SELECT a.* - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - AND ( - LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos) - ) >= 0.5 * (b.end_pos - b.start_pos) - """) + SELECT a.* + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + AND ( + LEAST(a.end, b.end) - GREATEST(a.start, b.start) + ) >= 0.5 * (b.end - b.start) **Use case:** Find features that substantially cover smaller annotations. @@ -209,24 +187,22 @@ Reciprocal Overlap Require both features to have at least 50% mutual overlap: -.. code-block:: python - - cursor = engine.execute(""" - WITH overlap_calcs AS ( - SELECT - a.*, - b.name AS b_name, - (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp, - (a.end_pos - a.start_pos) AS a_length, - (b.end_pos - b.start_pos) AS b_length - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - ) - SELECT * - FROM overlap_calcs - WHERE overlap_bp >= 0.5 * a_length - AND overlap_bp >= 0.5 * b_length - """) +.. code-block:: sql + + WITH overlap_calcs AS ( + SELECT + a.*, + b.name AS b_name, + (LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS overlap_bp, + (a.end - a.start) AS a_length, + (b.end - b.start) AS b_length + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval + ) + SELECT * + FROM overlap_calcs + WHERE overlap_bp >= 0.5 * a_length + AND overlap_bp >= 0.5 * b_length **Use case:** Find high-confidence overlaps where features mutually cover each other. @@ -238,13 +214,11 @@ Left Outer Join Report all features from A, with B information where available: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT a.*, b.name AS overlapping_feature - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - """) + SELECT a.*, b.name AS overlapping_feature + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval **Use case:** Annotate features with overlap information while keeping all records. @@ -253,16 +227,14 @@ Calculate Overlap Amount Return the overlap size in base pairs: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - a.*, - b.name AS b_name, - (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp - FROM features_a a, features_b b - WHERE a.interval INTERSECTS b.interval - """) + SELECT + a.*, + b.name AS b_name, + (LEAST(a.end, b.end) - GREATEST(a.start, b.start)) AS overlap_bp + FROM features_a a, features_b b + WHERE a.interval INTERSECTS b.interval **Use case:** Quantify the extent of each overlap. @@ -271,19 +243,17 @@ Overlap with NULL Handling Report overlap amount for all A features, with 0 for non-overlapping: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT - a.*, - b.name AS b_name, - CASE - WHEN b.chromosome IS NULL THEN 0 - ELSE LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos) - END AS overlap_bp - FROM features_a a - LEFT JOIN features_b b ON a.interval INTERSECTS b.interval - """) + SELECT + a.*, + b.name AS b_name, + CASE + WHEN b.chrom IS NULL THEN 0 + ELSE LEAST(a.end, b.end) - GREATEST(a.start, b.start) + END AS overlap_bp + FROM features_a a + LEFT JOIN features_b b ON a.interval INTERSECTS b.interval **Use case:** Create a complete overlap report including non-overlapping features. @@ -295,26 +265,18 @@ Union Multiple Sources Intersect A with features from multiple B tables: -.. 
code-block:: python - - # Load and register multiple tables first - engine.load_csv("features_b1", "file1.bed") - engine.load_csv("features_b2", "file2.bed") - engine.load_csv("features_b3", "file3.bed") - # Register schemas for each... - - cursor = engine.execute(""" - WITH all_b_features AS ( - SELECT * FROM features_b1 - UNION ALL - SELECT * FROM features_b2 - UNION ALL - SELECT * FROM features_b3 - ) - SELECT DISTINCT a.* - FROM features_a a - INNER JOIN all_b_features b ON a.interval INTERSECTS b.interval - """) +.. code-block:: sql + + WITH all_b_features AS ( + SELECT * FROM features_b1 + UNION ALL + SELECT * FROM features_b2 + UNION ALL + SELECT * FROM features_b3 + ) + SELECT DISTINCT a.* + FROM features_a a + INNER JOIN all_b_features b ON a.interval INTERSECTS b.interval **Use case:** Find features overlapping any region from multiple annotation sources. @@ -323,20 +285,18 @@ Track Overlap Source Know which source table each overlap came from: -.. code-block:: python - - cursor = engine.execute(""" - WITH all_b_features AS ( - SELECT *, 'source1' AS source FROM features_b1 - UNION ALL - SELECT *, 'source2' AS source FROM features_b2 - UNION ALL - SELECT *, 'source3' AS source FROM features_b3 - ) - SELECT a.*, b.name AS overlap_name, b.source - FROM features_a a - INNER JOIN all_b_features b ON a.interval INTERSECTS b.interval - """) +.. code-block:: sql + + WITH all_b_features AS ( + SELECT *, 'source1' AS source FROM features_b1 + UNION ALL + SELECT *, 'source2' AS source FROM features_b2 + UNION ALL + SELECT *, 'source3' AS source FROM features_b3 + ) + SELECT a.*, b.name AS overlap_name, b.source + FROM features_a a + INNER JOIN all_b_features b ON a.interval INTERSECTS b.interval **Use case:** Track which annotation database each overlap originated from. @@ -348,16 +308,14 @@ Overlap with Quality Filters Combine spatial and attribute filters: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT v.*, g.name AS gene_name - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - WHERE v.quality >= 30 - AND g.biotype = 'protein_coding' - ORDER BY v.chromosome, v.start_pos - """) + SELECT v.*, g.name AS gene_name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE v.quality >= 30 + AND g.biotype = 'protein_coding' + ORDER BY v.chrom, v.start **Use case:** Find high-quality variants in protein-coding genes. @@ -366,14 +324,12 @@ Specific Target Genes Find overlaps with a specific set of genes: -.. code-block:: python +.. code-block:: sql - cursor = engine.execute(""" - SELECT v.*, g.name AS gene_name - FROM variants v - INNER JOIN genes g ON v.interval INTERSECTS g.interval - WHERE g.name IN ('BRCA1', 'BRCA2', 'TP53', 'EGFR') - ORDER BY g.name, v.start_pos - """) + SELECT v.*, g.name AS gene_name + FROM variants v + INNER JOIN genes g ON v.interval INTERSECTS g.interval + WHERE g.name IN ('BRCA1', 'BRCA2', 'TP53', 'EGFR') + ORDER BY g.name, v.start **Use case:** Extract variants in clinically relevant genes. diff --git a/docs/transpilation/api-reference.rst b/docs/transpilation/api-reference.rst new file mode 100644 index 0000000..fcba984 --- /dev/null +++ b/docs/transpilation/api-reference.rst @@ -0,0 +1,13 @@ +API Reference +============= + +.. currentmodule:: giql + +.. autosummary:: + + transpile + Table + +.. autofunction:: transpile + +.. 
autoclass:: Table diff --git a/docs/transpilation/execution.rst b/docs/transpilation/execution.rst new file mode 100644 index 0000000..72ea9de --- /dev/null +++ b/docs/transpilation/execution.rst @@ -0,0 +1,152 @@ +Execution +========= + +How to use transpiled SQL +------------------------- + +You can write queries in the GIQL dialect and execute them on any SQL-92 +compliant database or analytics engine, without needing native GIQL support. + +With external database connections +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use transpiled SQL with your own database connections: + +.. code-block:: python + + import duckdb + from giql import transpile + + sql = transpile( + """ + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["variants"], + ) + + conn = duckdb.connect("my_database.duckdb") + result = conn.execute(sql).fetchall() + conn.close() + +With ORMs and query builders +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Integrate transpiled SQL with SQLAlchemy or other ORMs: + +.. code-block:: python + + from sqlalchemy import create_engine, text + from giql import transpile + + sql = transpile( + """ + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["variants"], + ) + + engine = create_engine("duckdb:///my_database.duckdb") + with engine.connect() as conn: + result = conn.execute(text(sql)) + for row in result: + print(row) + +Building SQL pipelines +~~~~~~~~~~~~~~~~~~~~~~ + +Use transpilation in data pipelines: + +.. code-block:: python + + from giql import transpile + + def build_intersection_query(table_a, table_b, region): + """Generate SQL for intersection query.""" + return transpile( + f""" + SELECT a.*, b.name + FROM {table_a} a + JOIN {table_b} b ON a.interval INTERSECTS b.interval + WHERE a.interval INTERSECTS '{region}' + """, + tables=[table_a, table_b], + ) + + # Use in pipeline + sql = build_intersection_query("variants", "genes", "chr1:1000000-2000000") + # Execute sql with your preferred method + +Saving queries +~~~~~~~~~~~~~~ + +Save transpiled SQL for documentation or reuse: + +.. code-block:: python + + import duckdb + from giql import transpile + + sql = transpile( + """ + SELECT * FROM variants + WHERE interval INTERSECTS 'chr1:1000-2000' + """, + tables=["variants"], + ) + + with open("query.sql", "w") as f: + f.write(sql) + + # Later, execute saved SQL + with open("query.sql") as f: + sql = f.read() + + conn = duckdb.connect("database.duckdb") + result = conn.execute(sql).fetchall() + +Parameterized queries +~~~~~~~~~~~~~~~~~~~~~ + +Build queries with parameters: + +.. code-block:: python + + from giql import transpile + + def query_region(chrom, start, end): + """Transpile a parameterized region query.""" + region = f"{chrom}:{start}-{end}" + return transpile( + f""" + SELECT * FROM variants + WHERE interval INTERSECTS '{region}' + """, + tables=["variants"], + ) + + # Use with different regions + sql = query_region("chr1", 1000000, 2000000) + sql = query_region("chr2", 5000000, 6000000) + +Dynamic query building +~~~~~~~~~~~~~~~~~~~~~~ + +Build queries programmatically: + +.. 
code-block:: python
+
+    from giql import transpile
+
+    def build_multi_table_query(tables, target_region):
+        """Build a query that unions results from multiple tables."""
+        union_parts = []
+        for table in tables:
+            union_parts.append(f"""
+            SELECT *, '{table}' AS source FROM {table}
+            WHERE interval INTERSECTS '{target_region}'
+            """)
+
+        query = " UNION ALL ".join(union_parts)
+        return transpile(query, tables=list(tables))
diff --git a/docs/transpilation/index.rst b/docs/transpilation/index.rst
new file mode 100644
index 0000000..e5e743b
--- /dev/null
+++ b/docs/transpilation/index.rst
@@ -0,0 +1,210 @@
+Transpilation
+=============
+
+The ``giql`` Python package transpiles GIQL into SQL.
+
+How it works
+------------
+
+When you do this:
+
+.. code-block:: python
+
+    from giql import transpile
+
+    sql = transpile(
+        "SELECT * FROM variants WHERE interval INTERSECTS 'chr1:1000-2000'",
+        tables=["variants"],
+    )
+
+    print(sql)
+
+The transpiler performs three main steps:
+
+1. **Parses** the GIQL query into an abstract syntax tree (AST) to identify GIQL-specific operators
+2. **Transforms** genomic operators into SQL predicates and Common Table Expressions (CTEs), and replaces genomic pseudo-columns with actual column references
+3. **Generates** SQL output from the modified AST
+
+The result is a standard SQL query that can be consumed by an execution engine that is not genome-aware.
+
+.. code-block:: sql
+
+    SELECT * FROM variants
+    WHERE "chrom" = 'chr1' AND "start" < 2000 AND "end" > 1000
+
+
+Examples
+--------
+
+Each GIQL operator expands to specific SQL patterns.
+
+**INTERSECTS** expands to range overlap checks:
+
+.. tab-set::
+
+    .. tab-item:: GIQL
+
+        .. code-block:: sql
+
+            a.interval INTERSECTS b.interval
+
+    .. tab-item:: SQL
+
+        .. code-block:: sql
+
+            a."chrom" = b."chrom"
+            AND a."start" < b."end"
+            AND a."end" > b."start"
+
+**CONTAINS** expands to containment checks:
+
+.. tab-set::
+
+    .. tab-item:: GIQL
+
+        .. code-block:: sql
+
+            a.interval CONTAINS b.interval
+
+    .. tab-item:: SQL
+
+        .. code-block:: sql
+
+            a."chrom" = b."chrom"
+            AND a."start" <= b."start"
+            AND a."end" >= b."end"
+
+**DISTANCE** expands to gap calculations:
+
+.. tab-set::
+
+    .. tab-item:: GIQL
+
+        .. code-block:: sql
+
+            DISTANCE(a.interval, b.interval)
+
+    .. tab-item:: SQL
+
+        .. code-block:: sql
+
+            CASE
+                WHEN a."chrom" != b."chrom" THEN NULL
+                WHEN a."end" <= b."start" THEN b."start" - a."end"
+                WHEN b."end" <= a."start" THEN a."start" - b."end"
+                ELSE 0
+            END
+
+**Intersection joins** expand to inequality joins:
+
+.. tab-set::
+
+    .. tab-item:: GIQL
+
+        .. code-block:: sql
+
+            SELECT v.*, g.name AS gene_name
+            FROM variants v
+            JOIN genes g ON v.interval INTERSECTS g.interval
+            WHERE v.quality >= 30
+
+    .. tab-item:: SQL
+
+        .. code-block:: sql
+
+            SELECT v.*, g.name AS gene_name
+            FROM variants AS v
+            JOIN genes AS g
+                ON v."chrom" = g."chrom"
+                AND v."start" < g."end"
+                AND v."end" > g."start"
+            WHERE v.quality >= 30
+
+**NEAREST** expands to lateral subqueries:
+
+.. tab-set::
+
+    .. tab-item:: GIQL
+
+        .. code-block:: sql
+
+            SELECT peaks.name, nearest.name, nearest.distance
+            FROM peaks
+            CROSS JOIN LATERAL NEAREST(
+                genes, reference=peaks.interval, k=5
+            ) AS nearest
+
+    .. tab-item:: SQL
+
+        ..
code-block:: sql + + SELECT peaks.name, nearest.name, nearest.distance + FROM peaks + CROSS JOIN LATERAL ( + SELECT + genes.*, + CASE + WHEN peaks."chrom" != genes."chrom" THEN NULL + WHEN peaks."start" < genes."end" + AND peaks."end" > genes."start" THEN 0 + WHEN peaks."end" <= genes."start" + THEN genes."start" - peaks."end" + ELSE peaks."start" - genes."end" + END AS distance + FROM genes + WHERE peaks."chrom" = genes."chrom" + ORDER BY ABS( + CASE + WHEN peaks."chrom" != genes."chrom" THEN NULL + WHEN peaks."start" < genes."end" + AND peaks."end" > genes."start" THEN 0 + WHEN peaks."end" <= genes."start" + THEN genes."start" - peaks."end" + ELSE peaks."start" - genes."end" + END + ) + LIMIT 5 + ) AS nearest + +**MERGE** expands to window-function-based clustering: + +.. tab-set:: + + .. tab-item:: GIQL + + .. code-block:: sql + + SELECT MERGE(interval), COUNT(*) AS count + FROM features + + .. tab-item:: SQL + + .. code-block:: sql + + SELECT + "chrom", + MIN("start") AS start, + MAX("end") AS end, + COUNT(*) AS count + FROM ( + SELECT + *, + SUM(is_new_cluster) OVER ( + PARTITION BY "chrom" + ORDER BY "start" NULLS LAST + ) AS __giql_cluster_id + FROM ( + SELECT + *, + CASE + WHEN LAG("end") OVER ( + PARTITION BY "chrom" + ORDER BY "start" NULLS LAST + ) >= "start" THEN 0 + ELSE 1 + END AS is_new_cluster + FROM features + ) AS lag_calc + ) AS clustered + GROUP BY chrom, __giql_cluster_id + ORDER BY "chrom" NULLS LAST, "start" NULLS LAST diff --git a/src/giql/__init__.py b/src/giql/__init__.py index 064f546..71e895d 100644 --- a/src/giql/__init__.py +++ b/src/giql/__init__.py @@ -1,13 +1,6 @@ """GIQL - Genomic Interval Query Language. A SQL dialect for genomic range queries. - -This package provides: - - GIQL dialect extending SQL with spatial operators (INTERSECTS, CONTAINS, WITHIN) - - CLUSTER and MERGE operations for interval grouping - - NEAREST operator for finding closest intervals - - Range parser for genomic coordinate strings - - Transpilation to standard SQL-92 compatible output """ from giql.table import Table From 9be3fdc37f655f74ee7c5393daacd453a024eeab Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 10 Feb 2026 01:28:20 -0500 Subject: [PATCH 06/12] Add doc deps and use dependency-groups --- pyproject.toml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 766b52f..59d41a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ name = "giql" readme = "README.md" requires-python = ">=3.11" -[project.optional-dependencies] +[dependency-groups] dev = [ "duckdb>=1.4.0", "hypothesis>=6.0.0", @@ -37,6 +37,12 @@ dev = [ "pytest>=7.0.0", "ruff>=0.1.0", ] +docs = [ + "sphinx>=7.0", + "sphinx-autobuild>=2024.0", + "sphinx-book-theme>=1.1", + "sphinx-design>=0.6", +] [tool.hatch.metadata.hooks.custom] path = "build-hooks/metadata.py" From aff3b356f0b653ea89267fb0fe9f81925e73cab5 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 10 Feb 2026 01:28:29 -0500 Subject: [PATCH 07/12] Simplify readme --- README.md | 140 +++++++++++++++--------------------------------------- 1 file changed, 38 insertions(+), 102 deletions(-) diff --git a/README.md b/README.md index 6b4d368..6fc0b1c 100644 --- a/README.md +++ b/README.md @@ -1,79 +1,34 @@ # GIQL - Genomic Interval Query Language -A SQL dialect for genomic range queries. Transpiles to standard SQL. + +

+ docs | + syntax | + transpiler +

+
+GIQL is an extended SQL dialect that allows you to declaratively express genomic interval operations. -## Overview +The `giql` Python package transpiles GIQL queries into standard SQL syntax for execution on any database or analytics engine. -GIQL extends SQL with spatial operators for genomic interval queries. It transpiles GIQL queries into standard SQL that can be executed on any database backend. - -GIQL provides a familiar SQL syntax for bioinformatics workflows, allowing you to express complex genomic range operations without writing intricate SQL expressions. Whether you're filtering variants by genomic region, finding overlapping features, or calculating distances between intervals, GIQL makes these operations intuitive and portable. - -## Features - -- **SQL-based**: Familiar SQL syntax with genomic extensions -- **Spatial operators**: INTERSECTS, CONTAINS, WITHIN for range relationships -- **Distance operators**: DISTANCE, NEAREST for proximity queries -- **Aggregation operators**: CLUSTER, MERGE for combining intervals -- **Set quantifiers**: ANY, ALL for multi-range queries -- **Transpilation**: Converts GIQL to standard SQL for execution on any backend +> **Note:** This project is in active development — APIs, syntax, and behavior may change. ## Installation -### From PyPI - -Install the latest stable release: +To install the transpiler: ```bash pip install giql ``` -Or the latest release candidate: - -```bash -pip install --pre giql -``` - -### From Source - -Clone the repository and install locally: - -```bash -# Clone the repository -git clone https://github.com/abdenlab/giql.git -cd giql - -# Install in development mode -pip install -e . - -# Or with development dependencies -pip install -e ".[dev]" -``` - -### Building Documentation - -To build the documentation locally: - -```bash -cd docs - -# Install documentation dependencies -pip install -r requirements.txt - -# Build HTML documentation -make html - -# View the documentation -# The built docs will be in docs/_build/html/ -# Open docs/_build/html/index.html in your browser -``` +## Usage (transpilation) -## Quick Start +The `giql` package transpiles GIQL queries to standard SQL. ```python from giql import transpile -# Transpile a GIQL query to standard SQL sql = transpile( "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", tables=["peaks"], @@ -81,7 +36,7 @@ sql = transpile( print(sql) ``` -With custom column mappings: +Each table referenced in a GIQL query exposes a genomic "pseudo-column" that maps to separate logical chromosome, start, end, and strand columns. You can customize the column mappings. ```python from giql import Table, transpile @@ -98,9 +53,12 @@ sql = transpile( ) ], ) +print(sql) ``` -Execution example with DuckDB: +The transpiled SQL can be executed with fast genome-unaware databases or in-memory analytic engines like DuckDB. + +You can also use [oxbow](https://oxbow.readthedocs.io) to efficiently stream specialized genomics formats into DuckDB. 
```python
import duckdb
@@ -108,57 +66,35 @@ import oxbow as ox
from giql import transpile

conn = duckdb.connect()
-peaks = ox.from_bed("peaks.bed", bed_schema="bed6+4").to_duckdb(conn) # streaming source
+
+# Load a streaming data source as a DuckDB relation
+peaks = ox.from_bed("peaks.bed", bed_schema="bed6+4").to_duckdb(conn)

sql = transpile(
    "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'",
    tables=["peaks"],
)
+
+# Execute and return the output as a dataframe
df = conn.execute(sql).fetchdf()
```

-## Operators at a Glance
-
-### Spatial Relationships
-
-| Operator | Description |
-|----------|-------------|
-| `INTERSECTS` | Returns true when ranges overlap by at least one base pair |
-| `CONTAINS` | Returns true when one range fully contains another |
-| `WITHIN` | Returns true when one range is fully within another |
-
-### Distance and Proximity
-
-| Operator | Description |
-|----------|-------------|
-| `DISTANCE` | Calculate genomic distance between two intervals |
-| `NEAREST` | Find k-nearest genomic features |
-
-### Aggregation
-
-| Operator | Description |
-|----------|-------------|
-| `CLUSTER` | Assign cluster IDs to overlapping intervals |
-| `MERGE` | Combine overlapping intervals into unified regions |
-
-### Set Quantifiers
-
-| Quantifier | Description |
-|------------|-------------|
-| `ANY` | Match if condition holds for any of the specified ranges |
-| `ALL` | Match if condition holds for all of the specified ranges |
-
-## Documentation
-
-For complete documentation, build the docs locally (see above) or visit the hosted documentation.
+## Development

-The documentation includes:
+```bash
+git clone https://github.com/abdenlab/giql.git
+cd giql
+uv sync
+```

-- **Operator Reference**: Detailed documentation for each operator with examples
-- **Recipes**: Common query patterns for intersections, distance calculations, and clustering
-- **Bedtools Migration Guide**: How to replicate bedtools operations with GIQL
-- **Guides**: Performance optimization, multi-backend configuration, and schema mapping
+To build the documentation locally:

-## Development
+```bash
+uv run --group docs sphinx-build docs docs/_build
+# The built docs will be in docs/_build/
+```

-This project is in active development.
+To serve the docs locally with automatic rebuild:
+```bash
+uv run --group docs sphinx-autobuild docs docs/_build
+```

From f3196389c287e40a71a812418eaf9b4c78e5fa4e Mon Sep 17 00:00:00 2001
From: Nezar Abdennur
Date: Tue, 10 Feb 2026 01:37:26 -0500
Subject: [PATCH 08/12] Update readthedocs.yaml

---
 .readthedocs.yaml | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 80eac40..a2caa3d 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -6,30 +6,20 @@ version: 2

 # Set the OS, Python version and other tools
 build:
-  os: "ubuntu-22.04"
+  os: "ubuntu-24.04"
   tools:
     python: "3.11"
+  jobs:
+    post_install:
+      - python -m pip install --group docs

 # Build documentation in the docs/ directory with Sphinx
 sphinx:
   configuration: docs/conf.py

-formats:
-  - epub
-
-# Specify the Python requirements file
+# Install the project
 python:
   install:
-    - requirements: docs/requirements.txt
     - method: pip
       path: .

-# Version management:
-# Documentation is immutable per release. Each git tag (v0.1.0, v0.2.0, etc.) produces
-# a frozen snapshot of the docs as they were at that release.
-# -# - Tags (v0.1.0, v0.2.0, ...): Immutable doc snapshots -# - main/master branches: Development version docs (always latest) -# -# This ensures historical accuracy: users viewing old version docs see them exactly -# as they were released, not retrospectively updated. From d1ae866fb92d372b117d6d5ffca1be16e66ae657 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 10 Feb 2026 01:52:04 -0500 Subject: [PATCH 09/12] Update docs --- README.md | 4 ++-- docs/index.rst | 9 ++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6fc0b1c..7c9c95a 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,8 @@

docs | - syntax | - transpiler + syntax | + transpiler

diff --git a/docs/index.rst b/docs/index.rst index b595529..417faad 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,8 +1,16 @@ Genomic Interval Query Language (GIQL) ====================================== +.. toctree:: + :hidden: + + Home + guides/quickstart + **GIQL** is an extended SQL dialect that allows you to declaratively express genomic interval operations. +See the :doc:`guides/quickstart` to get started. + Dialect ------- GIQL extends the SQL query language with dedicated constructs for these @@ -40,7 +48,6 @@ See the following guides to learn how to use GIQL effectively: :maxdepth: 1 :caption: Guides and Recipes - guides/quickstart guides/index recipes/index From 4d856625a01398a9448a5ca03228d1f828803078 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 10 Feb 2026 02:00:52 -0500 Subject: [PATCH 10/12] Update readme --- README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7c9c95a..b51586d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,7 @@ -# GIQL - Genomic Interval Query Language +# GIQL + +

Genomic Interval Query Language (GIQL)

+

/JEE-quel/

@@ -36,6 +39,16 @@ sql = transpile( print(sql) ``` +```sql +SELECT + * +FROM peaks +WHERE + ( + "chrom" = 'chr1' AND "start" < 2000 AND "end" > 1000 + ) +``` + Each table referenced in a GIQL query exposes a genomic "pseudo-column" that maps to separate logical chromosome, start, end, and strand columns. You can customize the column mappings. ```python From ceb9cf402da6b07d83d22fe24f5e9ff55910c16d Mon Sep 17 00:00:00 2001 From: Conrad Date: Tue, 10 Feb 2026 16:07:02 -0500 Subject: [PATCH 11/12] Drop engine guide from docs --- docs/guides/engine.rst | 195 ----------------------------------------- 1 file changed, 195 deletions(-) delete mode 100644 docs/guides/engine.rst diff --git a/docs/guides/engine.rst b/docs/guides/engine.rst deleted file mode 100644 index 71269be..0000000 --- a/docs/guides/engine.rst +++ /dev/null @@ -1,195 +0,0 @@ -Execution engines -================= - -GIQL transpiles genomic queries to SQL that can be executed on any database -backend. This guide covers backend-specific considerations and tips. - -.. contents:: - :local: - :depth: 1 - -Supported Backends ------------------- - -GIQL generates SQL that works across database systems: - -.. list-table:: - :header-rows: 1 - :widths: 20 20 60 - - * - Backend - - Status - - Best For - * - DuckDB - - Full Support - - Analytics, large datasets, in-memory processing - * - SQLite - - Full Support - - Lightweight, embedded, portable databases - * - PostgreSQL - - Planned - - Production deployments, shared databases - -Using with DuckDB ------------------ - -DuckDB is recommended for most genomic analysis use cases. It provides excellent -performance for analytical queries and handles large genomic datasets efficiently. - -.. code-block:: python - - import duckdb - from giql import transpile - - sql = transpile( - """ - SELECT * FROM features - WHERE interval INTERSECTS 'chr1:1000-2000' - """, - tables=["features"], - ) - - conn = duckdb.connect() - conn.execute("CREATE TABLE features AS SELECT * FROM read_csv('features.bed', delim='\t')") - result = conn.execute(sql).fetchdf() - -**Advantages:** - -- Fast analytical query performance -- Efficient columnar storage -- Good support for large datasets -- Rich SQL feature set -- In-memory and persistent options - -Using with SQLite ------------------ - -SQLite is a lightweight, embedded database suitable for smaller datasets or -when portability is important. - -.. code-block:: python - - import sqlite3 - from giql import transpile - - sql = transpile( - """ - SELECT * FROM features - WHERE interval INTERSECTS 'chr1:1000-2000' - """, - tables=["features"], - ) - - conn = sqlite3.connect("data.db") - cursor = conn.execute(sql) - for row in cursor: - print(row) - -**Advantages:** - -- Zero configuration -- Single-file database -- Widely compatible -- Small memory footprint - -Writing Portable Queries ------------------------- - -Query Compatibility -~~~~~~~~~~~~~~~~~~~ - -GIQL queries are portable across backends. The same GIQL query produces SQL -that works on any supported database: - -.. code-block:: python - - from giql import transpile - - query = """ - SELECT a.*, b.name AS gene - FROM variants a - JOIN genes b ON a.interval INTERSECTS b.interval - WHERE a.quality >= 30 - """ - - # Same GIQL query works for any backend - sql = transpile(query, tables=["variants", "genes"]) - -Backend-Specific Features -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Some SQL features may only be available on certain backends: - -.. 
list-table:: - :header-rows: 1 - :widths: 40 20 20 20 - - * - Feature - - DuckDB - - SQLite - - Notes - * - Window functions - - Yes - - Yes - - Full support - * - CTEs (WITH clause) - - Yes - - Yes - - Full support - * - LATERAL joins - - Yes - - Limited - - Used by NEAREST - * - STRING_AGG - - Yes - - GROUP_CONCAT - - Different function names - -Performance Comparison ----------------------- - -Backend Performance Characteristics -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. list-table:: - :header-rows: 1 - :widths: 30 35 35 - - * - Operation - - DuckDB - - SQLite - * - Large table scans - - Excellent (columnar) - - Good - * - Complex joins - - Excellent - - Good - * - Aggregations - - Excellent - - Good - * - Small queries - - Good - - Excellent - * - Memory usage - - Higher - - Lower - * - Startup time - - Faster - - Fast - -Choosing the Right Backend -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -**Choose DuckDB when:** - -- Working with large datasets (millions of features) -- Running complex analytical queries -- Performing heavy aggregations -- Memory is not constrained - -**Choose SQLite when:** - -- Working with smaller datasets -- Need maximum portability -- Memory is constrained -- Simple query patterns From 0deabf27f1b31932e2874ea73e7b724c28055839 Mon Sep 17 00:00:00 2001 From: Conrad Date: Fri, 20 Feb 2026 11:01:57 -0500 Subject: [PATCH 12/12] Create issue templates for more structured collaboration --- .github/ISSUE_TEMPLATE/bug_report.yaml | 25 +++++++++++++++++++++ .github/ISSUE_TEMPLATE/build.yaml | 25 +++++++++++++++++++++ .github/ISSUE_TEMPLATE/cicd.yaml | 25 +++++++++++++++++++++ .github/ISSUE_TEMPLATE/config.yml | 1 + .github/ISSUE_TEMPLATE/feature_request.yaml | 25 +++++++++++++++++++++ .github/ISSUE_TEMPLATE/refactor.yaml | 25 +++++++++++++++++++++ .github/ISSUE_TEMPLATE/test.yaml | 25 +++++++++++++++++++++ 7 files changed, 151 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yaml create mode 100644 .github/ISSUE_TEMPLATE/build.yaml create mode 100644 .github/ISSUE_TEMPLATE/cicd.yaml create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/ISSUE_TEMPLATE/feature_request.yaml create mode 100644 .github/ISSUE_TEMPLATE/refactor.yaml create mode 100644 .github/ISSUE_TEMPLATE/test.yaml diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml new file mode 100644 index 0000000..99dea36 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -0,0 +1,25 @@ +name: Bug Report +description: Report something that isn't working correctly. +labels: ["bug"] +body: + - type: textarea + id: summary + attributes: + label: Summary + description: What is the bug? Include steps to reproduce if applicable. + validations: + required: true + - type: textarea + id: root-cause + attributes: + label: Root cause + description: What is causing the bug? Include relevant code snippets. + validations: + required: true + - type: textarea + id: affected-code + attributes: + label: Affected code + description: Which files, modules, or components are affected? + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/build.yaml b/.github/ISSUE_TEMPLATE/build.yaml new file mode 100644 index 0000000..6b6490a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/build.yaml @@ -0,0 +1,25 @@ +name: Build +description: Propose a build system or dependency change. +labels: ["build"] +body: + - type: textarea + id: summary + attributes: + label: Summary + description: What build system or dependency change is needed? 
+ validations: + required: true + - type: textarea + id: motivation + attributes: + label: Motivation + description: Why is this change needed? + validations: + required: true + - type: textarea + id: affected-code + attributes: + label: Affected code + description: Which build files, configs, or dependencies are affected? + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/cicd.yaml b/.github/ISSUE_TEMPLATE/cicd.yaml new file mode 100644 index 0000000..277c550 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/cicd.yaml @@ -0,0 +1,25 @@ +name: CI/CD +description: Propose a CI/CD pipeline change. +labels: ["cicd"] +body: + - type: textarea + id: summary + attributes: + label: Summary + description: What CI/CD change is needed? + validations: + required: true + - type: textarea + id: motivation + attributes: + label: Motivation + description: Why is this change needed? What does it improve? + validations: + required: true + - type: textarea + id: affected-code + attributes: + label: Affected code + description: Which workflows, pipelines, or config files are affected? + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..3ba13e0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false diff --git a/.github/ISSUE_TEMPLATE/feature_request.yaml b/.github/ISSUE_TEMPLATE/feature_request.yaml new file mode 100644 index 0000000..d444640 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yaml @@ -0,0 +1,25 @@ +name: Feature Request +description: Propose a new feature or capability. +labels: ["feature"] +body: + - type: textarea + id: summary + attributes: + label: Summary + description: What is the feature? Describe the desired behavior. + validations: + required: true + - type: textarea + id: motivation + attributes: + label: Motivation + description: Why is this feature needed? What problem does it solve? + validations: + required: true + - type: textarea + id: affected-code + attributes: + label: Affected code + description: Which files, modules, or components would be affected? + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/refactor.yaml b/.github/ISSUE_TEMPLATE/refactor.yaml new file mode 100644 index 0000000..3ab2bfc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/refactor.yaml @@ -0,0 +1,25 @@ +name: Refactor +description: Propose a code restructuring without behavior change. +labels: ["refactor"] +body: + - type: textarea + id: summary + attributes: + label: Summary + description: What should be restructured and what does the end state look like? + validations: + required: true + - type: textarea + id: motivation + attributes: + label: Motivation + description: Why is this restructuring needed? + validations: + required: true + - type: textarea + id: affected-code + attributes: + label: Affected code + description: Which files, modules, or components are affected? + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/test.yaml b/.github/ISSUE_TEMPLATE/test.yaml new file mode 100644 index 0000000..c417347 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/test.yaml @@ -0,0 +1,25 @@ +name: Test +description: Add or improve test coverage or test infrastructure. +labels: ["test"] +body: + - type: textarea + id: summary + attributes: + label: Summary + description: What needs to be tested or what test infrastructure is needed? 
+ validations: + required: true + - type: textarea + id: motivation + attributes: + label: Motivation + description: Why is this test work needed? What gap does it fill? + validations: + required: true + - type: textarea + id: affected-code + attributes: + label: Affected code + description: Which files, modules, or components are affected? + validations: + required: false