From 294dff50c990667e1c0d57e5fff09c131fd0fd63 Mon Sep 17 00:00:00 2001 From: buildvoc10 <76884997+buildvoc10@users.noreply.github.com> Date: Fri, 19 Dec 2025 17:46:29 +0000 Subject: [PATCH 1/4] Stabilize docling graph builder and UI --- apps/docling_graph/assets/theme.css | 58 +- apps/docling_graph/assets/view_options.css | 309 ++---- apps/docling_graph/graph_builder.py | 386 +++++-- apps/docling_graph/graph_styles.py | 166 +++ apps/docling_graph/main.py | 1071 +++++++++++++------- 5 files changed, 1306 insertions(+), 684 deletions(-) create mode 100644 apps/docling_graph/graph_styles.py diff --git a/apps/docling_graph/assets/theme.css b/apps/docling_graph/assets/theme.css index 157d9dc..2ff3f2f 100644 --- a/apps/docling_graph/assets/theme.css +++ b/apps/docling_graph/assets/theme.css @@ -1,38 +1,30 @@ -html, body { - background: #121212; - color: #eaeaea; -} - -.app-root { - display: flex; - background: #121212; -} - -.sidebar { - width: 340px; - background: #1b1b1b; - padding: 12px; - border-right: 1px solid #333; -} - -label { - color: #ccc; -} - -.Select-control, -.Select-menu-outer { - background: #222; - color: #fff; +:root { + --gc-bg: #0b0f17; + --gc-panel-bg: #0f172a; + --gc-panel-border: #111827; + --gc-text: #e5e7eb; + --gc-muted: #94a3b8; + --gc-accent: #7c3aed; + --gc-graph-bg: #0b0f17; + --gc-button-bg: #111827; + --gc-button-border: #1f2937; } -.rc-slider-track { - background-color: #9b4dff; +@media (prefers-color-scheme: light) { + :root { + --gc-bg: #f8fafc; + --gc-panel-bg: #ffffff; + --gc-panel-border: #e2e8f0; + --gc-text: #0f172a; + --gc-muted: #475569; + --gc-accent: #7c3aed; + --gc-graph-bg: #ffffff; + --gc-button-bg: #f1f5f9; + --gc-button-border: #cbd5f5; + } } -.rc-slider-handle { - border-color: #9b4dff; -} - -.rc-slider-rail { - background-color: #333; +html, body { + background: var(--gc-bg); + color: var(--gc-text); } diff --git a/apps/docling_graph/assets/view_options.css b/apps/docling_graph/assets/view_options.css index d068d5f..3bec418 100644 --- a/apps/docling_graph/assets/view_options.css +++ b/apps/docling_graph/assets/view_options.css @@ -1,83 +1,73 @@ -/* view_options.css - Adds "skeleton-lite" grid to match dash-cytoscape demo layout - while keeping your dark GraphCommons-style panel. -*/ - -/* --------------------------- - Skeleton-lite grid (demo compatibility) - The demo uses:
/ "four columns" / "row" -----------------------------*/ -.row { - display: flex; - flex-wrap: wrap; - align-items: stretch; - width: 100%; - gap: 0; +/* Layout */ +.docling-app { + background: var(--gc-bg); + color: var(--gc-text); + min-height: 100vh; + padding: 12px; } -.columns { - box-sizing: border-box; - padding: 0; - min-width: 0; +.app-grid { + display: grid; + grid-template-columns: 320px 1fr 340px; + gap: 12px; } -/* 12-col grid: 8/12 and 4/12 */ -.eight.columns { flex: 0 0 66.6666%; max-width: 66.6666%; } -.four.columns { flex: 0 0 33.3333%; max-width: 33.3333%; } - -/* Responsive: stack on small screens */ -@media (max-width: 980px) { - .eight.columns, .four.columns { flex: 0 0 100%; max-width: 100%; } +@media (max-width: 1200px) { + .app-grid { + grid-template-columns: 1fr; + } } -/* Add spacing so the right panel isn’t flush */ -.eight.columns { padding-right: 12px; } -.four.columns { padding-left: 12px; } +.panel-left, +.panel-right { + background: var(--gc-panel-bg); + border: 1px solid var(--gc-panel-border); + border-radius: 12px; + padding: 14px; + box-shadow: 0 10px 20px rgba(0, 0, 0, 0.2); +} -/* --------------------------- - Your existing dark UI -----------------------------*/ -.app { - background: #0b0f17; - color: #e5e7eb; - min-height: 100vh; - padding: 10px; +.panel-main { + background: var(--gc-panel-bg); + border: 1px solid var(--gc-panel-border); + border-radius: 12px; + padding: 8px; } -.topbar { - display: flex; - justify-content: space-between; - align-items: center; +.panel-title { + font-size: 16px; + font-weight: 700; margin-bottom: 10px; } -.title { - font-size: 22px; - font-weight: 700; +.control-section { + margin-top: 12px; + padding-top: 12px; + border-top: 1px solid var(--gc-panel-border); } -.controls { - display: flex; - gap: 14px; - flex-wrap: wrap; - align-items: center; - margin-bottom: 10px; +.control-label { + font-size: 12px; + letter-spacing: 0.3px; + text-transform: uppercase; + color: var(--gc-muted); + margin-bottom: 6px; } .btn { - background: #111827; - color: #e5e7eb; - border: 1px solid #1f2937; + background: var(--gc-button-bg); + color: var(--gc-text); + border: 1px solid var(--gc-button-border); padding: 8px 10px; border-radius: 10px; cursor: pointer; + margin-right: 6px; + margin-top: 6px; } -.btn:hover { border-color: #334155; } - .btn-primary { - background: #7c3aed; /* purple accent to match demo vibe */ - color: white; + background: var(--gc-accent); + color: #fff; border: 1px solid #6d28d9; padding: 10px 12px; border-radius: 10px; @@ -85,204 +75,113 @@ width: 100%; } -.btn-close { - background: transparent; - border: 1px solid #1f2937; - color: #e5e7eb; - border-radius: 10px; - padding: 4px 10px; - cursor: pointer; -} - .btn-small { - background: #111827; - color: #e5e7eb; - border: 1px solid #1f2937; + background: var(--gc-button-bg); + color: var(--gc-text); + border: 1px solid var(--gc-button-border); padding: 4px 10px; border-radius: 8px; cursor: pointer; - margin-left: 6px; -} - -.panel { - position: fixed; - top: 0; - right: 0; - width: 360px; - height: 100vh; - background: #0f172a; - border-left: 1px solid #1f2937; - box-shadow: -10px 0 40px rgba(0,0,0,0.35); - padding: 14px 14px 18px; - z-index: 9999; - overflow-y: auto; + margin-top: 6px; } -.hidden { display: none; } - -.panel-header { +.inspector-panel { display: flex; - justify-content: space-between; - align-items: center; - margin-bottom: 10px; + flex-direction: column; + gap: 10px; } -.panel-title { +.inspector-title { font-size: 18px; font-weight: 700; } -.panel-section { - border-top: 1px solid #1f2937; - padding-top: 10px; - margin-top: 10px; -} - -.section-title { - font-size: 12px; - opacity: 0.85; - margin-bottom: 6px; - font-weight: 600; -} - -.muted { - opacity: 0.85; +.inspector-subtitle { font-size: 12px; + text-transform: uppercase; + color: var(--gc-muted); } -.panel-footer { - margin-top: 14px; - border-top: 1px solid #1f2937; - padding-top: 12px; -} - -/* --------------------------- - Control panel (dark, inverted demo) -----------------------------*/ -.control-tabs { - background: #0b0f17; - border-radius: 12px; - box-shadow: inset 0 0 0 1px #111827, 0 10px 30px rgba(0, 0, 0, 0.35); - overflow: hidden; -} - -.control-tab { - background: #0b0f17; - color: #cbd5e1; - border: none; - padding: 0; -} - -.control-tab--selected { - background: #0f172a !important; - border-bottom: 2px solid #7c3aed !important; - color: #e5e7eb !important; -} - -.control-panel { - background: linear-gradient(135deg, #0f172a, #0b0f17); - border: 1px solid #111827; - border-radius: 12px; - padding: 14px 14px 18px; - color: #e5e7eb; - min-height: 80vh; - box-shadow: 0 14px 30px rgba(0, 0, 0, 0.45); -} - -.control-panel--secondary { - min-height: unset; -} - -.control-panel__header { - display: flex; - align-items: center; - justify-content: space-between; - margin-bottom: 10px; +.inspector-description { + font-size: 13px; + color: var(--gc-text); + white-space: pre-wrap; } -.control-title { - font-size: 15px; - font-weight: 700; +.inspector-group { + border-top: 1px solid var(--gc-panel-border); + padding-top: 8px; } -.pill { - border-radius: 999px; - padding: 3px 10px; - font-size: 11px; - letter-spacing: 0.3px; - text-transform: uppercase; - background: #111827; - border: 1px solid #1f2937; - color: #cbd5e1; +.inspector-group-title { + font-size: 12px; + color: var(--gc-muted); + margin-bottom: 4px; } -.pill--invert { - background: #c7d2fe; - color: #0b0f17; - border-color: #a5b4fc; +.inspector-item { + font-size: 12px; + padding: 2px 0; } -.control-section { +.debug-panel { margin-top: 12px; - padding-top: 12px; - border-top: 1px solid #111827; -} - -.control-section--tight { - padding-top: 6px; + background: var(--gc-panel-bg); + border: 1px solid var(--gc-panel-border); + border-radius: 12px; + padding: 12px; } -.control-label { - font-size: 12px; - letter-spacing: 0.3px; - text-transform: uppercase; - color: #94a3b8; - margin-bottom: 6px; +.debug-output { + max-height: 200px; + overflow-y: auto; + background: rgba(15, 23, 42, 0.4); + padding: 8px; + border-radius: 8px; + color: var(--gc-text); } -.control-subtext { - margin-top: 8px; +.muted { + color: var(--gc-muted); font-size: 12px; - color: #cbd5e1; - opacity: 0.85; } .Select-control, .Select-menu-outer { - background: #0f172a; - border-color: #1f2937; - color: #e5e7eb; + background: var(--gc-panel-bg); + border-color: var(--gc-panel-border); + color: var(--gc-text); } -.control-panel .Select-value-label { - color: #e5e7eb !important; +.control-panel .Select-value-label, +.panel-left .Select-value-label { + color: var(--gc-text) !important; } -.control-panel .rc-slider-track { - background-color: #7c3aed; +.control-panel .rc-slider-track, +.panel-left .rc-slider-track { + background-color: var(--gc-accent); } -.control-panel .rc-slider-handle { - border-color: #7c3aed; - background-color: #0f172a; +.control-panel .rc-slider-handle, +.panel-left .rc-slider-handle { + border-color: var(--gc-accent); + background-color: var(--gc-panel-bg); box-shadow: 0 0 0 2px rgba(124, 58, 237, 0.2); } -.control-panel .rc-slider-rail { - background-color: #1f2937; -} - -.control-panel .rc-slider-mark-text { - color: #94a3b8; +.control-panel .rc-slider-rail, +.panel-left .rc-slider-rail { + background-color: var(--gc-panel-border); } .control-radio, .control-checkbox { - accent-color: #7c3aed; + accent-color: var(--gc-accent); margin-right: 8px; } .control-radio__label, .control-checkbox__label { - color: #e5e7eb; + color: var(--gc-text); } diff --git a/apps/docling_graph/graph_builder.py b/apps/docling_graph/graph_builder.py index 502eff0..b60d7d8 100644 --- a/apps/docling_graph/graph_builder.py +++ b/apps/docling_graph/graph_builder.py @@ -1,14 +1,14 @@ from __future__ import annotations -import hashlib import json +import math import os import tempfile import unittest -from collections.abc import Iterable +from collections import defaultdict from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Optional, Tuple DOCLING_JSON_ROOT = "/home/hp/docling-ws/data/docling" @@ -28,6 +28,16 @@ "decorative", } +EDGE_CONTAINS = "CONTAINS" +EDGE_HAS_PAGE = "HAS_PAGE" +EDGE_HAS_BODY = "HAS_BODY" +EDGE_NEXT = "NEXT" +EDGE_ON_PAGE = "ON_PAGE" + +NODE_DOCUMENT = "Document" +NODE_PAGE = "Page" +NODE_TEXT = "Text" + # ----------------------------- # Graph contract @@ -41,8 +51,19 @@ class GraphPayload: # ----------------------------- # Helpers # ----------------------------- -def _nid(value: str) -> str: - return "n_" + hashlib.md5(value.encode("utf-8")).hexdigest()[:12] + +def _node_id(node_type: str, name: str) -> str: + return f"{node_type}::{name}" + + +def _edge_id( + from_type: str, + from_name: str, + edge_type: str, + to_type: str, + to_name: str, +) -> str: + return f"{from_type}::{from_name}::{edge_type}::{to_type}::{to_name}" def _load_json(path: str) -> Dict[str, Any]: @@ -64,17 +85,6 @@ def _short(text: str, limit: int = 120) -> str: return t[:limit] + ("…" if len(t) > limit else "") -def list_docling_files(json_root: Optional[str] = None) -> List[str]: - results: List[str] = [] - search_root = json_root or DOCLING_JSON_ROOT - - for root, _, files in os.walk(search_root): - for name in files: - if name.lower().endswith(".json"): - results.append(os.path.join(root, name)) - return sorted(results) - - def _is_noise_text(label: str, text: str) -> bool: lbl = (label or "").strip().lower() t = (text or "").strip() @@ -88,9 +98,91 @@ def _is_noise_text(label: str, text: str) -> bool: return False +def _clamp(value: float, min_value: float, max_value: float) -> float: + return max(min_value, min(value, max_value)) + + +def _resolve_pointer(doc: Any, pointer: str) -> Any: + current = doc + for part in pointer.split("/"): + part = part.replace("~1", "/").replace("~0", "~") + if isinstance(current, list): + try: + idx = int(part) + except ValueError: + return None + if idx >= len(current): + return None + current = current[idx] + elif isinstance(current, dict): + if part not in current: + return None + current = current[part] + else: + return None + return current + + +def resolve_refs(obj: Any, root: Any, seen: Optional[Dict[int, Any]] = None) -> Any: + if seen is None: + seen = {} + + obj_id = id(obj) + if obj_id in seen: + return seen[obj_id] + + if isinstance(obj, dict): + if "$ref" in obj and isinstance(obj["$ref"], str): + ref = obj["$ref"] + if ref.startswith("#/"): + resolved = _resolve_pointer(root, ref[2:]) + if resolved is not None: + resolved_value = resolve_refs(resolved, root, seen) + if len(obj) == 1: + return resolved_value + merged = { + **(resolved_value if isinstance(resolved_value, dict) else {}), + **{k: v for k, v in obj.items() if k != "$ref"}, + } + return resolve_refs(merged, root, seen) + resolved_dict = {} + seen[obj_id] = resolved_dict + for key, value in obj.items(): + resolved_dict[key] = resolve_refs(value, root, seen) + return resolved_dict + + if isinstance(obj, list): + resolved_list: List[Any] = [] + seen[obj_id] = resolved_list + for item in obj: + resolved_list.append(resolve_refs(item, root, seen)) + return resolved_list + + return obj + + +def list_docling_files(json_root: Optional[str] = None) -> List[str]: + results: List[str] = [] + search_root = json_root or DOCLING_JSON_ROOT + + for root, _, files in os.walk(search_root): + for name in files: + if name.lower().endswith(".json"): + results.append(os.path.join(root, name)) + return sorted(results) + + # ----------------------------- # Core builder # ----------------------------- + +def _collect_text_items(doc: Dict[str, Any]) -> List[Dict[str, Any]]: + texts = doc.get("texts") + if isinstance(texts, list): + return [t for t in texts if isinstance(t, dict)] + return [] + + def _collect_pages_from_texts(texts: Iterable[Any]) -> Dict[int, List[Dict[str, Any]]]: pages: Dict[int, List[Dict[str, Any]]] = {} @@ -116,110 +208,187 @@ def _collect_pages_from_texts(texts: Iterable[Any]) -> Dict[int, List[Dict[str, return pages -def build_graph_from_docling_json(path: str) -> GraphPayload: - """ - Graph shape: - DOCUMENT → PAGE → TEXT +def _bucket_text_length(text: str) -> int: + if not text: + return 1 + return int(_clamp(math.ceil(len(text) / 200), 1, 10)) + + +def _compute_node_weights(nodes: List[Dict[str, Any]], edges: List[Dict[str, Any]]) -> None: + outgoing = defaultdict(list) + incoming = defaultdict(list) + + for edge in edges: + data = edge.get("data", {}) + source = data.get("source") + target = data.get("target") + if source: + outgoing[source].append(edge) + if target: + incoming[target].append(edge) + + for node in nodes: + data = node.get("data", {}) + node_id = data.get("id") + description = data.get("description", "") or "" + text_score = min(10, math.ceil(len(description) / 200)) if description else 0 + children_score = len(outgoing.get(node_id, [])) + degree_score = len(outgoing.get(node_id, [])) + len(incoming.get(node_id, [])) + weight = 1 + children_score + text_score + (degree_score * 0.5) + data["weight"] = round(weight, 2) + data["size"] = round(_clamp(20 + weight * 3, 20, 80), 2) + + +def _compute_edge_weights(edges: List[Dict[str, Any]], node_lookup: Dict[str, Dict[str, Any]]) -> None: + for edge in edges: + data = edge.get("data", {}) + edge_type = data.get("type") + weight = 1 + if edge_type == EDGE_CONTAINS: + target = node_lookup.get(data.get("target"), {}) + description = target.get("data", {}).get("description", "") + weight = _bucket_text_length(description) + data["weight"] = weight + data["width"] = round(_clamp(1 + weight * 0.7, 1, 8), 2) + - Nodes and edges are returned separately - for Cytoscape stability and incremental expansion. - """ +def build_graph_from_docling_json(path: str) -> GraphPayload: doc = _load_json(path) + resolved_doc = resolve_refs(doc, doc) - nodes: List[Dict[str, Any]] = [] - edges: List[Dict[str, Any]] = [] + nodes_by_id: Dict[str, Dict[str, Any]] = {} + edges_by_id: Dict[str, Dict[str, Any]] = {} - doc_id = _nid(path) doc_name = os.path.basename(path) + doc_node_id = _node_id(NODE_DOCUMENT, doc_name) + nodes_by_id[doc_node_id] = { + "data": { + "id": doc_node_id, + "label": doc_name, + "type": NODE_DOCUMENT, + "description": resolved_doc.get("title") or doc_name, + "weight": 1, + "size": 20, + }, + "classes": "node-type-document", + } + + texts = _collect_text_items(resolved_doc) + pages = _collect_pages_from_texts(texts) + + page_ids: Dict[int, str] = {} - # DOCUMENT node - nodes.append( - { + for page_no in sorted(pages.keys()): + page_name = f"Page {page_no}" + page_id = _node_id(NODE_PAGE, page_name) + page_ids[page_no] = page_id + nodes_by_id[page_id] = { "data": { - "id": doc_id, - "type": "document", - "label_short": f"DOCUMENT: {doc_name}", - "label_full": f"DOCUMENT: {doc_name}", - "label": f"DOCUMENT: {doc_name}", + "id": page_id, + "label": page_name, + "type": NODE_PAGE, + "description": f"Page {page_no} of {doc_name}", + "weight": 1, + "size": 20, + "page": page_no, }, - "classes": "document", + "classes": "node-type-page", } - ) - pages = _collect_pages_from_texts(doc.get("texts")) + edge_id = _edge_id(NODE_DOCUMENT, doc_name, EDGE_HAS_PAGE, NODE_PAGE, page_name) + edges_by_id[edge_id] = { + "data": { + "id": edge_id, + "source": doc_node_id, + "target": page_id, + "type": EDGE_HAS_PAGE, + }, + "classes": "edge-type-has-page", + } + + sorted_pages = sorted(page_ids.items()) + for index, (page_no, page_id) in enumerate(sorted_pages[:-1]): + next_page_no, next_page_id = sorted_pages[index + 1] + from_name = f"Page {page_no}" + to_name = f"Page {next_page_no}" + edge_id = _edge_id(NODE_PAGE, from_name, EDGE_NEXT, NODE_PAGE, to_name) + edges_by_id[edge_id] = { + "data": { + "id": edge_id, + "source": page_id, + "target": next_page_id, + "type": EDGE_NEXT, + }, + "classes": "edge-type-next", + } - # PAGE + TEXT nodes for page_no in sorted(pages.keys()): - page_id = _nid(f"{path}::page::{page_no}") + page_id = page_ids[page_no] + page_name = f"Page {page_no}" + page_texts = pages[page_no][:MAX_TEXTS_PER_PAGE] + + for idx, t in enumerate(page_texts, start=1): + raw_text = str(t.get("text") or "") + label = str(t.get("label") or "text").strip() or "text" + name = f"p{page_no}-{idx}: {_short(raw_text, 80)}" + text_id = _node_id(NODE_TEXT, name) - nodes.append( - { + nodes_by_id[text_id] = { "data": { - "id": page_id, - "type": "chunk", + "id": text_id, + "label": name, + "type": NODE_TEXT, + "description": raw_text, + "weight": 1, + "size": 20, "page": page_no, - "label_short": f"PAGE {page_no}", - "label_full": f"PAGE {page_no}", - "label": f"PAGE {page_no}", + "label_type": label, }, - "classes": "section", + "classes": "node-type-text", } - ) - edges.append( - { + contains_id = _edge_id(NODE_PAGE, page_name, EDGE_CONTAINS, NODE_TEXT, name) + edges_by_id[contains_id] = { "data": { - "id": _nid(f"{doc_id}__{page_id}"), - "source": doc_id, - "target": page_id, - "rel": "hier", + "id": contains_id, + "source": page_id, + "target": text_id, + "type": EDGE_CONTAINS, }, - "classes": "hier", + "classes": "edge-type-contains", } - ) - - page_texts = pages[page_no][:MAX_TEXTS_PER_PAGE] - for t in page_texts: - ref = t.get("self_ref") or t.get("id") or repr(t) - text_id = _nid(ref) + on_page_id = _edge_id(NODE_TEXT, name, EDGE_ON_PAGE, NODE_PAGE, page_name) + edges_by_id[on_page_id] = { + "data": { + "id": on_page_id, + "source": text_id, + "target": page_id, + "type": EDGE_ON_PAGE, + }, + "classes": "edge-type-on-page", + } - raw_text = str(t.get("text") or "") - dtype = str(t.get("label") or "text").upper() - content_layer = t.get("content_layer") - _, bbox = _first_prov(t) + has_body_id = _edge_id(NODE_DOCUMENT, doc_name, EDGE_HAS_BODY, NODE_TEXT, name) + edges_by_id[has_body_id] = { + "data": { + "id": has_body_id, + "source": doc_node_id, + "target": text_id, + "type": EDGE_HAS_BODY, + }, + "classes": "edge-type-has-body", + } - label_full = f"{dtype}: {raw_text}" - label_short = f"{dtype}: {_short(raw_text, 140)}" + nodes = list(nodes_by_id.values()) + edges = list(edges_by_id.values()) + node_lookup = {node["data"]["id"]: node for node in nodes} - nodes.append( - { - "data": { - "id": text_id, - "type": "text", - "content_layer": content_layer, - "text": raw_text, - "page": page_no, - "bbox": bbox, - "label_short": label_short, - "label_full": label_full, - "label": label_full, - }, - "classes": "item", - } - ) + _compute_edge_weights(edges, node_lookup) + _compute_node_weights(nodes, edges) - edges.append( - { - "data": { - "id": _nid(f"{page_id}__{text_id}"), - "source": page_id, - "target": text_id, - "rel": "hier", - }, - "classes": "hier", - } - ) + nodes = sorted(nodes, key=lambda n: n.get("data", {}).get("id", "")) + edges = sorted(edges, key=lambda e: e.get("data", {}).get("id", "")) return GraphPayload(nodes=nodes, edges=edges) @@ -252,13 +421,14 @@ def test_collect_pages_from_texts_filters_invalid_entries(self): def test_build_graph_from_docling_json_emits_document_page_and_text(self): doc = { + "title": "Example", "texts": [ { "label": "body", "text": "paragraph " * 10, "prov": [{"page_no": 1, "bbox": [0, 0, 10, 10]}], } - ] + ], } with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp: @@ -270,19 +440,16 @@ def test_build_graph_from_docling_json_emits_document_page_and_text(self): finally: Path(tmp_path).unlink(missing_ok=True) - self.assertEqual(len(payload.nodes), 3) - self.assertEqual(len(payload.edges), 2) - - node_ids = {n["data"]["id"] for n in payload.nodes} - edge_pairs = {(e["data"]["source"], e["data"]["target"]) for e in payload.edges} + node_types = {n["data"]["type"] for n in payload.nodes} + edge_types = {e["data"]["type"] for e in payload.edges} - doc_id = _nid(tmp_path) - page_id = _nid(f"{tmp_path}::page::1") - text_id = next(n["data"]["id"] for n in payload.nodes if n["data"]["type"] == "text") - - self.assertIn(doc_id, node_ids) - self.assertIn(page_id, node_ids) - self.assertEqual(edge_pairs, {(doc_id, page_id), (page_id, text_id)}) + self.assertIn(NODE_DOCUMENT, node_types) + self.assertIn(NODE_PAGE, node_types) + self.assertIn(NODE_TEXT, node_types) + self.assertIn(EDGE_CONTAINS, edge_types) + self.assertIn(EDGE_HAS_PAGE, edge_types) + self.assertIn(EDGE_HAS_BODY, edge_types) + self.assertIn(EDGE_ON_PAGE, edge_types) def test_list_docling_files_accepts_custom_root(self): with tempfile.TemporaryDirectory() as tmpdir: @@ -298,6 +465,11 @@ def test_list_docling_files_accepts_custom_root(self): self.assertEqual(results, sorted([str(first), str(second)])) + def test_resolve_refs_inlines_json_pointer(self): + doc = {"texts": [{"label": "body", "text": "abc"}], "ref": {"$ref": "#/texts/0"}} + resolved = resolve_refs(doc, doc) + self.assertEqual(resolved["ref"], {"label": "body", "text": "abc"}) + if __name__ == "__main__": unittest.main() diff --git a/apps/docling_graph/graph_styles.py b/apps/docling_graph/graph_styles.py new file mode 100644 index 0000000..e079101 --- /dev/null +++ b/apps/docling_graph/graph_styles.py @@ -0,0 +1,166 @@ +from __future__ import annotations + +import hashlib +from typing import Any, Dict, Iterable, List, Tuple + + +NODE_PALETTE = [ + "#38BDF8", + "#A78BFA", + "#F472B6", + "#34D399", + "#FBBF24", + "#60A5FA", + "#FB7185", + "#4ADE80", + "#F59E0B", + "#22D3EE", +] + +EDGE_PALETTE = [ + "#94A3B8", + "#A78BFA", + "#F472B6", + "#34D399", + "#FBBF24", + "#60A5FA", +] + +THEMES = { + "dark": { + "bg": "#0B0F17", + "panel": "#0F172A", + "text": "#E5E7EB", + "muted": "#94A3B8", + "outline": "#0B1220", + "edge": "#64748B", + "edge_label": "#CBD5E1", + "selection": "#FBBF24", + "dim": 0.15, + }, + "light": { + "bg": "#F8FAFC", + "panel": "#FFFFFF", + "text": "#0F172A", + "muted": "#475569", + "outline": "#E2E8F0", + "edge": "#64748B", + "edge_label": "#334155", + "selection": "#F59E0B", + "dim": 0.2, + }, +} + + +def _slug(value: str) -> str: + return "".join(ch if ch.isalnum() else "-" for ch in value.strip().lower()).strip("-") + + +def _color_for_type(type_name: str, palette: List[str]) -> str: + digest = hashlib.md5(type_name.encode("utf-8")).hexdigest() + index = int(digest[:8], 16) % len(palette) + return palette[index] + + +def apply_theme_to_elements( + nodes: Iterable[Dict[str, Any]], + edges: Iterable[Dict[str, Any]], + theme: str, +) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + themed_nodes = [] + themed_edges = [] + node_color_cache: Dict[str, str] = {} + edge_color_cache: Dict[str, str] = {} + + for node in nodes: + data = node.get("data", {}) + node_type = data.get("type", "") + if node_type not in node_color_cache: + node_color_cache[node_type] = _color_for_type(node_type, NODE_PALETTE) + color = node_color_cache[node_type] + themed_node = {**node, "data": {**data, "color": color}} + themed_node["classes"] = f"node-type-{_slug(node_type)}" + themed_nodes.append(themed_node) + + for edge in edges: + data = edge.get("data", {}) + edge_type = data.get("type", "") + if edge_type not in edge_color_cache: + edge_color_cache[edge_type] = _color_for_type(edge_type, EDGE_PALETTE) + color = edge_color_cache[edge_type] + themed_edge = {**edge, "data": {**data, "color": color}} + themed_edge["classes"] = f"edge-type-{_slug(edge_type)}" + themed_edges.append(themed_edge) + + return themed_nodes, themed_edges + + +def base_stylesheet( + theme: str, + scale_node_size: bool, + scale_edge_width: bool, + show_edge_labels: bool, + show_arrows: bool, +) -> List[Dict[str, Any]]: + tokens = THEMES.get(theme or "", THEMES["dark"]) + + node_size = "data(size)" if scale_node_size else "32px" + edge_width = "data(width)" if scale_edge_width else "2px" + arrow_shape = "triangle" if show_arrows else "none" + edge_label = "data(type)" if show_edge_labels else "" + + return [ + { + "selector": "node", + "style": { + "label": "data(label)", + "font-size": "10px", + "text-wrap": "wrap", + "text-max-width": "480px", + "color": tokens["text"], + "text-outline-width": 1, + "text-outline-color": tokens["outline"], + "width": node_size, + "height": node_size, + "background-color": "data(color)", + "z-index": 9999, + }, + }, + { + "selector": "edge", + "style": { + "curve-style": "bezier", + "line-color": "data(color)", + "target-arrow-color": "data(color)", + "target-arrow-shape": arrow_shape, + "arrow-scale": 0.8, + "opacity": 0.65, + "label": edge_label, + "font-size": "9px", + "color": tokens["edge_label"], + "width": edge_width, + "z-index": 5000, + }, + }, + { + "selector": ":selected", + "style": { + "border-width": 3, + "border-color": tokens["selection"], + }, + }, + { + "selector": ".highlight", + "style": { + "border-width": 3, + "border-color": tokens["selection"], + "opacity": 1, + }, + }, + { + "selector": ".dimmed", + "style": { + "opacity": tokens["dim"], + }, + }, + ] diff --git a/apps/docling_graph/main.py b/apps/docling_graph/main.py index b3dcfce..df282de 100644 --- a/apps/docling_graph/main.py +++ b/apps/docling_graph/main.py @@ -1,18 +1,21 @@ from __future__ import annotations -import os import json -from dash import Dash, html, dcc, Input, Output, State, no_update +import logging +import os +from typing import Any, Dict, Iterable, List, Tuple + +from dash import ALL, Dash, Input, Output, State, dcc, html, no_update, ctx import dash_cytoscape as cyto -from .graph_builder import ( - build_graph_from_docling_json, - list_docling_files, - GraphPayload, -) +from .graph_builder import build_graph_from_docling_json, list_docling_files +from .graph_styles import apply_theme_to_elements, base_stylesheet APP_DIR = os.path.dirname(os.path.abspath(__file__)) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + # Enable stable extra layouts try: cyto.load_extra_layouts() @@ -23,22 +26,328 @@ # ------------------------------------------------- # Helpers # ------------------------------------------------- -def to_cytoscape_elements(graph: GraphPayload): - return graph.nodes + graph.edges +def safe_base_stylesheet( + theme: str, + scale_node_size: bool, + scale_edge_width: bool, + show_edge_labels: bool, + show_arrows: bool, +) -> List[Dict[str, Any]]: + try: + return base_stylesheet( + theme, + scale_node_size, + scale_edge_width, + show_edge_labels, + show_arrows, + ) + except Exception: + logger.exception("Failed to build stylesheet") + return [] + + +def _filter_graph( + graph: Dict[str, Any], + node_types: Iterable[str], + edge_types: Iterable[str], + hide_page_nodes: bool, + hide_isolated_nodes: bool, + min_node_weight: float, + min_edge_weight: float, + keep_context_nodes: bool, +) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + nodes = graph.get("nodes", []) if graph else [] + edges = graph.get("edges", []) if graph else [] + + node_type_filter = {t for t in (node_types or [])} + edge_type_filter = {t for t in (edge_types or [])} + + def node_passes(node: Dict[str, Any]) -> bool: + data = node.get("data", {}) + node_type = data.get("type") + weight = data.get("weight", 0) + if hide_page_nodes and node_type == "Page": + return False + if node_type_filter and node_type not in node_type_filter: + return False + return weight >= (min_node_weight or 0) + + def edge_passes(edge: Dict[str, Any]) -> bool: + data = edge.get("data", {}) + edge_type = data.get("type") + weight = data.get("weight", 0) + if edge_type_filter and edge_type not in edge_type_filter: + return False + return weight >= (min_edge_weight or 0) + + candidate_nodes = [node for node in nodes if node_passes(node)] + candidate_edges = [edge for edge in edges if edge_passes(edge)] + + nodes_by_id = {node["data"]["id"]: node for node in nodes} + filtered_node_ids = {node["data"]["id"] for node in candidate_nodes} + + edge_node_ids = { + node_id + for edge in candidate_edges + for node_id in (edge["data"].get("source"), edge["data"].get("target")) + if node_id + } + + if keep_context_nodes: + context_node_ids = set() + for edge in candidate_edges: + source = edge["data"].get("source") + target = edge["data"].get("target") + if source in filtered_node_ids or target in filtered_node_ids: + if source: + context_node_ids.add(source) + if target: + context_node_ids.add(target) + filtered_node_ids |= context_node_ids + + filtered_node_ids |= edge_node_ids + + filtered_nodes = [ + nodes_by_id[node_id] + for node_id in sorted(filtered_node_ids) + if node_id in nodes_by_id and node_passes(nodes_by_id[node_id]) + ] -def pages_from_nodes(nodes): - return sorted( - { - n.get("data", {}).get("page") - for n in nodes - if n.get("data", {}).get("page") is not None - } + filtered_edges = [ + edge + for edge in candidate_edges + if edge["data"].get("source") in filtered_node_ids + and edge["data"].get("target") in filtered_node_ids + ] + + if hide_isolated_nodes: + connected = set() + for edge in filtered_edges: + connected.add(edge["data"].get("source")) + connected.add(edge["data"].get("target")) + filtered_nodes = [node for node in filtered_nodes if node["data"]["id"] in connected] + + return filtered_nodes, filtered_edges + + +def _apply_highlight( + nodes: List[Dict[str, Any]], + edges: List[Dict[str, Any]], + highlight_ids: Iterable[str], +) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + highlight_set = {hid for hid in (highlight_ids or [])} + if not highlight_set: + return nodes, edges + + highlighted_nodes = [] + for node in nodes: + node_id = node.get("data", {}).get("id") + classes = node.get("classes", "") + if node_id in highlight_set: + classes = f"{classes} highlight".strip() + else: + classes = f"{classes} dimmed".strip() + highlighted_nodes.append({**node, "classes": classes}) + + highlighted_edges = [] + for edge in edges: + data = edge.get("data", {}) + classes = edge.get("classes", "") + if data.get("source") in highlight_set or data.get("target") in highlight_set: + classes = f"{classes} highlight".strip() + else: + classes = f"{classes} dimmed".strip() + highlighted_edges.append({**edge, "classes": classes}) + + return highlighted_nodes, highlighted_edges + + +def _group_connections( + edges: List[Dict[str, Any]], + node_id: str, +) -> Dict[str, List[str]]: + groups: Dict[str, List[str]] = {} + for edge in edges: + data = edge.get("data", {}) + edge_type = data.get("type") + source = data.get("source") + target = data.get("target") + if source == node_id: + key = f"Outgoing::{edge_type}" + groups.setdefault(key, []).append(target) + elif target == node_id: + key = f"Incoming::{edge_type}" + groups.setdefault(key, []).append(source) + return groups + + +def _export_rows(graph: Dict[str, Any]) -> Tuple[List[List[Any]], List[List[Any]]]: + nodes = graph.get("nodes", []) if graph else [] + edges = graph.get("edges", []) if graph else [] + node_lookup = {node["data"]["id"]: node for node in nodes} + + node_rows = [["Node Type", "Name", "Description", "Image", "Weight"]] + for node in nodes: + data = node.get("data", {}) + node_rows.append( + [ + data.get("type", ""), + data.get("label", ""), + data.get("description", ""), + data.get("image", ""), + data.get("weight", 0), + ] + ) + + edge_rows = [["From Type", "From Name", "Edge Type", "To Type", "To Name", "Weight"]] + for edge in edges: + data = edge.get("data", {}) + source = node_lookup.get(data.get("source"), {}).get("data", {}) + target = node_lookup.get(data.get("target"), {}).get("data", {}) + edge_rows.append( + [ + source.get("type", ""), + source.get("label", ""), + data.get("type", ""), + target.get("type", ""), + target.get("label", ""), + data.get("weight", 0), + ] + ) + + return node_rows, edge_rows + + +def _csv_bytes(rows: List[List[Any]]) -> bytes: + import csv + import io + + buffer = io.StringIO() + writer = csv.writer(buffer) + writer.writerows(rows) + return buffer.getvalue().encode("utf-8") + + +def _xlsx_bytes(sheets: Dict[str, List[List[Any]]]) -> bytes: + import io + import zipfile + + def column_name(index: int) -> str: + name = "" + while index: + index, rem = divmod(index - 1, 26) + name = chr(65 + rem) + name + return name + + def xml_escape(value: str) -> str: + return ( + value.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace("\"", """) + .replace("'", "'") + ) + + shared_strings: List[str] = [] + shared_index: Dict[str, int] = {} + + def shared_string(value: str) -> int: + if value not in shared_index: + shared_index[value] = len(shared_strings) + shared_strings.append(value) + return shared_index[value] + + worksheets = {} + for sheet_index, (sheet_name, rows) in enumerate(sheets.items(), start=1): + row_xml = [] + for row_idx, row in enumerate(rows, start=1): + cell_xml = [] + for col_idx, value in enumerate(row, start=1): + cell_ref = f"{column_name(col_idx)}{row_idx}" + if isinstance(value, (int, float)) and not isinstance(value, bool): + cell_xml.append(f"{value}") + else: + idx = shared_string(xml_escape(str(value))) + cell_xml.append(f"{idx}") + row_xml.append(f"{''.join(cell_xml)}") + worksheet_xml = ( + "" + "" + f"{''.join(row_xml)}" + "" + ) + worksheets[f"xl/worksheets/sheet{sheet_index}.xml"] = worksheet_xml + + shared_xml = ( + "" + "" + + "".join(f"{value}" for value in shared_strings) + + "" + ) + + workbook_sheets = [] + rels = [] + for index, sheet_name in enumerate(sheets.keys(), start=1): + workbook_sheets.append( + f"" + ) + rels.append( + f"" + ) + + workbook_xml = ( + "" + "" + f"{''.join(workbook_sheets)}" + "" ) + workbook_rels_xml = ( + "" + "" + + "".join(rels) + + "" + ) + + rels_xml = ( + "" + "" + "" + "" + ) + + content_types = ( + "" + "" + "" + "" + "" + "" + + "".join( + f"" + for index in range(1, len(sheets) + 1) + ) + + "" + ) + + buffer = io.BytesIO() + with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive: + archive.writestr("[Content_Types].xml", content_types) + archive.writestr("_rels/.rels", rels_xml) + archive.writestr("xl/workbook.xml", workbook_xml) + archive.writestr("xl/_rels/workbook.xml.rels", workbook_rels_xml) + archive.writestr("xl/sharedStrings.xml", shared_xml) + for path, data in worksheets.items(): + archive.writestr(path, data) + + return buffer.getvalue() + # ------------------------------------------------- -# Layout + Styles (dark-mode, inverted demo) +# Layout + Styles # ------------------------------------------------- LAYOUTS = [ ("Dagre (sequence)", "dagre"), @@ -50,7 +359,7 @@ def pages_from_nodes(nodes): ] -def layout_for(name: str, scaling_ratio: int): +def layout_for(name: str, scaling_ratio: int) -> Dict[str, Any]: s = max(50, min(int(scaling_ratio or 250), 800)) name = name or "dagre" @@ -92,45 +401,6 @@ def layout_for(name: str, scaling_ratio: int): return {"name": name, "fit": True, "padding": 30} -def base_stylesheet(node_size, font_size, text_max_width, show_edge_labels): - return [ - { - "selector": "node", - "style": { - "label": "data(label)", - "font-size": f"{font_size}px", - "text-wrap": "wrap", - "text-max-width": f"{text_max_width}px", - "color": "#E5E7EB", - "text-outline-width": 1, - "text-outline-color": "#0B1220", - "width": f"{node_size}px", - "height": f"{node_size}px", - "z-index": 9999, - }, - }, - { - "selector": "edge", - "style": { - "curve-style": "bezier", - "line-color": "#64748B", - "target-arrow-color": "#64748B", - "target-arrow-shape": "triangle", - "arrow-scale": 0.8, - "opacity": 0.55, - "label": "data(rel)" if show_edge_labels else "", - "font-size": "9px", - "color": "#CBD5E1", - "z-index": 5000, - }, - }, - {"selector": ".document", "style": {"background-color": "#1D4ED8"}}, - {"selector": ".section", "style": {"background-color": "#111827"}}, - {"selector": ".item", "style": {"background-color": "#334155"}}, - {"selector": ":selected", "style": {"border-width": 3, "border-color": "#FBBF24"}}, - ] - - # ------------------------------------------------- # App + Defaults # ------------------------------------------------- @@ -139,12 +409,14 @@ def base_stylesheet(node_size, font_size, text_max_width, show_edge_labels): DEFAULT_VIEW = { "layout": "dagre", "scaling_ratio": 250, - "node_size": 22, - "font_size": 10, - "text_max_width": 520, "show_edge_labels": False, + "show_arrows": True, + "scale_node_size": True, + "scale_edge_width": True, } +DEFAULT_THEME = "dark" + app = Dash( __name__, title="Docling Graph Viewer", @@ -154,26 +426,168 @@ def base_stylesheet(node_size, font_size, text_max_width, show_edge_labels): # ------------------------------------------------- -# Layout (demo-style + stores) +# Layout # ------------------------------------------------- app.layout = html.Div( + className="docling-app", children=[ dcc.Store(id="store_graph"), - dcc.Store(id="store_node_index"), - + dcc.Store(id="store_filtered_graph"), + dcc.Store(id="store_metadata"), + dcc.Store(id="store_selected_node"), + dcc.Store(id="store_inspector_expansion", data={}), + dcc.Store(id="store_highlight", data=[]), + dcc.Store(id="store_theme", data=DEFAULT_THEME), + dcc.Download(id="download-export"), html.Div( - className="row", + className="app-grid", children=[ - # LEFT — Graph html.Div( - className="eight columns", + className="panel-left", + children=[ + html.Div("Graph controls", className="panel-title"), + html.Div( + className="control-section", + children=[ + html.Div("Document", className="control-label"), + dcc.Dropdown( + id="file", + options=[{"label": f, "value": f} for f in files], + value=(files[0] if files else None), + clearable=False, + ), + ], + ), + html.Div( + className="control-section", + children=[ + html.Div("Search", className="control-label"), + dcc.Dropdown( + id="search_node", + options=[], + placeholder="Search by node name", + clearable=True, + ), + ], + ), + html.Div( + className="control-section", + children=[ + html.Div("Node Types", className="control-label"), + dcc.Dropdown( + id="node_type_filter", + options=[], + multi=True, + placeholder="Filter node types", + ), + ], + ), + html.Div( + className="control-section", + children=[ + html.Div("Edge Types", className="control-label"), + dcc.Dropdown( + id="edge_type_filter", + options=[], + multi=True, + placeholder="Filter edge types", + ), + ], + ), + html.Div( + className="control-section", + children=[ + html.Div("Min node weight", className="control-label"), + dcc.Slider( + id="min_node_weight", + min=0, + max=25, + step=1, + value=0, + ), + ], + ), + html.Div( + className="control-section", + children=[ + html.Div("Min edge weight", className="control-label"), + dcc.Slider( + id="min_edge_weight", + min=0, + max=10, + step=1, + value=0, + ), + ], + ), + html.Div( + className="control-section", + children=[ + dcc.Checklist( + id="graph_toggles", + options=[ + {"label": " Hide Page nodes", "value": "hide_pages"}, + {"label": " Hide isolated nodes", "value": "hide_isolated"}, + {"label": " Show edge labels", "value": "edge_labels"}, + {"label": " Show arrows", "value": "arrows"}, + {"label": " Keep context nodes", "value": "keep_context"}, + {"label": " Scale node size", "value": "scale_node"}, + {"label": " Scale edge width", "value": "scale_edge"}, + ], + value=[ + "arrows", + "scale_node", + "scale_edge", + ], + inputClassName="control-checkbox", + labelClassName="control-checkbox__label", + ) + ], + ), + html.Div( + className="control-section", + children=[ + html.Div("Layout", className="control-label"), + dcc.Dropdown( + id="layout", + options=[{"label": l, "value": v} for l, v in LAYOUTS], + value=DEFAULT_VIEW["layout"], + clearable=False, + ), + html.Div("Scaling ratio", className="control-label"), + dcc.Slider( + id="scaling_ratio", + min=50, + max=800, + step=10, + value=DEFAULT_VIEW["scaling_ratio"], + ), + ], + ), + html.Div( + className="control-section", + children=[ + html.Button("Reset view", id="reset_view", className="btn-primary"), + ], + ), + html.Div( + className="control-section", + children=[ + html.Button("Export CSV", id="export_csv", className="btn"), + html.Button("Export XLSX", id="export_xlsx", className="btn"), + ], + ), + ], + ), + html.Div( + className="panel-main", children=[ cyto.Cytoscape( id="graph", style={ "width": "100%", "height": "85vh", - "backgroundColor": "#0B0F17", + "backgroundColor": "var(--gc-graph-bg)", }, wheelSensitivity=0.01, minZoom=0.25, @@ -182,161 +596,36 @@ def base_stylesheet(node_size, font_size, text_max_width, show_edge_labels): DEFAULT_VIEW["layout"], DEFAULT_VIEW["scaling_ratio"], ), - stylesheet=base_stylesheet( - DEFAULT_VIEW["node_size"], - DEFAULT_VIEW["font_size"], - DEFAULT_VIEW["text_max_width"], + stylesheet=safe_base_stylesheet( + DEFAULT_THEME, + DEFAULT_VIEW["scale_node_size"], + DEFAULT_VIEW["scale_edge_width"], DEFAULT_VIEW["show_edge_labels"], + DEFAULT_VIEW["show_arrows"], ), elements=[], - ) + ), ], ), - - # RIGHT — Control Panel html.Div( - className="four columns", + className="panel-right", children=[ - dcc.Tabs( - className="control-tabs", - colors={ - "border": "#111827", - "primary": "#7c3aed", - "background": "#0b0f17", - }, - children=[ - dcc.Tab( - className="control-tab", - selected_className="control-tab--selected", - label="Control Panel", - children=[ - html.Div( - className="control-panel", - children=[ - html.Div( - className="control-panel__header", - children=[ - html.Div("Graph controls", className="control-title"), - html.Span("Dark mode", className="pill pill--invert"), - ], - ), - html.Div( - className="control-section", - children=[ - html.Div("Document", className="control-label"), - dcc.Dropdown( - id="file", - options=[{"label": f, "value": f} for f in files], - value=(files[0] if files else None), - clearable=False, - ), - ], - ), - html.Div( - className="control-section", - children=[ - html.Div("Page range", className="control-label"), - dcc.RangeSlider( - id="page_range", - min=1, - max=1, - step=1, - value=[1, 1], - marks={}, - allowCross=False, - ), - html.Div( - id="page_range_value", - className="control-subtext", - ), - ], - ), - html.Div( - className="control-section", - children=[ - html.Div("Layout", className="control-label"), - dcc.Dropdown( - id="layout", - options=[{"label": l, "value": v} for l, v in LAYOUTS], - value=DEFAULT_VIEW["layout"], - clearable=False, - ), - ], - ), - html.Div( - className="control-section", - children=[ - html.Div("Scaling ratio", className="control-label"), - dcc.Slider( - id="scaling_ratio", - min=50, - max=800, - step=10, - value=DEFAULT_VIEW["scaling_ratio"], - ), - ], - ), - html.Div( - className="control-section", - children=[ - html.Div("Expand", className="control-label"), - dcc.RadioItems( - id="expand_mode", - options=[ - {"label": "Children (hier)", "value": "children"}, - {"label": "All outgoing", "value": "out"}, - {"label": "All incoming", "value": "in"}, - ], - value="children", - inputClassName="control-radio", - labelClassName="control-radio__label", - ), - ], - ), - html.Div( - className="control-section control-section--tight", - children=[ - dcc.Checklist( - id="edge_labels", - options=[{"label": " Show edge labels", "value": "on"}], - value=[], - inputClassName="control-checkbox", - labelClassName="control-checkbox__label", - ), - ], - ), - ], - ) - ], - ), - dcc.Tab( - className="control-tab", - selected_className="control-tab--selected", - label="JSON", - children=[ - html.Div( - className="control-panel control-panel--secondary", - children=[ - html.Div( - className="control-panel__header", - children=[ - html.Div("Click to inspect", className="control-title"), - html.Span("debug", className="pill"), - ], - ), - html.Pre(id="tap-node-json-output", style={"height": "35vh", "overflowY": "auto"}), - html.Pre(id="tap-edge-json-output", style={"height": "35vh", "overflowY": "auto"}), - ], - ) - ], - ), - ] - ) + html.Div("Inspector", className="panel-title"), + html.Div(id="inspector_panel", className="inspector-panel"), + html.Button("Reset highlights", id="reset_highlight", className="btn"), ], ), ], ), - ] + html.Div( + className="debug-panel", + children=[ + html.Div("Debug JSON", className="panel-title"), + html.Pre(id="tap-node-json-output", className="debug-output"), + html.Pre(id="tap-edge-json-output", className="debug-output"), + ], + ), + ], ) @@ -346,179 +635,283 @@ def base_stylesheet(node_size, font_size, text_max_width, show_edge_labels): @app.callback( Output("graph", "elements"), Output("store_graph", "data"), - Output("store_node_index", "data"), + Output("store_metadata", "data"), Input("file", "value"), ) def load_graph(path): if not path: return [], None, None - g = build_graph_from_docling_json(path) - - node_index = {n["data"]["id"]: n for n in g.nodes if n.get("data", {}).get("id")} - - # Genesis node: document - doc_node = next((n for n in g.nodes if n["data"].get("type") == "document"), None) - elements = [doc_node] if doc_node else [] + graph = build_graph_from_docling_json(path) + node_types = sorted({n["data"].get("type", "") for n in graph.nodes}) + edge_types = sorted({e["data"].get("type", "") for e in graph.edges}) + search_options = [ + {"label": n["data"].get("label"), "value": n["data"].get("id")} + for n in graph.nodes + ] + metadata = { + "node_types": node_types, + "edge_types": edge_types, + "search_options": search_options, + } + store_graph = {"nodes": graph.nodes, "edges": graph.edges} + return [], store_graph, metadata - store_graph = {"nodes": g.nodes, "edges": g.edges} - return elements, store_graph, node_index +@app.callback( + Output("node_type_filter", "options"), + Output("edge_type_filter", "options"), + Output("search_node", "options"), + Input("store_metadata", "data"), +) +def update_filters(metadata): + if not metadata: + return [], [], [] + node_options = [{"label": t, "value": t} for t in metadata.get("node_types", [])] + edge_options = [{"label": t, "value": t} for t in metadata.get("edge_types", [])] + return node_options, edge_options, metadata.get("search_options", []) @app.callback( - Output("page_range", "min"), - Output("page_range", "max"), - Output("page_range", "value"), - Output("page_range", "marks"), - Input("file", "value"), + Output("graph", "elements", allow_duplicate=True), + Output("store_filtered_graph", "data"), + Input("store_graph", "data"), + Input("node_type_filter", "value"), + Input("edge_type_filter", "value"), + Input("graph_toggles", "value"), + Input("min_node_weight", "value"), + Input("min_edge_weight", "value"), + Input("store_theme", "data"), + Input("store_highlight", "data"), + prevent_initial_call=False, ) -def init_page_range(path): - if not path: - return 1, 1, [1, 1], {} +def apply_filters( + graph, + node_types, + edge_types, + toggles, + min_node_weight, + min_edge_weight, + theme, + highlight_ids, +): + if not graph: + return [], None + + toggles = toggles or [] + hide_page_nodes = "hide_pages" in toggles + hide_isolated = "hide_isolated" in toggles + keep_context = "keep_context" in toggles + + nodes, edges = _filter_graph( + graph, + node_types, + edge_types, + hide_page_nodes, + hide_isolated, + min_node_weight or 0, + min_edge_weight or 0, + keep_context, + ) - g = build_graph_from_docling_json(path) - pages = pages_from_nodes(g.nodes) - if not pages: - return 1, 1, [1, 1], {} + themed_nodes, themed_edges = apply_theme_to_elements(nodes, edges, theme or DEFAULT_THEME) + themed_nodes, themed_edges = _apply_highlight(themed_nodes, themed_edges, highlight_ids) - pmin, pmax = pages[0], pages[-1] - return pmin, pmax, [pmin, min(pmax, pmin + 3)], {p: str(p) for p in pages if p == pmin or p == pmax or p % 5 == 0} + filtered_graph = {"nodes": themed_nodes, "edges": themed_edges} + return themed_nodes + themed_edges, filtered_graph -@app.callback(Output("page_range_value", "children"), Input("page_range", "value")) -def show_page_range(value): - if not value: - return "" +@app.callback( + Output("graph", "layout"), + Input("layout", "value"), + Input("scaling_ratio", "value"), +) +def update_layout(name, scaling): + return layout_for(name, scaling) - start, end = value - if start == end: - return f"Showing page {start}" - return f"Showing pages {start} to {end}" +@app.callback( + Output("graph", "stylesheet"), + Input("graph_toggles", "value"), + Input("store_theme", "data"), +) +def update_styles(toggles, theme): + toggles = toggles or [] + return safe_base_stylesheet( + theme or DEFAULT_THEME, + "scale_node" in toggles, + "scale_edge" in toggles, + "edge_labels" in toggles, + "arrows" in toggles, + ) @app.callback( - Output("graph", "elements", allow_duplicate=True), + Output("store_selected_node", "data"), Input("graph", "tapNodeData"), - State("graph", "elements"), - State("store_graph", "data"), - State("store_node_index", "data"), - State("expand_mode", "value"), - State("page_range", "value"), + Input("search_node", "value"), + State("store_filtered_graph", "data"), prevent_initial_call=True, ) -def expand_on_click(node_data, elements, store_graph, node_index, mode, page_range): - if not node_data or not store_graph or not page_range: - return no_update - - node_id = node_data.get("id") - if not node_id: - return no_update - - start_page, end_page = page_range - - def in_range(page): - return page is None or (start_page <= page <= end_page) +def select_node(tap_node, search_value, filtered_graph): + trigger = ctx.triggered_id + if trigger == "search_node" and search_value: + return search_value + if tap_node and tap_node.get("id"): + return tap_node["id"] + return no_update - existing_nodes = {e["data"]["id"] for e in elements if "id" in e.get("data", {})} - existing_edges = {e["data"]["id"] for e in elements if "source" in e.get("data", {})} - for e in elements: - if e.get("data", {}).get("id") == node_id: - e["data"]["expanded"] = True - - new_nodes = [] - new_edges = [] - - for ed in store_graph["edges"]: - d = ed["data"] - src, tgt, rel = d.get("source"), d.get("target"), d.get("rel") - - if mode == "children" and not (rel == "hier" and src == node_id): - continue - if mode == "out" and src != node_id: - continue - if mode == "in" and tgt != node_id: - continue +@app.callback( + Output("inspector_panel", "children"), + Input("store_filtered_graph", "data"), + Input("store_selected_node", "data"), + Input("store_inspector_expansion", "data"), +) +def render_inspector(filtered_graph, selected_node_id, expansion_state): + if not filtered_graph or not selected_node_id: + return html.Div("Select a node to inspect.", className="muted") + + nodes = filtered_graph.get("nodes", []) + edges = filtered_graph.get("edges", []) + node_lookup = {node["data"]["id"]: node for node in nodes} + node = node_lookup.get(selected_node_id) + if not node: + return html.Div("Select a node to inspect.", className="muted") + + data = node.get("data", {}) + groups = _group_connections(edges, selected_node_id) + expansion_state = expansion_state or {} + + group_blocks = [] + for group_key in sorted(groups.keys()): + direction, edge_type = group_key.split("::", 1) + node_ids = groups[group_key] + total = len(node_ids) + shown = expansion_state.get(group_key, 10) + display_ids = node_ids[:shown] + + connections = [] + for nid in display_ids: + target_node = node_lookup.get(nid) + label = target_node.get("data", {}).get("label", nid) if target_node else nid + connections.append(html.Div(label, className="inspector-item")) + + footer = None + if shown < total: + footer = html.Button( + f"Show more (+25)", + id={"type": "expand-group", "group": group_key}, + className="btn-small", + ) + + group_blocks.append( + html.Div( + className="inspector-group", + children=[ + html.Div( + f"{direction} → {edge_type} ({total})", + className="inspector-group-title", + ), + html.Div(connections, className="inspector-list"), + footer, + ], + ) + ) + + if not group_blocks: + group_blocks.append(html.Div("No connections in current filter.", className="muted")) + + return html.Div( + children=[ + html.Div(data.get("label", ""), className="inspector-title"), + html.Div(data.get("type", ""), className="inspector-subtitle"), + html.Div(data.get("description", ""), className="inspector-description"), + html.Div(group_blocks), + ] + ) - if d["id"] in existing_edges: - continue - for nid in (src, tgt): - if nid not in existing_nodes and nid in node_index: - candidate = node_index[nid] - if in_range(candidate.get("data", {}).get("page")): - new_nodes.append(candidate) +@app.callback( + Output("store_inspector_expansion", "data"), + Output("store_highlight", "data"), + Input({"type": "expand-group", "group": ALL}, "n_clicks"), + Input("reset_highlight", "n_clicks"), + State("store_inspector_expansion", "data"), + State("store_filtered_graph", "data"), + State("store_selected_node", "data"), + prevent_initial_call=True, +) +def update_expansion( + _clicks, + reset_clicks, + expansion_state, + filtered_graph, + selected_node_id, +): + if ctx.triggered_id == "reset_highlight": + return {}, [] - if not in_range(node_index.get(src, {}).get("data", {}).get("page")): - continue - if not in_range(node_index.get(tgt, {}).get("data", {}).get("page")): - continue + triggered = ctx.triggered_id + if not isinstance(triggered, dict): + return no_update, no_update - new_edges.append(ed) + group_key = triggered.get("group") + expansion_state = expansion_state or {} + current = expansion_state.get(group_key, 10) + expansion_state[group_key] = current + 25 - if not new_nodes and not new_edges: - return no_update + highlight_ids: List[str] = [] + if filtered_graph and selected_node_id: + groups = _group_connections(filtered_graph.get("edges", []), selected_node_id) + highlight_ids = [selected_node_id] + groups.get(group_key, []) - return elements + new_nodes + new_edges + return expansion_state, highlight_ids @app.callback( - Output("graph", "layout"), - Input("layout", "value"), - Input("scaling_ratio", "value"), + Output("graph", "elements", allow_duplicate=True), + Output("graph", "elements", allow_duplicate=True), + Output("store_selected_node", "data", allow_duplicate=True), + Output("store_highlight", "data", allow_duplicate=True), + Input("reset_view", "n_clicks"), + State("store_filtered_graph", "data"), + State("store_theme", "data"), + prevent_initial_call=True, ) -def update_layout(name, scaling): - return layout_for(name, scaling) +def reset_view(_n_clicks, filtered_graph, theme): + if not filtered_graph: + return no_update, no_update, no_update + nodes = filtered_graph.get("nodes", []) + edges = filtered_graph.get("edges", []) + themed_nodes, themed_edges = apply_theme_to_elements(nodes, edges, theme or DEFAULT_THEME) + return themed_nodes + themed_edges, None, [] @app.callback( - Output("graph", "elements", allow_duplicate=True), - Input("page_range", "value"), - State("graph", "elements"), + Output("download-export", "data"), + Input("export_csv", "n_clicks"), + Input("export_xlsx", "n_clicks"), + State("store_filtered_graph", "data"), prevent_initial_call=True, ) -def filter_elements_by_page(page_range, elements): - if not page_range or not elements: +def export_graph(csv_clicks, xlsx_clicks, filtered_graph): + if not filtered_graph: return no_update - start_page, end_page = page_range - allowed_nodes = set() - filtered_nodes = [] - - for el in elements: - data = el.get("data", {}) - if "source" in data: - continue - - page = data.get("page") - if page is None or (start_page <= page <= end_page): - allowed_nodes.add(data.get("id")) - filtered_nodes.append(el) - - filtered_edges = [ - el - for el in elements - if "source" in el.get("data", {}) - and el["data"].get("source") in allowed_nodes - and el["data"].get("target") in allowed_nodes - ] + nodes_rows, edge_rows = _export_rows(filtered_graph) + if ctx.triggered_id == "export_csv": + import io + import zipfile - return filtered_nodes + filtered_edges + buffer = io.BytesIO() + with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive: + archive.writestr("graph_nodes.csv", _csv_bytes(nodes_rows)) + archive.writestr("graph_edges.csv", _csv_bytes(edge_rows)) + return dcc.send_bytes(buffer.getvalue(), "graph_export.zip") - -@app.callback( - Output("graph", "stylesheet"), - Input("edge_labels", "value"), -) -def update_styles(edge_labels): - return base_stylesheet( - DEFAULT_VIEW["node_size"], - DEFAULT_VIEW["font_size"], - DEFAULT_VIEW["text_max_width"], - "on" in (edge_labels or []), - ) + xlsx_bytes = _xlsx_bytes({"Nodes": nodes_rows, "Edges": edge_rows}) + return dcc.send_bytes(xlsx_bytes, "graph_export.xlsx") @app.callback( From 82d948927e16fbfb3731a24e83ffe4b8889554ef Mon Sep 17 00:00:00 2001 From: buildvoc10 <76884997+buildvoc10@users.noreply.github.com> Date: Fri, 19 Dec 2025 18:51:37 +0000 Subject: [PATCH 2/4] Add ForceAtlas2 layout support --- .../assets/cytoscape-layout-forceatlas2.js | 60 +++++++++++++++++++ apps/docling_graph/assets/forceatlas2-init.js | 34 +++++++++++ apps/docling_graph/main.py | 28 +++++++++ 3 files changed, 122 insertions(+) create mode 100644 apps/docling_graph/assets/cytoscape-layout-forceatlas2.js create mode 100644 apps/docling_graph/assets/forceatlas2-init.js diff --git a/apps/docling_graph/assets/cytoscape-layout-forceatlas2.js b/apps/docling_graph/assets/cytoscape-layout-forceatlas2.js new file mode 100644 index 0000000..9c2bdc5 --- /dev/null +++ b/apps/docling_graph/assets/cytoscape-layout-forceatlas2.js @@ -0,0 +1,60 @@ +/* Minimal ForceAtlas2-compatible layout wrapper for Cytoscape. + Uses COSE under the hood for stability when the real plugin bundle + is not available, while preserving the forceatlas2 layout name. */ +(function () { + if (typeof cytoscape === "undefined") { + return; + } + + var defaults = { + iterations: 800, + scalingRatio: 1.0, + gravity: 1.0, + linLogMode: false, + preventOverlap: true, + fit: true, + padding: 30, + animate: false + }; + + function ForceAtlas2Layout(options) { + this.options = Object.assign({}, defaults, options); + this.cy = options.cy; + } + + ForceAtlas2Layout.prototype.run = function () { + var opts = this.options; + if (!this.cy) { + return; + } + + var scaling = Math.max(0.5, opts.scalingRatio || 1.0); + var layout = this.cy.layout({ + name: "cose", + animate: opts.animate, + randomize: false, + fit: opts.fit, + padding: opts.padding, + gravity: opts.gravity, + nodeRepulsion: 2048 * scaling, + idealEdgeLength: 50 * scaling, + avoidOverlap: opts.preventOverlap, + numIter: opts.iterations + }); + + this._layout = layout; + layout.run(); + }; + + ForceAtlas2Layout.prototype.stop = function () { + if (this._layout && typeof this._layout.stop === "function") { + this._layout.stop(); + } + }; + + cytoscape("layout", "forceatlas2", ForceAtlas2Layout); + + if (typeof window !== "undefined") { + window.cytoscapeLayoutForceatlas2 = ForceAtlas2Layout; + } +})(); diff --git a/apps/docling_graph/assets/forceatlas2-init.js b/apps/docling_graph/assets/forceatlas2-init.js new file mode 100644 index 0000000..307c4b2 --- /dev/null +++ b/apps/docling_graph/assets/forceatlas2-init.js @@ -0,0 +1,34 @@ +// Register ForceAtlas2 layout plugin if present. +// Requires the plugin file at: +// apps/docling_graph/assets/cytoscape-layout-forceatlas2.js + +(function () { + function tryRegister() { + var cy = window.cytoscape; + if (!cy) return false; + + var fa2 = window.cytoscapeLayoutForceatlas2 || window.forceatlas2; + + if (fa2 && typeof cy.use === "function") { + try { + cy.use(fa2); + console.log("[docling_graph] ForceAtlas2 plugin registered"); + return true; + } catch (e) { + console.warn("[docling_graph] ForceAtlas2 plugin present but failed to register", e); + } + } + return false; + } + + var attempts = 0; + var timer = setInterval(function () { + attempts += 1; + if (tryRegister() || attempts >= 20) { + clearInterval(timer); + if (attempts >= 20) { + console.warn("[docling_graph] ForceAtlas2 plugin not detected (layout will fail if selected)"); + } + } + }, 250); +})(); diff --git a/apps/docling_graph/main.py b/apps/docling_graph/main.py index df282de..33372b3 100644 --- a/apps/docling_graph/main.py +++ b/apps/docling_graph/main.py @@ -3,6 +3,7 @@ import json import logging import os +import unittest from typing import Any, Dict, Iterable, List, Tuple from dash import ALL, Dash, Input, Output, State, dcc, html, no_update, ctx @@ -352,6 +353,7 @@ def shared_string(value: str) -> int: LAYOUTS = [ ("Dagre (sequence)", "dagre"), ("Breadthfirst", "breadthfirst"), + ("ForceAtlas2", "forceatlas2"), ("Force-directed (COSE)", "cose"), ("COSE-Bilkent", "cose-bilkent"), ("Cola (read text)", "cola"), @@ -385,6 +387,19 @@ def layout_for(name: str, scaling_ratio: int) -> Dict[str, Any]: "padding": 30, } + if name == "forceatlas2": + return { + "name": "forceatlas2", + "iterations": 800, + "scalingRatio": max(0.5, s / 100), + "gravity": 1.0, + "linLogMode": False, + "preventOverlap": True, + "fit": True, + "padding": 30, + "animate": False, + } + if name in ("cose", "cose-bilkent", "euler"): return { "name": name, @@ -930,6 +945,19 @@ def show_edge(data): return json.dumps(data, indent=2) +# ------------------------------------------------- +# Tests +# ------------------------------------------------- +class LayoutTests(unittest.TestCase): + def test_forceatlas2_is_in_layouts(self): + layout_values = {value for _, value in LAYOUTS} + self.assertIn("forceatlas2", layout_values) + + def test_layout_for_forceatlas2(self): + layout = layout_for("forceatlas2", 250) + self.assertEqual(layout.get("name"), "forceatlas2") + + # ------------------------------------------------- # Run # ------------------------------------------------- From e9ab2a6fadf00c984ef96d5304f1eac22c3f22e9 Mon Sep 17 00:00:00 2001 From: buildvoc10 <76884997+buildvoc10@users.noreply.github.com> Date: Fri, 19 Dec 2025 20:36:27 +0000 Subject: [PATCH 3/4] Add ADM docling JSON smoke tests --- apps/docling_graph/graph_builder.py | 27 ++++- tests/test_docling_json_smoke_adm_2024.py | 135 ++++++++++++++++++++++ 2 files changed, 158 insertions(+), 4 deletions(-) create mode 100644 tests/test_docling_json_smoke_adm_2024.py diff --git a/apps/docling_graph/graph_builder.py b/apps/docling_graph/graph_builder.py index b60d7d8..8c03f12 100644 --- a/apps/docling_graph/graph_builder.py +++ b/apps/docling_graph/graph_builder.py @@ -252,14 +252,22 @@ def _compute_edge_weights(edges: List[Dict[str, Any]], node_lookup: Dict[str, Di data["width"] = round(_clamp(1 + weight * 0.7, 1, 8), 2) -def build_graph_from_docling_json(path: str) -> GraphPayload: - doc = _load_json(path) - resolved_doc = resolve_refs(doc, doc) +def _normalize_doc_root(doc: Dict[str, Any]) -> Dict[str, Any]: + if isinstance(doc, dict): + for key in ("document", "doc"): + candidate = doc.get(key) + if isinstance(candidate, dict): + return candidate + return doc + + +def _build_graph(doc: Dict[str, Any], doc_name: str) -> GraphPayload: + resolved_doc = resolve_refs(_normalize_doc_root(doc), doc) nodes_by_id: Dict[str, Dict[str, Any]] = {} edges_by_id: Dict[str, Dict[str, Any]] = {} - doc_name = os.path.basename(path) + doc_name = doc_name or resolved_doc.get("title") or "Docling Document" doc_node_id = _node_id(NODE_DOCUMENT, doc_name) nodes_by_id[doc_node_id] = { "data": { @@ -393,6 +401,17 @@ def build_graph_from_docling_json(path: str) -> GraphPayload: return GraphPayload(nodes=nodes, edges=edges) +def build_graph_from_docling_json(path: str) -> GraphPayload: + doc = _load_json(path) + doc_name = os.path.basename(path) + return _build_graph(doc, doc_name) + + +def build_elements_from_docling_json(doc: Dict[str, Any], doc_name: str | None = None) -> List[Dict[str, Any]]: + graph = _build_graph(doc, doc_name or doc.get("title") or "Docling Document") + return graph.nodes + graph.edges + + # ----------------------------- # Tests # ----------------------------- diff --git a/tests/test_docling_json_smoke_adm_2024.py b/tests/test_docling_json_smoke_adm_2024.py new file mode 100644 index 0000000..3156f9b --- /dev/null +++ b/tests/test_docling_json_smoke_adm_2024.py @@ -0,0 +1,135 @@ +import json +import sys +import warnings +from pathlib import Path + +import pytest + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) + +from apps.docling_graph.graph_builder import build_elements_from_docling_json + + +ADM_PATH = Path( + "docling-ws/data/docling/building_standards/ADM__V2_Amendment_Booklet_2024.json" +) + + +def _normalize_doc_root(doc): + if isinstance(doc, dict): + for key in ("document", "doc"): + candidate = doc.get(key) + if isinstance(candidate, dict): + return candidate + return doc + + +def _node_id(item, path): + for key in ("self_ref", "id", "uid"): + value = item.get(key) + if value: + return str(value) + label = str(item.get("label") or item.get("type") or "node") + page_no = None + prov = item.get("prov") or item.get("provenance") or item.get("provenances") + if isinstance(prov, list) and prov: + page_no = prov[0].get("page_no") + return f"{label}::{'/'.join(str(p) for p in path)}::{page_no or 'na'}" + + +def _collect_nodes_and_edges(doc_root): + nodes = {} + edges = [] + seen = set() + stack = [(doc_root, ("root",))] + + while stack: + item, path = stack.pop() + if isinstance(item, dict): + item_id = id(item) + if item_id in seen: + continue + seen.add(item_id) + + node_id = _node_id(item, path) + nodes[node_id] = item + + children = item.get("children") + if isinstance(children, list): + for idx, child in enumerate(children): + if isinstance(child, dict): + child_path = path + ("children", idx) + child_id = _node_id(child, child_path) + edges.append((node_id, child_id)) + stack.append((child, child_path)) + + for key, value in item.items(): + if isinstance(value, (dict, list)) and key != "children": + stack.append((value, path + (key,))) + + elif isinstance(item, list): + item_id = id(item) + if item_id in seen: + continue + seen.add(item_id) + for idx, child in enumerate(item): + stack.append((child, path + (idx,))) + + return nodes, edges + + +def _has_provenance(item): + prov = item.get("prov") or item.get("provenance") or item.get("provenances") + if isinstance(prov, list): + return any(isinstance(p, dict) and (p.get("page_no") or p.get("bbox")) for p in prov) + if isinstance(prov, dict): + return bool(prov.get("page_no") or prov.get("bbox")) + return False + + +@pytest.mark.skipif(not ADM_PATH.exists(), reason="ADM_2024 fixture not available") +def test_docling_json_smoke_adm_2024(): + with ADM_PATH.open("r", encoding="utf-8") as handle: + doc = json.load(handle) + + doc_root = _normalize_doc_root(doc) + nodes, edges = _collect_nodes_and_edges(doc_root) + + assert nodes, "No nodes collected from ADM doc" + assert edges, "No hierarchy edges collected from ADM doc" + + node_ids = set(nodes.keys()) + dangling = [edge for edge in edges if edge[0] not in node_ids or edge[1] not in node_ids] + assert not dangling, f"Found dangling edges: {dangling[:5]}" + + provenance_count = sum(1 for item in nodes.values() if _has_provenance(item)) + if provenance_count == 0: + warnings.warn("No provenance found in ADM doc nodes; allowed but should be verified", RuntimeWarning) + + +@pytest.mark.skipif(not ADM_PATH.exists(), reason="ADM_2024 fixture not available") +def test_graph_builder_elements_adm_2024(): + with ADM_PATH.open("r", encoding="utf-8") as handle: + doc = json.load(handle) + + elements = build_elements_from_docling_json(doc, doc_name=ADM_PATH.name) + + assert isinstance(elements, list) + node_ids = set() + edge_count = 0 + + for element in elements: + data = element.get("data", {}) + if "source" in data: + edge_count += 1 + assert data.get("source") + assert data.get("target") + else: + node_id = data.get("id") + assert node_id + assert node_id not in node_ids + node_ids.add(node_id) + + assert node_ids + assert edge_count > 0 From d48943d8aced46393e9667a17bcc65466bb89f77 Mon Sep 17 00:00:00 2001 From: buildvoc10 <76884997+buildvoc10@users.noreply.github.com> Date: Fri, 19 Dec 2025 21:24:15 +0000 Subject: [PATCH 4/4] Fix initial duplicate callback for filters --- apps/docling_graph/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/docling_graph/main.py b/apps/docling_graph/main.py index 33372b3..23c7fa7 100644 --- a/apps/docling_graph/main.py +++ b/apps/docling_graph/main.py @@ -698,7 +698,7 @@ def update_filters(metadata): Input("min_edge_weight", "value"), Input("store_theme", "data"), Input("store_highlight", "data"), - prevent_initial_call=False, + prevent_initial_call="initial_duplicate", ) def apply_filters( graph,