diff --git a/libs/kotaemon/kotaemon/indices/qa/citation_qa.py b/libs/kotaemon/kotaemon/indices/qa/citation_qa.py
index 37d8ced53..efe34a282 100644
--- a/libs/kotaemon/kotaemon/indices/qa/citation_qa.py
+++ b/libs/kotaemon/kotaemon/indices/qa/citation_qa.py
@@ -334,11 +334,19 @@ def prepare_citations(self, answer, docs) -> tuple[list[Document], list[Document
highlight_text = ""
ss = sorted(ss, key=lambda x: x["start"])
+ last_end = 0
text = cur_doc.text[: ss[0]["start"]]
+
for idx, span in enumerate(ss):
- to_highlight = cur_doc.text[span["start"] : span["end"]]
- if len(to_highlight) > len(highlight_text):
- highlight_text = to_highlight
+ # prevent overlap between consecutive spans
+ span_start = max(last_end, span["start"])
+ span_end = max(last_end, span["end"])
+
+ to_highlight = cur_doc.text[span_start:span_end]
+ last_end = span_end
+
+ # append this span's text to the string highlighted in the PDF viewer
+ highlight_text += (" " if highlight_text else "") + to_highlight
span_idx = span.get("idx", None)
if span_idx is not None:
@@ -350,6 +358,7 @@ def prepare_citations(self, answer, docs) -> tuple[list[Document], list[Document
)
if idx < len(ss) - 1:
text += cur_doc.text[span["end"] : ss[idx + 1]["start"]]
+
text += cur_doc.text[ss[-1]["end"] :]
# add to display list
with_citation.append(
diff --git a/libs/kotaemon/kotaemon/indices/qa/citation_qa_inline.py b/libs/kotaemon/kotaemon/indices/qa/citation_qa_inline.py
index 17e94e0e3..aa4706529 100644
--- a/libs/kotaemon/kotaemon/indices/qa/citation_qa_inline.py
+++ b/libs/kotaemon/kotaemon/indices/qa/citation_qa_inline.py
@@ -152,6 +152,7 @@ def answer_to_citations(self, answer) -> list[InlineEvidence]:
def replace_citation_with_link(self, answer: str):
# Define the regex pattern to match 【number】
pattern = r"【\d+】"
+ alternate_pattern = r"\[\d+\]"
# Regular expression to match merged citations
multi_pattern = r"【([\d,\s]+)】"
@@ -166,7 +167,9 @@ def split_citations(match):
answer = re.sub(multi_pattern, split_citations, answer)
# Find all citations in the answer
- matches = re.finditer(pattern, answer)
+ matches = list(re.finditer(pattern, answer))
+ if not matches:
+ matches = list(re.finditer(alternate_pattern, answer))
matched_citations = set()
for match in matches:
@@ -174,11 +177,12 @@ def split_citations(match):
matched_citations.add(citation)
for citation in matched_citations:
+ citation_id = citation[1:-1]
answer = answer.replace(
citation,
(
"{citation}"
+ f"id='mark-{citation_id}'>【{citation_id}】"
),
)
diff --git a/libs/kotaemon/kotaemon/indices/qa/utils.py b/libs/kotaemon/kotaemon/indices/qa/utils.py
index 51602b805..d64fb209e 100644
--- a/libs/kotaemon/kotaemon/indices/qa/utils.py
+++ b/libs/kotaemon/kotaemon/indices/qa/utils.py
@@ -5,17 +5,38 @@ def find_text(search_span, context, min_length=5):
sentence_list = search_span.split("\n")
context = context.replace("\n", " ")
- matches = []
+ matches_span = []
# don't search for small text
if len(search_span) > min_length:
for sentence in sentence_list:
- match = SequenceMatcher(
- None, sentence, context, autojunk=False
- ).find_longest_match()
- if match.size > max(len(sentence) * 0.35, min_length):
- matches.append((match.b, match.b + match.size))
+ match_results = SequenceMatcher(
+ None,
+ sentence,
+ context,
+ autojunk=False,
+ ).get_matching_blocks()
+
+ matched_blocks = []
+ for _, start, length in match_results:
+ if length > max(len(sentence) * 0.2, min_length):
+ matched_blocks.append((start, start + length))
+
+ if matched_blocks:
+ start_index = min(start for start, _ in matched_blocks)
+ end_index = max(end for _, end in matched_blocks)
+ length = end_index - start_index
+
+ if length > max(len(sentence) * 0.35, min_length):
+ matches_span.append((start_index, end_index))
+
+ if matches_span:
+ # merge all matches into one span
+ final_span = min(start for start, _ in matches_span), max(
+ end for _, end in matches_span
+ )
+ matches_span = [final_span]
- return matches
+ return matches_span
def find_start_end_phrase(
diff --git a/libs/ktem/ktem/assets/css/main.css b/libs/ktem/ktem/assets/css/main.css
index 82689a597..95b1f6349 100644
--- a/libs/ktem/ktem/assets/css/main.css
+++ b/libs/ktem/ktem/assets/css/main.css
@@ -277,7 +277,6 @@ span.icon {
}
pdfjs-viewer-element {
- height: 100vh;
height: 100dvh;
}
@@ -290,9 +289,8 @@ pdfjs-viewer-element {
left: 0;
top: 0;
width: 100%;
- height: 100%;
- overflow: auto;
- background-color: rgb(0, 0, 0);
+ height: 85dvh;
+ overflow: hidden;
background-color: rgba(0, 0, 0, 0.4);
}
@@ -302,7 +300,7 @@ pdfjs-viewer-element {
.modal-content {
background-color: #fefefe;
- height: 110%;
+ height: 100%;
display: flex;
flex-direction: column;
}
@@ -323,7 +321,7 @@ pdfjs-viewer-element {
.modal-body {
flex: 1;
- overflow: auto;
+ overflow: hidden;
}
/* Switch checkbox styles */
diff --git a/libs/ktem/ktem/assets/js/main.js b/libs/ktem/ktem/assets/js/main.js
index 7a9445d16..ad3c99117 100644
--- a/libs/ktem/ktem/assets/js/main.js
+++ b/libs/ktem/ktem/assets/js/main.js
@@ -32,7 +32,6 @@ function run() {
globalThis.toggleChatColumn = (() => {
/* get flex-grow value of chat_column */
let flex_grow = conv_column.style.flexGrow;
- console.log("chat col", flex_grow);
if (flex_grow == '0') {
conv_column.style.flexGrow = '1';
conv_column.style.minWidth = default_conv_column_min_width;
@@ -95,10 +94,24 @@ function run() {
event.preventDefault(); // Prevent the default link behavior
var citationId = event.target.getAttribute('id');
- await sleep(100); // Sleep for 500 milliseconds
+ await sleep(100); // Sleep for 100 milliseconds
+
+ // check if modal is open
+ var modal = document.getElementById("pdf-modal");
var citation = document.querySelector('mark[id="' + citationId + '"]');
- if (citation) {
- citation.scrollIntoView({ behavior: 'smooth' });
+
+ if (modal.style.display == "block") {
+ // trigger the click event of the PDF preview link
+ var detail_elem = citation;
+ // walk up the DOM tree to find the enclosing <details> element
+ while (detail_elem.tagName.toLowerCase() != "details") {
+ detail_elem = detail_elem.parentElement;
+ }
+ detail_elem.getElementsByClassName("pdf-link").item(0).click();
+ } else {
+ if (citation) {
+ citation.scrollIntoView({ behavior: 'smooth' });
+ }
}
}
}
diff --git a/libs/ktem/ktem/assets/js/pdf_viewer.js b/libs/ktem/ktem/assets/js/pdf_viewer.js
index 63c600e8b..2166edbec 100644
--- a/libs/ktem/ktem/assets/js/pdf_viewer.js
+++ b/libs/ktem/ktem/assets/js/pdf_viewer.js
@@ -43,16 +43,52 @@ function onBlockLoad () {
modal.style.position = "fixed";
modal.style.width = "70%";
modal.style.left = "15%";
+ modal.style.height = "100dvh";
} else {
modal.style.position = old_position;
modal.style.width = old_width;
modal.style.left = old_left;
+ modal.style.height = "85dvh";
}
};
}
+ globalThis.compareText = (search_phrase, page_label) => {
+ var iframe = document.querySelector("#pdf-viewer").iframe;
+ var innerDoc = (iframe.contentDocument) ? iframe.contentDocument : iframe.contentWindow.document;
+
+ var query_selector = (
+ "#viewer > div[data-page-number='" +
+ page_label +
+ "'] > div.textLayer > span"
+ );
+ var page_spans = innerDoc.querySelectorAll(query_selector);
+ for (var i = 0; i < page_spans.length; i++) {
+ var span = page_spans[i];
+ if (
+ span.textContent.length > 4 &&
+ (
+ search_phrase.includes(span.textContent) ||
+ span.textContent.includes(search_phrase)
+ )
+ ) {
+ span.innerHTML = "" + span.textContent + "";
+ } else {
+ // if this span was highlighted earlier, strip the highlight markup
+ if (span.querySelector(".highlight")) {
+ span.innerHTML = span.textContent;
+ }
+ }
+ }
+ }
+
+ // Sleep function using Promise and setTimeout
+ function sleep(ms) {
+ return new Promise(resolve => setTimeout(resolve, ms));
+ }
+
// Function to open modal and display PDF
- globalThis.openModal = (event) => {
+ globalThis.openModal = async (event) => {
event.preventDefault();
var target = event.currentTarget;
var src = target.getAttribute("data-src");
@@ -66,8 +102,8 @@ function onBlockLoad () {
if (current_src != src) {
pdfViewer.setAttribute("src", src);
}
- pdfViewer.setAttribute("phrase", phrase);
- pdfViewer.setAttribute("search", search);
+ // pdfViewer.setAttribute("phrase", phrase);
+ // pdfViewer.setAttribute("search", search);
pdfViewer.setAttribute("page", page);
var scrollableDiv = document.getElementById("chat-info-panel");
@@ -80,6 +116,10 @@ function onBlockLoad () {
info_panel.style.display = "none";
}
scrollableDiv.scrollTop = 0;
+
+ /* search for text inside PDF page */
+ await sleep(500);
+ compareText(search, page);
}
globalThis.assignPdfOnclickEvent = () => {
@@ -93,7 +133,6 @@ function onBlockLoad () {
var created_modal = document.getElementById("pdf-viewer");
if (!created_modal) {
createModal();
- console.log("Created modal")
}
}