diff --git a/libs/kotaemon/kotaemon/indices/qa/citation_qa.py b/libs/kotaemon/kotaemon/indices/qa/citation_qa.py index 37d8ced53..efe34a282 100644 --- a/libs/kotaemon/kotaemon/indices/qa/citation_qa.py +++ b/libs/kotaemon/kotaemon/indices/qa/citation_qa.py @@ -334,11 +334,19 @@ def prepare_citations(self, answer, docs) -> tuple[list[Document], list[Document highlight_text = "" ss = sorted(ss, key=lambda x: x["start"]) + last_end = 0 text = cur_doc.text[: ss[0]["start"]] + for idx, span in enumerate(ss): - to_highlight = cur_doc.text[span["start"] : span["end"]] - if len(to_highlight) > len(highlight_text): - highlight_text = to_highlight + # prevent overlapping between span + span_start = max(last_end, span["start"]) + span_end = max(last_end, span["end"]) + + to_highlight = cur_doc.text[span_start:span_end] + last_end = span_end + + # append to highlight on PDF viewer + highlight_text += (" " if highlight_text else "") + to_highlight span_idx = span.get("idx", None) if span_idx is not None: @@ -350,6 +358,7 @@ def prepare_citations(self, answer, docs) -> tuple[list[Document], list[Document ) if idx < len(ss) - 1: text += cur_doc.text[span["end"] : ss[idx + 1]["start"]] + text += cur_doc.text[ss[-1]["end"] :] # add to display list with_citation.append( diff --git a/libs/kotaemon/kotaemon/indices/qa/citation_qa_inline.py b/libs/kotaemon/kotaemon/indices/qa/citation_qa_inline.py index 17e94e0e3..aa4706529 100644 --- a/libs/kotaemon/kotaemon/indices/qa/citation_qa_inline.py +++ b/libs/kotaemon/kotaemon/indices/qa/citation_qa_inline.py @@ -152,6 +152,7 @@ def answer_to_citations(self, answer) -> list[InlineEvidence]: def replace_citation_with_link(self, answer: str): # Define the regex pattern to match 【number】 pattern = r"【\d+】" + alternate_pattern = r"\[\d+\]" # Regular expression to match merged citations multi_pattern = r"【([\d,\s]+)】" @@ -166,7 +167,9 @@ def split_citations(match): answer = re.sub(multi_pattern, split_citations, answer) # Find all citations in the answer - matches = re.finditer(pattern, answer) + matches = list(re.finditer(pattern, answer)) + if not matches: + matches = list(re.finditer(alternate_pattern, answer)) matched_citations = set() for match in matches: @@ -174,11 +177,12 @@ def split_citations(match): matched_citations.add(citation) for citation in matched_citations: + citation_id = citation[1:-1] answer = answer.replace( citation, ( "{citation}" + f"id='mark-{citation_id}'>【{citation_id}】" ), ) diff --git a/libs/kotaemon/kotaemon/indices/qa/utils.py b/libs/kotaemon/kotaemon/indices/qa/utils.py index 51602b805..d64fb209e 100644 --- a/libs/kotaemon/kotaemon/indices/qa/utils.py +++ b/libs/kotaemon/kotaemon/indices/qa/utils.py @@ -5,17 +5,38 @@ def find_text(search_span, context, min_length=5): sentence_list = search_span.split("\n") context = context.replace("\n", " ") - matches = [] + matches_span = [] # don't search for small text if len(search_span) > min_length: for sentence in sentence_list: - match = SequenceMatcher( - None, sentence, context, autojunk=False - ).find_longest_match() - if match.size > max(len(sentence) * 0.35, min_length): - matches.append((match.b, match.b + match.size)) + match_results = SequenceMatcher( + None, + sentence, + context, + autojunk=False, + ).get_matching_blocks() + + matched_blocks = [] + for _, start, length in match_results: + if length > max(len(sentence) * 0.2, min_length): + matched_blocks.append((start, start + length)) + + if matched_blocks: + start_index = min(start for start, _ in matched_blocks) + end_index = max(end for _, end in matched_blocks) + length = end_index - start_index + + if length > max(len(sentence) * 0.35, min_length): + matches_span.append((start_index, end_index)) + + if matches_span: + # merge all matches into one span + final_span = min(start for start, _ in matches_span), max( + end for _, end in matches_span + ) + matches_span = [final_span] - return matches + return matches_span def find_start_end_phrase( diff --git a/libs/ktem/ktem/assets/css/main.css b/libs/ktem/ktem/assets/css/main.css index 82689a597..95b1f6349 100644 --- a/libs/ktem/ktem/assets/css/main.css +++ b/libs/ktem/ktem/assets/css/main.css @@ -277,7 +277,6 @@ span.icon { } pdfjs-viewer-element { - height: 100vh; height: 100dvh; } @@ -290,9 +289,8 @@ pdfjs-viewer-element { left: 0; top: 0; width: 100%; - height: 100%; - overflow: auto; - background-color: rgb(0, 0, 0); + height: 85dvh; + overflow: hidden; background-color: rgba(0, 0, 0, 0.4); } @@ -302,7 +300,7 @@ pdfjs-viewer-element { .modal-content { background-color: #fefefe; - height: 110%; + height: 100%; display: flex; flex-direction: column; } @@ -323,7 +321,7 @@ pdfjs-viewer-element { .modal-body { flex: 1; - overflow: auto; + overflow: hidden; } /* Switch checkbox styles */ diff --git a/libs/ktem/ktem/assets/js/main.js b/libs/ktem/ktem/assets/js/main.js index 7a9445d16..ad3c99117 100644 --- a/libs/ktem/ktem/assets/js/main.js +++ b/libs/ktem/ktem/assets/js/main.js @@ -32,7 +32,6 @@ function run() { globalThis.toggleChatColumn = (() => { /* get flex-grow value of chat_column */ let flex_grow = conv_column.style.flexGrow; - console.log("chat col", flex_grow); if (flex_grow == '0') { conv_column.style.flexGrow = '1'; conv_column.style.minWidth = default_conv_column_min_width; @@ -95,10 +94,24 @@ function run() { event.preventDefault(); // Prevent the default link behavior var citationId = event.target.getAttribute('id'); - await sleep(100); // Sleep for 500 milliseconds + await sleep(100); // Sleep for 100 milliseconds + + // check if modal is open + var modal = document.getElementById("pdf-modal"); var citation = document.querySelector('mark[id="' + citationId + '"]'); - if (citation) { - citation.scrollIntoView({ behavior: 'smooth' }); + + if (modal.style.display == "block") { + // trigger on click event of PDF Preview link + var detail_elem = citation; + // traverse up the DOM tree to find the parent element with tag detail + while (detail_elem.tagName.toLowerCase() != "details") { + detail_elem = detail_elem.parentElement; + } + detail_elem.getElementsByClassName("pdf-link").item(0).click(); + } else { + if (citation) { + citation.scrollIntoView({ behavior: 'smooth' }); + } } } } diff --git a/libs/ktem/ktem/assets/js/pdf_viewer.js b/libs/ktem/ktem/assets/js/pdf_viewer.js index 63c600e8b..2166edbec 100644 --- a/libs/ktem/ktem/assets/js/pdf_viewer.js +++ b/libs/ktem/ktem/assets/js/pdf_viewer.js @@ -43,16 +43,52 @@ function onBlockLoad () { modal.style.position = "fixed"; modal.style.width = "70%"; modal.style.left = "15%"; + modal.style.height = "100dvh"; } else { modal.style.position = old_position; modal.style.width = old_width; modal.style.left = old_left; + modal.style.height = "85dvh"; } }; } + globalThis.compareText = (search_phrase, page_label) => { + var iframe = document.querySelector("#pdf-viewer").iframe; + var innerDoc = (iframe.contentDocument) ? iframe.contentDocument : iframe.contentWindow.document; + + var query_selector = ( + "#viewer > div[data-page-number='" + + page_label + + "'] > div.textLayer > span" + ); + var page_spans = innerDoc.querySelectorAll(query_selector); + for (var i = 0; i < page_spans.length; i++) { + var span = page_spans[i]; + if ( + span.textContent.length > 4 && + ( + search_phrase.includes(span.textContent) || + span.textContent.includes(search_phrase) + ) + ) { + span.innerHTML = "" + span.textContent + ""; + } else { + // if span is already highlighted, remove it + if (span.querySelector(".highlight")) { + span.innerHTML = span.textContent; + } + } + } + } + + // Sleep function using Promise and setTimeout + function sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); + } + // Function to open modal and display PDF - globalThis.openModal = (event) => { + globalThis.openModal = async (event) => { event.preventDefault(); var target = event.currentTarget; var src = target.getAttribute("data-src"); @@ -66,8 +102,8 @@ function onBlockLoad () { if (current_src != src) { pdfViewer.setAttribute("src", src); } - pdfViewer.setAttribute("phrase", phrase); - pdfViewer.setAttribute("search", search); + // pdfViewer.setAttribute("phrase", phrase); + // pdfViewer.setAttribute("search", search); pdfViewer.setAttribute("page", page); var scrollableDiv = document.getElementById("chat-info-panel"); @@ -80,6 +116,10 @@ function onBlockLoad () { info_panel.style.display = "none"; } scrollableDiv.scrollTop = 0; + + /* search for text inside PDF page */ + await sleep(500); + compareText(search, page); } globalThis.assignPdfOnclickEvent = () => { @@ -93,7 +133,6 @@ function onBlockLoad () { var created_modal = document.getElementById("pdf-viewer"); if (!created_modal) { createModal(); - console.log("Created modal") } }