[RELEASE] Morpheus v24.06.01 (#1863)

## ❄️ Code freeze for `branch-24.06` and `v24.06` release

### What does this mean?

Only critical/hotfix level issues should be merged into `branch-24.06` until release (merging of this PR).

All other development PRs should be retargeted towards the next release branch: `branch-24.10`.

### What is the purpose of this PR?

- Update documentation
- Allow testing for the new release
- Enable a means to merge `branch-24.06` into `main` for the release
nv-morpheus · Aug 24, 2024 · b18188a · b18188a
2 parents 7b9d948 + 951b7ec
commit b18188a
Show file tree

Hide file tree

Showing 7 changed files with 27 additions and 20 deletions.
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -43,6 +43,14 @@ permissions:
 
 jobs:
 
+  pr-builder:
+    needs:
+      - prepare
+      - checks
+      - ci_pipe
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02
+
   prepare:
     # Executes the get-pr-info action to determine if the PR has the skip-ci label, if the action fails we assume the
     # PR does not have the label
@@ -91,13 +99,3 @@ jobs:
       test_container: nvcr.io/ea-nvidia-morpheus/morpheus:morpheus-ci-test-240614
     secrets:
       NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
-
-  pr-builder:
-    # Always run this step even if others are skipped or cancelled
-    if: '!cancelled()'
-    needs:
-      - prepare
-      - checks
-      - ci_pipe
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 -->
+
+# Morpheus 24.06.01 (23 Aug 2024)
+
+## 🛠️ Improvements
+- Replace pdf parsing libs ([#1861](https://github.com/nv-morpheus/Morpheus/pull/1861)) [@dagardner-nv](https://github.com/dagardner-nv)
+
 # Morpheus 24.06.00 (03 Jul 2024)
 ## 🚨 Breaking Changes
 

diff --git a/conda/environments/all_cuda-121_arch-x86_64.yaml b/conda/environments/all_cuda-121_arch-x86_64.yaml
@@ -83,6 +83,7 @@ dependencies:
 - pydantic
 - pylint=3.0.3
 - pypdf=3.17.4
+- pypdfium2=4.30
 - pytest-asyncio
 - pytest-benchmark=4.0
 - pytest-cov
@@ -120,7 +121,6 @@ dependencies:
 - pip:
   - --find-links https://data.dgl.ai/wheels-test/repo.html
   - --find-links https://data.dgl.ai/wheels/cu121/repo.html
-  - PyMuPDF==1.23.*
   - databricks-cli < 0.100
   - databricks-connect
   - dgl==2.0.0

diff --git a/conda/environments/dev_cuda-121_arch-x86_64.yaml b/conda/environments/dev_cuda-121_arch-x86_64.yaml
@@ -67,6 +67,7 @@ dependencies:
 - pybind11-stubgen=0.10.5
 - pydantic
 - pylint=3.0.3
+- pypdfium2=4.30
 - pytest-asyncio
 - pytest-benchmark=4.0
 - pytest-cov
@@ -98,7 +99,6 @@ dependencies:
 - yapf=0.40.1
 - zlib=1.2.13
 - pip:
-  - PyMuPDF==1.23.*
   - databricks-cli < 0.100
   - databricks-connect
   - milvus==2.3.5

diff --git a/conda/environments/examples_cuda-121_arch-x86_64.yaml b/conda/environments/examples_cuda-121_arch-x86_64.yaml
@@ -44,6 +44,7 @@ dependencies:
 - pluggy=1.3
 - pydantic
 - pypdf=3.17.4
+- pypdfium2=4.30
 - python-confluent-kafka>=1.9.2,<1.10.0a0
 - python-docx==1.1.0
 - python-graphviz
@@ -67,7 +68,6 @@ dependencies:
 - pip:
   - --find-links https://data.dgl.ai/wheels-test/repo.html
   - --find-links https://data.dgl.ai/wheels/cu121/repo.html
-  - PyMuPDF==1.23.*
   - databricks-cli < 0.100
   - databricks-connect
   - dgl==2.0.0

diff --git a/dependencies.yaml b/dependencies.yaml
@@ -364,14 +364,14 @@ dependencies:
       - output_types: [conda]
         packages:
           - &nodejs nodejs=18.*
+          - &pypdfium2 pypdfium2=4.30
           - pytest-asyncio
           - pytest-benchmark=4.0
           - pytest-cov
           - pytest=7.4.4
           - &python-docx python-docx==1.1.0
           - pip
           - pip:
-              - &PyMuPDF PyMuPDF==1.23.*
               - pytest-kafka==0.6.0
 
   example-dfp-prod:
@@ -410,6 +410,7 @@ dependencies:
           - onnx=1.15
           - openai=1.13
           - pypdf=3.17.4
+          - *pypdfium2
           - *python-docx
           - requests-toolbelt=1.0 # Transitive dep needed by nemollm, specified here to ensure we get a compatible version
           - sentence-transformers=2.7
@@ -420,7 +421,6 @@ dependencies:
             - faiss-gpu==1.7.*
             - google-search-results==2.4
             - nemollm==0.3.5
-            - *PyMuPDF
 
   model-training-tuning:
     common:

diff --git a/examples/llm/vdb_upload/module/content_extractor_module.py b/examples/llm/vdb_upload/module/content_extractor_module.py
@@ -22,11 +22,11 @@
 from typing import Dict
 from typing import List
 
-import fitz
 import fsspec
 import mrc
 import mrc.core.operators as ops
 import pandas as pd
+import pypdfium2 as libpdfium
 from docx import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from pydantic import BaseModel  # pylint: disable=no-name-in-module
@@ -172,10 +172,13 @@ def wrapper(input_info: ConverterInputInfo, *args, **kwargs):
 @_converter_error_handler
 def _pdf_to_text_converter(input_info: ConverterInputInfo) -> str:
     text = ""
-    pdf_document = fitz.open(stream=input_info.io_bytes, filetype="pdf")
-    for page_num in range(pdf_document.page_count):
-        page = pdf_document[page_num]
-        text += page.get_text()
+    pdf_document = libpdfium.PdfDocument(input_info.io_bytes)
+    for page_idx in range(len(pdf_document)):
+        page = pdf_document.get_page(page_idx)
+        textpage = page.get_textpage()
+        page_text = textpage.get_text_bounded()
+        text += page_text
+
     return text