Fix the bug where Python scripts fail to execute PDF text recognition tasks, optimize the logic for detecting PDF files, and add a layout analysis example to the quickstart documents.
PaddlePaddle · Apr 24, 2024 · 49646e3 · 49646e3
1 parent 00f0d42
commit 49646e3
Show file tree

Hide file tree

Showing 3 changed files with 84 additions and 28 deletions.
diff --git a/paddleocr.py b/paddleocr.py
@@ -561,6 +561,7 @@ def check_img(img, alpha_color=(255, 255, 255)):
         alpha_color: Background color in images in RGBA format
         return: numpy.array (h, w, 3)
     """
+    flag_gif, flag_pdf = False, False
     if isinstance(img, bytes):
         img = img_decode(img)
     if isinstance(img, str):
@@ -589,17 +590,17 @@ def check_img(img, alpha_color=(255, 255, 255)):
                     img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
                 except:
                     logger.error("error in loading image:{}".format(image_file))
-                    return None
+                    return None, flag_gif, flag_pdf
         if img is None:
             logger.error("error in loading image:{}".format(image_file))
-            return None
+            return None, flag_gif, flag_pdf
     # single channel image array.shape:h,w
     if isinstance(img, np.ndarray) and len(img.shape) == 2:
         img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
     # four channel image array.shape:h,w,c
     if isinstance(img, np.ndarray) and len(img.shape) == 3 and img.shape[2] == 4:
         img = alpha_to_color(img, alpha_color)
-    return img
+    return img, flag_gif, flag_pdf
 
 
 class PaddleOCR(predict_system.TextSystem):
@@ -700,9 +701,9 @@ def ocr(
                 "Since the angle classifier is not initialized, it will not be used during the forward process"
             )
 
-        img = check_img(img, alpha_color)
+        img, flag_gif, flag_pdf = check_img(img, alpha_color)
         # for infer pdf file
-        if isinstance(img, list):
+        if isinstance(img, list) and flag_pdf:
             if self.page_num > len(img) or self.page_num == 0:
                 imgs = img
             else:
@@ -837,7 +838,16 @@ def __call__(
         img_idx=0,
         alpha_color=(255, 255, 255),
     ):
-        img = check_img(img, alpha_color)
+        img, flag_gif, flag_pdf = check_img(img, alpha_color)
+        if isinstance(img, list) and flag_pdf:
+            res_list = []
+            for index, pdf_img in enumerate(img):
+                logger.info("processing {}/{} page:".format(index + 1, len(img)))
+                res, _ = super().__call__(
+                    pdf_img, return_ocr_result_in_table, img_idx=index
+                )
+                res_list.append(res)
+            return res_list
         res, _ = super().__call__(img, return_ocr_result_in_table, img_idx=img_idx)
         return res
 

diff --git a/ppstructure/docs/quickstart.md b/ppstructure/docs/quickstart.md
@@ -12,10 +12,11 @@
   - [2.2 Python脚本使用](#22-Python脚本使用)
     - [2.2.1 图像方向分类+版面分析+表格识别](#221-图像方向分类版面分析表格识别)
     - [2.2.2 版面分析+表格识别](#222-版面分析表格识别)
-    - [2.2.3 版面分析](#223-版面分析)
-    - [2.2.4 表格识别](#224-表格识别)
-    - [2.2.5 关键信息抽取](#225-关键信息抽取)
-    - [2.2.6 版面恢复](#226-版面恢复)
+    - [2.2.3 版面分析+文本识别](#223-版面分析文本识别)
+    - [2.2.4 版面分析](#224-版面分析)
+    - [2.2.5 表格识别](#225-表格识别)
+    - [2.2.6 关键信息抽取](#226-关键信息抽取)
+    - [2.2.7 版面恢复](#227-版面恢复)
   - [2.3 返回结果说明](#23-返回结果说明)
     - [2.3.1 版面分析+表格识别](#231-版面分析表格识别)
     - [2.3.2 关键信息抽取](#232-关键信息抽取)
@@ -189,7 +190,29 @@ im_show.save('result.jpg')
 ```
 
 <a name="223"></a>
-#### 2.2.3 版面分析
+#### 2.2.3 版面分析+文本识别
+
+```python
+import os
+import cv2
+from paddleocr import PPStructure,save_structure_res
+
+ocr_engine = PPStructure(table=False, ocr=True, show_log=True)
+
+save_folder = './output'
+img_path = 'ppstructure/recovery/UnrealText.pdf'
+result = ocr_engine(img_path)
+for index, res in enumerate(result):
+    save_structure_res(res, save_folder, os.path.basename(img_path).split('.')[0], index)
+
+for res in result:
+    for line in res:
+        line.pop('img')
+        print(line)
+```
+
+<a name="224"></a>
+#### 2.2.4 版面分析
 
 ```python
 import os
@@ -209,9 +232,9 @@ for line in result:
     print(line)
 ```
 
-<a name="224"></a>
+<a name="225"></a>
 
-#### 2.2.4 表格识别
+#### 2.2.5 表格识别
 
 ```python
 import os
@@ -231,14 +254,14 @@ for line in result:
     print(line)
 ```
 
-<a name="225"></a>
-#### 2.2.5 关键信息抽取
+<a name="226"></a>
+#### 2.2.6 关键信息抽取
 
 关键信息抽取暂不支持通过whl包调用，详细使用教程请参考：[inference文档](./inference.md)。
 
-<a name="226"></a>
+<a name="227"></a>
 
-#### 2.2.6 版面恢复
+#### 2.2.7 版面恢复
 
 ```python
 import os

diff --git a/ppstructure/docs/quickstart_en.md b/ppstructure/docs/quickstart_en.md
@@ -12,10 +12,11 @@
   - [2.2 Use by python script](#22-use-by-python-script)
     - [2.2.1 image orientation + layout analysis + table recognition](#221-image-orientation--layout-analysis--table-recognition)
     - [2.2.2 layout analysis + table recognition](#222-layout-analysis--table-recognition)
-    - [2.2.3 layout analysis](#223-layout-analysis)
-    - [2.2.4 table recognition](#224-table-recognition)
-    - [2.2.5 Key Information Extraction](#225-Key-Information-Extraction)
-    - [2.2.6 layout recovery](#226-layout-recovery)  
+    - [2.2.3 layout analysis + text recognition](#223-layout-analysis--text-recognition)
+    - [2.2.4 layout analysis](#224-layout-analysis)
+    - [2.2.5 table recognition](#225-table-recognition)
+    - [2.2.6 Key Information Extraction](#226-Key-Information-Extraction)
+    - [2.2.7 layout recovery](#227-layout-recovery)  
   - [2.3 Result description](#23-result-description)
     - [2.3.1 layout analysis + table recognition](#231-layout-analysis--table-recognition)
     - [2.3.2 Key Information Extraction](#232-Key-Information-Extraction)
@@ -172,7 +173,29 @@ im_show.save('result.jpg')
 ```
 
 <a name="223"></a>
-#### 2.2.3 layout analysis
+#### 2.2.3 layout analysis + text recognition
+
+```python
+import os
+import cv2
+from paddleocr import PPStructure,save_structure_res
+
+ocr_engine = PPStructure(table=False, ocr=True, show_log=True)
+
+save_folder = './output'
+img_path = 'ppstructure/recovery/UnrealText.pdf'
+result = ocr_engine(img_path)
+for index, res in enumerate(result):
+    save_structure_res(res, save_folder, os.path.basename(img_path).split('.')[0], index)
+
+for res in result:
+    for line in res:
+        line.pop('img')
+        print(line)
+```
+
+<a name="224"></a>
+#### 2.2.4 layout analysis
 
 ```python
 import os
@@ -192,8 +215,8 @@ for line in result:
     print(line)
 ```
 
-<a name="224"></a>
-#### 2.2.4 table recognition
+<a name="225"></a>
+#### 2.2.5 table recognition
 
 ```python
 import os
@@ -213,13 +236,13 @@ for line in result:
     print(line)
 ```
 
-<a name="225"></a>
-#### 2.2.5 Key Information Extraction
+<a name="226"></a>
+#### 2.2.6 Key Information Extraction
 
 Key information extraction does not currently support use by the whl package. For detailed usage tutorials, please refer to: [Key Information Extraction](../kie/README.md).
 
-<a name="226"></a>
-#### 2.2.6 layout recovery
+<a name="227"></a>
+#### 2.2.7 layout recovery
 
 ```python
 import os