Not able to fetch all tables and figures when converting pdf into images #25

reema93jain · 2024-01-25T00:04:52Z

Hi Team,

I am using layoutparser for detecting tables and images.
When I just try to run code on individual png image file, model detects tables and figures correctly.
However, when I am using below code to convert pdf into images and detecting tables out of each page image, I am either not getting full image/table or sometimes get duplicates tables as well.

Can you please guide on how to refine below code and what I can try to resolve this issue? Thank you!

!pip install layoutparser
!pip install opencv-python numpy matplotlib

install detectron2:

!pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2'
!pip3 install pdf2image
!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!apt-get install poppler-utils

import os
from pdf2image import convert_from_path
import shutil
import cv2

import layoutparser as lp

PubLayNet

model = lp.models.Detectron2LayoutModel('lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.81],
label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"})

def save_detections(table_blocks, image, image_name, save_dir='/content/'):
for j in range(len(table_blocks)):
x_1, y_1, x_2, y_2 = table_blocks[0].block.x_1, table_blocks[0].block.y_1, table_blocks[0].block.x_2, table_blocks[0].block.y_2
cropped = image[int(y_1):int(y_2), int(x_1):int(x_2)]
cv2_imshow(cropped)
file_name = image_name+'_'+str(j)+'.jpg'
status = cv2.imwrite(save_dir+file_name, cropped)
if status:
print("Saved ", file_name)

def inference(images_dir):
table_blocks_list = []
# Getting images from the directory
for file in os.listdir(images_dir):
if file.endswith(".jpg"):
# Extract the image name (excluding the extension)
image_name = file[:-4]
# # Reading the image using OpenCV
image = cv2.imread(images_dir+'/'+file)
# OpenCV reads images in BGR format, convert to RGB
image = image[..., ::-1]
# Running Inference
layout = model.detect(image)

        # Extracting Tables
        table_blocks = lp.Layout([b for b in layout if b.type=="Table"])
        figure_blocks = lp.Layout([b for b in layout if b.type=='Figure'])

        table_blocks = lp.Layout([b for b in table_blocks \
               if not any(b.is_in(b_fig) for b_fig in figure_blocks)])
        h, w = image.shape[:2]

        left_interval = lp.Interval(0, w/2*1.05, axis='x').put_on_canvas(image)

        left_blocks = table_blocks.filter_by(left_interval, center=True)
        left_blocks.sort(key = lambda b:b.coordinates[1])

        right_blocks = [b for b in table_blocks if b not in left_blocks]
        right_blocks.sort(key = lambda b:b.coordinates[1])

        # And finally combine the two list and add the index
        # according to the order
        table_blocks = lp.Layout([b.set(id = idx) for idx, b in enumerate(left_blocks + right_blocks)])

        save_detections(table_blocks, image, image_name)

        table_blocks_list.append(table_blocks)
return table_blocks_list

def pdf_inference(pdfName):
# Converting each page to an image
# Get the current working directory
path = os.getcwd()
# Construct the full path to the PDF file
PDF_file = path + "/" + pdfName
# Create a directory to store converted images
if os.path.exists(path+'/pdf_images'):
shutil.rmtree(path+'/pdf_images')
os.mkdir(path+'/pdf_images')

# Convert each page of the PDF to an image
pages = convert_from_path(PDF_file, dpi=100, grayscale=True)
image_counter = 1

# Iterate over the pages
for page in pages:
    filename = "page_"+str(image_counter)+".jpg"

    # st.write(filename)
    filepath = path+"/pdf_images/" + filename
    
    # Save the page as a JPEG image in the 'pdf_images' directory
    page.save(f'{filepath}', 'JPEG')
    image_counter = image_counter + 1

#filelimit = image_counter-1

# Running inference on the images
table_blocks_list = inference(path+'/pdf_images')

#return table_blocks_list

test = pdf_inference('abc-Datasheet.pdf')

Thanks
Reema Jain

The text was updated successfully, but these errors were encountered:

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Not able to fetch all tables and figures when converting pdf into images #25

Not able to fetch all tables and figures when converting pdf into images #25

reema93jain commented Jan 25, 2024

Not able to fetch all tables and figures when converting pdf into images #25

Not able to fetch all tables and figures when converting pdf into images #25

Comments

reema93jain commented Jan 25, 2024

install detectron2:

PubLayNet