-
Notifications
You must be signed in to change notification settings - Fork 1
/
scannedpdftext.py
42 lines (32 loc) · 1.22 KB
/
scannedpdftext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os
class ScannedPDFText():
    """Extract text from a scanned (image-only) PDF via Tesseract OCR.

    The class keeps no state; it exists to group the conversion routine.
    """

    def __init__(self):
        """Create a converter. There is nothing to configure."""

    def textconvert(self, PDF_FILE, dpi=100):
        """Return the OCR text of all pages of ``PDF_FILE`` as one string.

        Each page is rendered to an image with ``convert_from_path`` and
        passed to ``pytesseract.image_to_string``. The per-page results are
        concatenated in page order.

        Args:
            PDF_FILE: Path of the PDF to read, forwarded unchanged to
                ``convert_from_path``.
            dpi: Rendering resolution handed to ``convert_from_path``.
                Defaults to 100, the value this method has always used.

        Returns:
            The recognized text of every page, with hyphenated line breaks
            (``"-\\n"``) removed. An empty string when the PDF yields no
            pages.

        Note:
            Pages are handed to Tesseract in memory; no intermediate
            ``page_N.jpg`` files are written to the working directory.
            Errors raised by pdf2image or pytesseract propagate unchanged.
        """
        # Render every page of the PDF to an in-memory PIL image.
        pages = convert_from_path(PDF_FILE, dpi)

        page_texts = []
        for page in pages:
            # image_to_string accepts a PIL image directly, so there is no
            # need for a lossy JPEG round-trip through the filesystem.
            recognized = str(pytesseract.image_to_string(page))
            # Re-join words that OCR split across lines with a hyphen.
            page_texts.append(recognized.replace('-\n', ''))

        # Keep the text of every page, not only the last one processed.
        return "".join(page_texts)