-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf2csv.py
184 lines (147 loc) · 6.47 KB
/
pdf2csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
""" This is the core of the program, don't meddle with it unless you know what you
are doing. Any contribution would be appreciated. The program works in the following steps:
Step 1 - PDF to PNG conversion
Step 2 - PNG to TXT conversion
Step 3 - TXT to CSV conversion involving all the formatting
"""
# -------------------------------------------------------------
# Importing relevant libraries
import pytesseract
from PIL import Image
import os
from pdf2image import convert_from_path
from pdf2image.exceptions import (
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError
)
import csv
import argparse
from configparser import ConfigParser
import pathlib
# Command-line interface: input path, optional PDF password and optional page range.
# NOTE: parse_args() runs at module level, i.e. also when this file is imported.
parser = argparse.ArgumentParser(description='Convert the PDF file')
parser.add_argument(
    '-i', '--input',
    type=str,
    metavar='',
    help='Enter here your input file path',
)
parser.add_argument(
    '-p', '--password',
    type=str,
    metavar='',
    help='Enter the password here if your pdf is password protected',
)
parser.add_argument(
    '-fp', '--firstpage',
    type=int,
    metavar='',
    help='Enter the first page you want to convert',
)
parser.add_argument(
    '-lp', '--lastpage',
    type=int,
    metavar='',
    help='Enter the last page you want to convert',
)
args = parser.parse_args()
# Page counter shared between the conversion steps through `global totalImages`.
totalImages = 0
# This is basically the main() function
def pdf_to_csv(filename, first_page, last_page, userpw):
    """Run the three conversion steps (PDF -> PNG -> TXT -> CSV) for one PDF.

    Args:
        filename: Path of the PDF file, handed to pdf_to_png.
        first_page: First page to convert, handed to pdf_to_png.
        last_page: Last page to convert, handed to pdf_to_png.
        userpw: PDF password, handed to pdf_to_png.
    """
    print_header()
    # Read config.ini a single time instead of re-parsing it for every page.
    poppler_loc, tesseract_loc = load_config()
    # Step 1 - PDF to PNG
    pdf_to_png(filename, first_page, last_page, userpw, popplerLoc=poppler_loc)
    # pdf_to_png publishes the number of pages it wrote in the module-level
    # totalImages, so it must be read only after step 1 has finished.
    page_count = totalImages
    # Step 2 - PNG to TXT
    for page in range(page_count):
        png_to_txt(tesseractLoc=tesseract_loc,
                   filename="images/image" + str(page) + ".png")
    # Step 3 - TXT to CSV
    for page in range(page_count):
        txt_to_csv("texts/text" + str(page) + ".txt")
# Shared helper: the conversion steps use it to get a path object for a file name.
def get_path_of_source(filename):
    """Return *filename* wrapped in a ``pathlib.Path``."""
    return pathlib.Path(filename)
# Prints the three-line banner of the converter.
def print_header():
    """Write the "PDF to CSV Converter" banner to standard output."""
    banner = (
        "|----------------------------------------|",
        "|---------PDF to CSV Converter-----------|",
        "|----------------------------------------|",
    )
    for banner_line in banner:
        print(banner_line)
## ConfigParser implementation
def load_config():
    """Read the external tool locations from ``config.ini``.

    The file is looked up relative to the current working directory.

    Returns:
        A ``(popplerLoc, tesseractLoc)`` tuple with the ``PopplerPath`` and
        ``TesseractPath`` values of the ``[settings]`` section.
    """
    settings = ConfigParser()
    settings.read('config.ini')
    return tuple(
        settings.get('settings', option)
        for option in ('PopplerPath', 'TesseractPath')
    )
""" STEP 1 - Conversion of the pdf to img using pdf2image """
def pdf_to_png(filename, firstpage, lastpage, userpw, popplerLoc):
    """Render the requested PDF pages and save each one as ``images/image<i>.png``.

    Side effects: sets the module-level ``totalImages`` to the number of
    rendered pages and prints one progress line per saved image.

    Args:
        filename: Path of the PDF file, passed to ``convert_from_path``.
        firstpage: Passed to ``convert_from_path`` as ``first_page``.
        lastpage: Passed to ``convert_from_path`` as ``last_page``.
        userpw: Passed to ``convert_from_path`` as ``userpw``.
        popplerLoc: Passed to ``convert_from_path`` as ``poppler_path``.
    """
    global totalImages
    # Make sure the folders written to below exist (this also creates "images").
    os.makedirs("./images/imageData", exist_ok=True)
    images = convert_from_path(
        filename,
        dpi=500,
        first_page=firstpage,
        last_page=lastpage,
        userpw=userpw,
        poppler_path=popplerLoc,
        output_folder="./images/imageData",
    )
    totalImages = len(images)
    for i, image in enumerate(images):
        output_name = "images/image" + str(i) + ".png"
        # Write real PNG data so the content matches the ".png" extension;
        # the previous code stored JPEG data under the ".png" name.
        image.save(output_name, 'PNG')
        print("Converted to PNG...Saving to : {}".format(output_name))
# Helper of png_to_txt (defined right after this function).
def save_to_file_as_txt(filename, text):
    """Write *text* to *filename* with its extension replaced by ``.txt``.

    Prints the destination path before writing.
    """
    target = get_path_of_source(filename).with_suffix('.txt')
    print("Converted to TXT...Saving to : {}".format(target))
    with open(target, 'w') as out_file:
        # Writes every element yielded by iterating *text*, one after another.
        out_file.writelines(text)
""" STEP 2 - Converting PNG to TXT using Tesseract-OCR """
def png_to_txt(tesseractLoc, filename):
    """Run Tesseract OCR on one image and store the recognised text as a .txt file.

    Args:
        tesseractLoc: Location of the Tesseract executable, assigned to
            ``pytesseract.pytesseract.tesseract_cmd``.
        filename: Path of the image to read.
    """
    # This is added so that python knows where the location of tesseract-OCR is
    pytesseract.pytesseract.tesseract_cmd = tesseractLoc
    sourceImg = get_path_of_source(filename)
    # Open the image in a context manager so its file handle is released as
    # soon as OCR is done instead of staying open for every converted page.
    with Image.open(sourceImg) as img:
        # Every "image" in the image's path becomes "text", e.g.
        # images/image0.png -> texts/text0.png (the suffix is swapped on save).
        text_name = str(img.filename).replace("image", "text")
        text = pytesseract.image_to_string(img)
    # Make sure the destination folder exists before the text file is written.
    pathlib.Path(text_name).parent.mkdir(parents=True, exist_ok=True)
    save_to_file_as_txt(text_name, text)
"""Step 3 - Converting TXT to CSV """
def _strip_csv_breakers(line):
    """Return *line* without commas and double quotes."""
    # Commas are removed so numbers such as 12,000 are not split into two cells;
    # double quotes are removed because they made Excel read a row as one string.
    return line.replace(",", "").replace("\"", "")


def _pair_lines_into_rows(lines):
    """Join the non-blank lines, two at a time, into "first,second" CSV rows."""
    # The first line is treated as extra data and dropped. Slicing (rather than
    # pop(0)) keeps an empty input from raising IndexError.
    cells = [line.replace('\n', '') for line in lines[1:]]
    pieces = []
    is_first = True
    for cell in cells:
        # Blank lines are skipped and do not change which column comes next.
        if cell == "" or cell == " ":
            continue
        pieces.append(cell)
        pieces.append("," if is_first else "\n")
        is_first = not is_first
    return "".join(pieces)


def txt_to_csv(filename):
    """Convert one text file into a two-column CSV file.

    The output path is *filename* with a ``.csv`` suffix and every "text" in
    the path replaced by "csv" (``texts/text0.txt`` -> ``csvs/csv0.csv``).
    The destination path is printed before the file is written.

    Args:
        filename: Path of the text file to convert.
    """
    # Read through a context manager so the input handle is always closed.
    with open(filename) as text_file:
        lines = text_file.readlines()
    # A former .replace("/n", ",") step is intentionally gone: "/n" is not a
    # newline, so it only corrupted text that literally contained "/n".
    cleaned = [_strip_csv_breakers(line) for line in lines]
    csv_name = str(pathlib.Path(filename).with_suffix('.csv')).replace("text", "csv")
    print("Converted to CSV...Saving to : {}".format(csv_name))
    # Make sure the destination folder exists before writing.
    pathlib.Path(csv_name).parent.mkdir(parents=True, exist_ok=True)
    with open(csv_name, 'w') as fout:
        fout.write(_pair_lines_into_rows(cleaned))
# this makes sure that the functions get executed only when the .py is run as main file not as a module
# thereby making it useful for implementation in some other program
if __name__ == '__main__':
    # -i/--input is optional for the parser, so it can be missing here; report
    # that as a usage error instead of passing None on to the conversion.
    if not args.input:
        parser.error("an input file is required: use -i/--input")
    pdf_to_csv(args.input, args.firstpage, args.lastpage, args.password)