-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcode.py
164 lines (99 loc) · 3.21 KB
/
code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python
# -*- coding: utf8 -*- #https://www.python.org/dev/peps/pep-0263/
import os
import sys
import io, csv, argparse
from bs4 import BeautifulSoup
import math
# Command-line parameters: the single, mandatory option is the XML file to
# convert.  No default is given because the option is required, so a default
# could never take effect.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-f', '--file',
    required=True,
    type=str,
    dest="file",
    metavar="<xml file parser>",
    help="please use this command first 'pdftohtml -xml -i file.pdf' if you haven't used yet or use python main.py filename.xml > /path/to/destination.csv",
)
args = parser.parse_args()
# Collect and store the text cells of every page in "data", laid out as
#     data[page number][top offset][left offset] = cell text
# so that all cells sharing a "top" offset end up in the same line.
#
# The input is opened in binary mode so the XML parser can honour the encoding
# declared inside the document, and the handle is closed as soon as the tree
# has been built.
with open(args.file, "rb") as xml_file:
    soup = BeautifulSoup(xml_file, "lxml-xml")

data = {}
for page in soup.find_all('page'):
    pgnb = int(page.get('number'))
    data[pgnb] = {}
    for text in page.find_all('text'):
        top = int(text.get('top'))
        left = int(text.get('left'))
        # Create the line on first sight of this "top" offset, then file the
        # cell under its "left" offset.
        data[pgnb].setdefault(top, {})[left] = text.get_text()
# Expected number of cells per line: the size of the fullest line found in
# any page.  The leading 0 keeps the result at 0 when no line was collected.
numbcells = max(
    [0] + [len(data[page][line]) for page in data for line in data[page]]
)
# Recover the approximate positions of the columns: count how many cells
# start at each "left" offset across the whole document.
pos_of_cols = {}
for page in data:
    for line in data[page]:
        for cell in data[page][line]:
            pos_of_cols[cell] = pos_of_cols.get(cell, 0) + 1

# Keep the most frequent offsets, limited to the number of cells expected per
# line, then order the survivors from left to right.  The sort is stable, so
# offsets with equal counts keep the order in which they were first counted.
by_frequency = sorted(pos_of_cols, key=pos_of_cols.get, reverse=True)
cols = sorted(by_frequency[:numbcells])
# Give every column an interval [mini, maxi] of "left" offsets that are
# attributed to it.
margin = 10
ranges = {}
last_index = len(cols) - 1
for index, position in enumerate(cols):
    # The first column starts at 0; the others start "margin" before their
    # own position.
    lower = 0 if index == 0 else position - margin
    # A column stops just before the next column's interval begins.  The last
    # column has no right-hand neighbour and reaches twice its own position.
    if index == last_index:
        upper = position * 2
    else:
        upper = cols[index + 1] - margin - 1
    ranges[position] = {'mini': lower, 'maxi': upper}
# Sort the data and print it as CSV on standard output: pages in ascending
# order, lines from top to bottom, one CSV row per line, every value quoted.
#
# Python 2's csv module writes byte strings, so cell text is encoded to UTF-8
# there; Python 3's csv module works on text and needs no encoding step.
running_py2 = sys.version_info[0] == 2
writer = csv.writer(sys.stdout, quoting=csv.QUOTE_NONNUMERIC,
                    lineterminator="\n")
for page in sorted(data):
    for line in sorted(data[page]):
        cells = data[page][line]
        row = {}
        if len(cells) < numbcells:
            # Fewer cells than expected: start from an empty value for every
            # expected column...
            for r in ranges:
                row[r] = ''
            # ...then put each cell into the column whose interval contains
            # its "left" offset.  If two cells of a line fall into the same
            # interval, the right-most one wins.
            for cell in sorted(cells):
                ok = False
                for r in ranges:
                    if ranges[r]['mini'] <= cell <= ranges[r]['maxi']:
                        row[r] = cells[cell]
                        ok = True
                if not ok:
                    # Report on stderr so the CSV on stdout stays clean when
                    # it is redirected to a file.
                    sys.stderr.write("%s\nNot found in ranges\n" % cell)
        else:
            # A full line: every cell is its own column.
            row = dict(cells)
        # Columns are emitted from left to right.
        values = [row[key] for key in sorted(row)]
        if running_py2:
            values = [value.encode('utf8') for value in values]
        writer.writerow(values)