-
Notifications
You must be signed in to change notification settings - Fork 1
/
2_1_linux.py
28 lines (21 loc) · 1016 Bytes
/
2_1_linux.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# https://www.youtube.com/watch?v=Iu-HLy9hICg, 23:30
# Data normalisation: convert the source documents to plain-text ('txt') files.
import os
import subprocess
import sys

# Directory holding the documents to convert. It is resolved against the
# current working directory, so the script has to be started from the folder
# that contains "Data".
DATA_DIR = os.path.join(os.getcwd(), "Data")

# One entry per supported format: (file-name suffix, command prefix,
# writes_to_stdout). When writes_to_stdout is True the tool's standard output
# is redirected into the destination file; otherwise the destination path is
# passed to the tool as its last argument.
# NOTE(review): `timelimit -t5 -T3` presumably bounds the run time of
# catdoc/xls2csv; the flags are carried over unchanged from the original.
CONVERTERS = (
    ("rtf", ("unrtf", "--nopict", "--noremap", "--text"), True),
    ("doc", ("timelimit", "-t5", "-T3", "catdoc"), True),
    ("xls", ("timelimit", "-t5", "-T3", "xls2csv"), True),
    ("pdf", ("pdftotext", "-layout", "-nopgbrk"), False),
)


def build_command(source, dest):
    """Return the conversion command for *source*.

    Args:
        source: Path of the document to convert.
        dest: Path of the text file to produce.

    Returns:
        A ``(argv, writes_to_stdout)`` pair, or ``None`` when the name of
        *source* does not end with any supported suffix. ``argv`` is a list
        meant to be run without a shell, so quotes, ``$`` or ``;`` in a file
        name reach the tool verbatim instead of being interpreted.
    """
    for suffix, prefix, writes_to_stdout in CONVERTERS:
        if source.endswith(suffix):
            argv = list(prefix) + [source]
            if not writes_to_stdout:
                argv.append(dest)
            return argv, writes_to_stdout
    return None


def convert(source, dest):
    """Convert one document to text; unsupported files are skipped.

    Failures to start the tool or to open *dest* are reported on stderr and
    do not interrupt the caller, so one bad file cannot stop a whole batch.
    """
    plan = build_command(source, dest)
    if plan is None:
        return
    argv, writes_to_stdout = plan
    try:
        if writes_to_stdout:
            # Replaces the shell's `> dest` redirection: the file is created
            # (or truncated) before the tool runs.
            with open(dest, "wb") as out:
                subprocess.run(argv, stdout=out)
        else:
            # The tool writes dest itself; its own stdout is not needed.
            subprocess.run(argv, stdout=subprocess.DEVNULL)
    except OSError as err:
        # Raised e.g. when the converter binary is not installed.
        print("cannot convert {}: {}".format(source, err), file=sys.stderr)


def convert_all(data_dir=DATA_DIR):
    """Convert every supported document in *data_dir* to ``<name>.txt``.

    The destination keeps the original extension (``a.pdf`` becomes
    ``a.pdf.txt``) and is written next to the source file.
    """
    for name in os.listdir(data_dir):
        source = os.path.join(data_dir, name)
        convert(source, source + ".txt")


if __name__ == "__main__":
    convert_all()
# Support for other formats (e.g. Office 2007+, HTML) can be added in the same way.