-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathchrome_bookmarks_parser.py
232 lines (194 loc) · 8.77 KB
/
chrome_bookmarks_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
""" Takes the path of chrome bookmarks export file (in HTML format) parse it,
then create a corresponding JSON file with all the links (name, url, and
attributes like add date or icon) and folders (name, add and last modified
date) following the nested folder structure.
Usage: chrome_bookmarks_parser.py <bookmarks_files>
Original Author: P1tr0w
Source: https://gist.github.com/P1tr0w/a505ad6c851f2b7f1c7d82799d2a5df0
"""
import re
import json
import operator
import os
import argparse
from functools import reduce
class BookmarkParser():
    """Convert Chrome HTML bookmark exports to JSON and ``.url`` shortcuts.

    For each parsed file the parser builds a nested dict rooted at the key
    ``'root'``: every folder is a dict holding its links, its sub-folders and
    a ``'meta'`` entry with the folder dates; every link is a dict with the
    keys ``'url'``, ``'add_date'`` and ``'icon'``. The dict is dumped as JSON
    to the input path with its extension replaced by ``.json`` and mirrored
    under ``outdir`` as directories containing ``.url`` shortcut files.
    """

    def __init__(self, outdir=".", output_indent=4, ensure_ascii=False,
                 encoding_input='utf8', encoding_output='utf8'):
        """Store the output and encoding settings.

        Args:
            outdir: directory under which the shortcut tree is created.
            output_indent: ``indent`` value passed to ``json.dump``.
            ensure_ascii: ``ensure_ascii`` value passed to ``json.dump``.
            encoding_input: encoding used to read the HTML export.
            encoding_output: encoding used to write the JSON file.
        """
        self.encoding_input = encoding_input
        self.encoding_output = encoding_output
        self.output_indent = output_indent
        self.ensure_ascii = ensure_ascii
        self.outdir = outdir

    def parse(self, infile):
        """Parse ``infile`` and write the JSON file and the shortcut tree."""
        # Read the input file and keep every line stripped of surrounding
        # whitespace, so tags can be recognised by position.
        with open(infile, encoding=self.encoding_input) as f:
            self._file_lines = [line.strip()
                                for line in f.read().splitlines()]
        # self._tree: dict with the whole structure.
        # root: key that holds all the structure in its value, it's used
        # to have a root directory that is not the tree itself.
        self._tree = {'root': {}}
        # Path of the current folder through the iteration, used by
        # _change_folder.
        self._path = ['root']
        # Reference to the current folder inside the tree.
        self._folder = self._tree
        self._change_folder()
        self._iterate()
        self._save_json(outfile=change_file_extension(infile, "json"))
        self._save_to_clickable_link(outdir=self.outdir)

    def _change_folder(self):
        """ Change the folder reference to the last folder in path. """
        self._folder = get_by_path(self._tree, self._path)

    def _iterate(self):
        """ Iterate HTML lines to parse structure from <DT H3 and </DL, and
        data from <DT H3 and <DT A
        """
        for line in self._file_lines:
            # If the first tag is DT
            if line[1:3] == 'DT':
                # Slices (not indexes) are used so that a short line such
                # as a bare "<DT>" is ignored instead of raising.
                if line[5:6] == 'A':
                    # It's a link: store it in the current folder.
                    self._handle_link(line)
                elif line[5:7] == 'H3':
                    # It's a folder: create it inside the current folder
                    # and make it the current folder.
                    self._handle_folder(line)
            # If tag is /DL we closed the current folder: pop it from the
            # path and change folder.
            elif line[1:4] == '/DL':
                # An unbalanced extra </DL> must not pop an empty path.
                if self._path:
                    self._path.pop()
                self._change_folder()

    @staticmethod
    def _get_attribute(line, attribute, default=''):
        """Return the quoted value of ``attribute`` in ``line``.

        ``default`` is returned when the attribute is absent, so optional
        attributes do not abort the whole parse.
        """
        match = re.search(attribute + r'="(.*?)"', line)
        return match.group(1) if match else default

    @staticmethod
    def _unique_key(folder, name):
        """Return ``name``, or ``name (N)`` if ``name`` is unavailable.

        Entries are stored in a dict keyed by title, so two entries with the
        same title in one folder would otherwise overwrite each other.
        ``'meta'`` is reserved for the folder dates and is never handed out.
        """
        if name != 'meta' and name not in folder:
            return name
        counter = 2
        candidate = '{} ({})'.format(name, counter)
        while candidate in folder:
            counter += 1
            candidate = '{} ({})'.format(name, counter)
        return candidate

    @staticmethod
    def _safe_name(name):
        """Return ``name`` as a single, harmless path component."""
        cleaned = get_valid_filename(name)
        # get_valid_filename keeps dots, so '.' and '..' (and an empty
        # result) must be replaced to stay inside the target directory.
        if cleaned in ('', '.', '..'):
            return '_'
        return cleaned

    def _handle_link(self, line):
        """Parse a ``<DT><A ...>`` line and add the link to the folder."""
        titles = re.findall(r"<A.*?>(.*?)</A>", line)
        name = titles[-1] if titles else ''
        info = {
            'url': self._get_attribute(line, 'HREF'),
            'add_date': self._get_attribute(line, 'ADD_DATE'),
            # The icon is optional; it defaults to an empty string.
            'icon': self._get_attribute(line, 'ICON'),
        }
        self._folder[self._unique_key(self._folder, name)] = info

    @classmethod
    def _parse_folder_line(cls, line):
        """Return ``(name, meta)`` parsed from a ``<DT><H3 ...>`` line."""
        # Capture everything between the H3 tags so that titles with
        # punctuation (e.g. "C++ / Tools") are kept intact.
        match = re.search(r'<H3[^>]*>(.*?)</H3>', line)
        name = match.group(1) if match else ''
        meta = {
            'add_date': cls._get_attribute(line, 'ADD_DATE'),
            'last_modified': cls._get_attribute(line, 'LAST_MODIFIED'),
        }
        return name, meta

    def _handle_folder(self, line):
        """Create the folder described by ``line`` and descend into it."""
        name, info = self._parse_folder_line(line)
        key = self._unique_key(self._folder, name)
        self._folder[key] = {'meta': info}
        self._path.append(key)
        self._change_folder()

    def _save_json(self, outfile):
        """ Save the tree dict to a json file. """
        with open(outfile, 'w', encoding=self.encoding_output) as handle:
            json.dump(self._tree, handle, indent=self.output_indent,
                      ensure_ascii=self.ensure_ascii)

    def _save_to_clickable_link(self, outdir):
        """ Output the parsed tree to the disk by creating directories and
        clickable links.

        The directories mirror the folder nesting below ``<outdir>/root``.
        Folder and link names are sanitised before being used as path
        components.
        """
        root_dir = os.path.join(outdir, "root")
        os.makedirs(root_dir, exist_ok=True)
        # Stack of (folder dict, directory that represents it on disk).
        pending = [(self._tree["root"], root_dir)]
        while pending:
            folder, folder_dir = pending.pop()
            for name, entry in folder.items():
                if name == "meta" or not isinstance(entry, dict):
                    continue
                # A link stores its URL as a string; a folder that merely
                # contains an entry titled "url" stores a dict there.
                if isinstance(entry.get("url"), str):
                    create_chrome_windows_bookmark(
                        filepath=os.path.join(folder_dir,
                                              self._safe_name(name)),
                        url=entry["url"]
                    )
                else:
                    subdir = os.path.join(folder_dir, self._safe_name(name))
                    os.makedirs(subdir, exist_ok=True)
                    pending.append((entry, subdir))
def change_file_extension(filename, new_extension):
    """ Swap the extension of a file for a provided `new_extension`.

    `new_extension` may be given with or without its leading dot. When
    `filename` has no extension, the new one is simply appended.

    Example:
    >>> change_file_extension("file.txt", "md")
    'file.md'
    >>> change_file_extension("file.txt", ".md")
    'file.md'
    """
    # Drop one leading dot so that ".md" does not produce "file..md".
    if new_extension.startswith("."):
        new_extension = new_extension[1:]
    stem = os.path.splitext(filename)[0]
    return stem + "." + new_extension
def get_by_path(dictionary, map_list):
    """ Return the object reached by indexing `dictionary` with each item
    of `map_list` in turn. An empty `map_list` returns `dictionary` itself.

    Source:
    * https://stackoverflow.com/a/14692747/4406340
    """
    node = dictionary
    for key in map_list:
        # Descend one level per key.
        node = node[key]
    return node
def set_by_path(dictionary, map_list, value):
    """ Store `value` in a nested object: every item of `map_list` but the
    last locates the container, the last item is the key assigned to.

    Source:
    * https://stackoverflow.com/a/14692747/4406340
    """
    container = get_by_path(dictionary, map_list[:-1])
    final_key = map_list[-1]
    container[final_key] = value
def get_valid_filename(s):
    """ Turn the given value into a string usable as a clean filename.

    Leading and trailing whitespace is stripped, remaining spaces become
    underscores, and every character that is not alphanumeric, a dash, an
    underscore or a dot is dropped.

    >>> get_valid_filename("john's portrait in 2004.jpg")
    'johns_portrait_in_2004.jpg'

    Source:
    https://github.com/django/django/blob/master/django/utils/text.py#L219
    """
    trimmed = str(s).strip()
    underscored = trimmed.replace(' ', '_')
    return re.sub(r'(?u)[^-\w.]', '', underscored)
def create_chrome_windows_bookmark(filepath, url, extension=".url"):
    """ Write an ``[InternetShortcut]`` file pointing at `url`.

    The file is created at `filepath` with `extension` appended, replacing
    any existing file at that location.
    """
    target = filepath + extension
    with open(target, 'w') as shortcut:
        shortcut.write("[InternetShortcut]\nURL=" + url)
def main(*, bookmarks_files, encoding_input, encoding_output, ensure_ascii,
         output_indent, outdir):
    """ Parse every file of `bookmarks_files` with one shared parser.

    All remaining keyword arguments are forwarded unchanged to the
    `BookmarkParser` constructor.
    """
    settings = {
        'encoding_input': encoding_input,
        'encoding_output': encoding_output,
        'ensure_ascii': ensure_ascii,
        'output_indent': output_indent,
        'outdir': outdir,
    }
    bookmark_parser = BookmarkParser(**settings)
    for path in bookmarks_files:
        bookmark_parser.parse(path)
if __name__ == '__main__':
    # Command-line entry point: build the argument parser, then hand the
    # parsed options to main().
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    # A positional argument with nargs="+" is always required, so it takes
    # no default value.
    parser.add_argument('bookmarks_files', type=str, nargs="+",
                        help='Chrome HTML bookmarks files')
    parser.add_argument('--encoding-input', default='utf8', type=str,
                        help='Input files encoding')
    parser.add_argument('--encoding-output', default='utf8', type=str,
                        help='Output files encoding')
    parser.add_argument('--output-indent', default=4, type=int, metavar='N',
                        help='Output indentation')
    parser.add_argument('--outdir', default='chrome_bookmarks', type=str,
                        help='Output dir for clickable links')
    parser.add_argument('--ensure-ascii', dest='ensure_ascii',
                        action='store_true',
                        help='Escape non-ASCII characters in output files')
    args = parser.parse_args()
    main(bookmarks_files=args.bookmarks_files,
         encoding_input=args.encoding_input,
         encoding_output=args.encoding_output,
         ensure_ascii=args.ensure_ascii,
         output_indent=args.output_indent,
         outdir=args.outdir)