-
Notifications
You must be signed in to change notification settings - Fork 0
/
wp2markdown.py
75 lines (59 loc) · 2.52 KB
/
wp2markdown.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from bs4 import BeautifulSoup
from html import unescape
import os, os.path
import sys
file="test.xml"
try:
file = sys.argv[1]
except:
print("No file found")
quit(__name__)
## I prefer formatting YYYY.MM.DD but if you want something besides a . replace it here
sep = "."
folder = "output_files"
def clean_text(post_text):
## Some HTML Tags and nonsense to strip out of every post. More can be added
replace_strings = ["<p>", "</p>", "<!-- wp:paragraph -->", "<!-- /wp:paragraph -->", "<!-- /wp:image -->",
'<!-- wp:image {"linkDestination":"custom"} -->', '">', " "]
for each in replace_strings:
post_text = post_text.replace(each, "")
### Make a vague attempt to convery links into Markdown Links
post_text = post_text.replace('<a href="', "[")
post_text = post_text.replace('</a>', "](Link)")
### I can't embed an image in a markdown file anyway, so when an image shows up, just remove it.
while "<figure" in post_text:
img_start = post_text.rfind("<figure")
img_end = post_text.rfind("</figure>")+9
post_text = post_text[0:img_start]+post_text[img_end:]
while "< !--" in post_text:
start = post_text.rfind("< !--")
end = post_text.rfind("-->")+3
post_text = post_text[0:start]+post_text[end:]
### There is a lot of white space going on, so we are going to strip that out.
while "\n\n\n" in post_text:
post_text = post_text.replace('\n\n\n', "\n\n")
return post_text
with open(file, "r",encoding='utf-8') as file_read:
posts = file_read.read()
soup = BeautifulSoup(posts, "xml")
#print(soup)
items = soup.findAll("item")
#print(len(items))
for each in items:
post_content_dirty = unescape(str(each("content:encoded"))[18:-19])
post_content = clean_text(post_content_dirty)
### We only care about posts, so we look for things with Post Content that aren't tags or images or whatever
if len(post_content) > 3:
## characters not allowed in file names
bad_chars = ["<",">",":",'"',"/","|","?","*"]
title = str(each("title"))[8:-9].split(" [ID")[0]
date_formatted = str(each("post_modified"))[19:-29].replace("-", sep)
post_content = f"## {title}\n\n{post_content}"
for char in bad_chars:
title = title.replace(char, "")
filename = f"{date_formatted} - {title}"
try:
with open("output_files/"+filename + '.md', 'w+') as output:
output.write(post_content)
except:
pass