From b76cbe3730d76c956ee57bb2867914d931e75928 Mon Sep 17 00:00:00 2001 From: Dave Brondsema Date: Fri, 9 Nov 2012 13:15:18 -0500 Subject: [PATCH 1/3] escape &<> so that entities don't disappear during conversion --- html2text.py | 12 +++++++++--- test/normal.html | 5 +++++ test/normal.md | 3 +++ test/normal_escape_snob.html | 7 ++++++- test/normal_escape_snob.md | 3 +++ 5 files changed, 26 insertions(+), 4 deletions(-) diff --git a/html2text.py b/html2text.py index 2f650ab5..8a795142 100755 --- a/html2text.py +++ b/html2text.py @@ -30,7 +30,7 @@ def has_key(x, y): import urllib.request as urllib except: import urllib -import optparse, re, sys, codecs, types +import optparse, re, sys, codecs, types, cgi try: from textwrap import wrap except: pass @@ -272,10 +272,16 @@ def close(self): return self.outtext def handle_charref(self, c): - self.o(self.charref(c), 1) + charref = self.charref(c) + if not self.code and not self.pre: + charref = cgi.escape(charref) + self.o(charref, 1) def handle_entityref(self, c): - self.o(self.entityref(c), 1) + entityref = self.entityref(c) + if not self.code and not self.pre and entityref != ' _place_holder;': + entityref = cgi.escape(entityref) + self.o(entityref, 1) def handle_starttag(self, tag, attrs): self.handle_tag(tag, attrs, 1) diff --git a/test/normal.html b/test/normal.html index 47ef480e..74c94b7e 100644 --- a/test/normal.html +++ b/test/normal.html @@ -136,5 +136,10 @@

c:\tmp, \\server\path, \_/, foo\bar, #\#, \\#

+ +

+ A common entity is &copy;
+ 3 < 6 && "z" > "a" +

diff --git a/test/normal.md b/test/normal.md index d63b403a..7701303d 100644 --- a/test/normal.md +++ b/test/normal.md @@ -52,3 +52,6 @@ not a hr c:\tmp, \\\server\path, \\_/, foo\bar, #\\#, \\\\# +A common entity is &copy; +3 < 6 && "z" > "a" + diff --git a/test/normal_escape_snob.html b/test/normal_escape_snob.html index 0d21867a..da90792b 100644 --- a/test/normal_escape_snob.html +++ b/test/normal_escape_snob.html @@ -133,9 +133,14 @@


- - -

- +

c:\tmp, \\server\path, \_/, foo\bar, #\#, \\#

+ +

+ A common entity is &copy;
+ 3 < 6 && "z" > "a" +

diff --git a/test/normal_escape_snob.md b/test/normal_escape_snob.md index 1260a0a8..fe4c7c9d 100644 --- a/test/normal_escape_snob.md +++ b/test/normal_escape_snob.md @@ -53,3 +53,6 @@ not a hr c:\tmp, \\\server\path, \\\_/, foo\bar, \#\\\#, \\\\\# +A common entity is &copy; +3 < 6 && "z" > "a" + From 08e016887ea495234fa9db82dbad881892d82059 Mon Sep 17 00:00:00 2001 From: Dave Brondsema Date: Fri, 9 Nov 2012 15:08:17 -0500 Subject: [PATCH 2/3] set code flag properly so that escaping is not done within `backticks` --- html2text.py | 5 ++++- test/normal.html | 2 ++ test/normal.md | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/html2text.py b/html2text.py index 8a795142..932f4513 100755 --- a/html2text.py +++ b/html2text.py @@ -459,7 +459,10 @@ def handle_tag(self, tag, attrs, start): # handle some font attributes, but leave headers clean self.handle_emphasis(start, tag_style, parent_style) - if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` `` + if tag in ["code", "tt"] and not self.pre: + # TODO: `` `this` `` + self.o('`') + self.code = not self.code if tag == "abbr": if start: self.abbr_title = None diff --git a/test/normal.html b/test/normal.html index 74c94b7e..8156a57c 100644 --- a/test/normal.html +++ b/test/normal.html @@ -137,6 +137,8 @@

c:\tmp, \\server\path, \_/, foo\bar, #\#, \\#

+ c:\tmp, \\server\path, \_/, foo\bar, #\#, \\# +

A common entity is &copy;
3 < 6 && "z" > "a" diff --git a/test/normal.md b/test/normal.md index 7701303d..6d52b662 100644 --- a/test/normal.md +++ b/test/normal.md @@ -52,6 +52,8 @@ not a hr c:\tmp, \\\server\path, \\_/, foo\bar, #\\#, \\\\# +`c:\tmp, \\server\path, \_/, foo\bar, #\#, \\#` + A common entity is &copy; 3 < 6 && "z" > "a" From d7c33ed99446a046dc5cc3aaefc89d1e9ffa73a1 Mon Sep 17 00:00:00 2001 From: Dave Brondsema Date: Fri, 9 Nov 2012 15:09:13 -0500 Subject: [PATCH 3/3] preserve   entities This allows multiple sequential   entities to still be multiple spaces, rather than getting collapsed. Within `code` blocks, neither a literal space nor a   work, so a unicode nbsp char is used which seems to work in many markdown renderers. This fixes the output of the google doc code section. --- html2text.py | 5 ++++- test/GoogleDocMassDownload.md | 10 +++++----- test/GoogleDocSaved.md | 10 +++++----- test/nbsp.html | 3 +-- test/nbsp.md | 18 +++++++++--------- test/normal.html | 9 +++++++++ test/normal.md | 7 +++++++ test/run_tests.py | 2 +- 8 files changed, 41 insertions(+), 23 deletions(-) diff --git a/html2text.py b/html2text.py index 932f4513..29fea4d7 100755 --- a/html2text.py +++ b/html2text.py @@ -266,7 +266,7 @@ def close(self): if self.unicode_snob: nbsp = unichr(name2cp('nbsp')) else: - nbsp = u' ' + nbsp = u' ' self.outtext = self.outtext.replace(u' _place_holder;', nbsp) return self.outtext @@ -281,6 +281,9 @@ def handle_entityref(self, c): entityref = self.entityref(c) if not self.code and not self.pre and entityref != ' _place_holder;': entityref = cgi.escape(entityref) + if (self.code or self.pre) and entityref == ' _place_holder;': + #   doesn't work in `` and indented blocks + entityref = unichr(name2cp('nbsp')) self.o(entityref, 1) def handle_starttag(self, tag, attrs): diff --git a/test/GoogleDocMassDownload.md b/test/GoogleDocMassDownload.md index bdd82885..b4dcdeab 100644 --- a/test/GoogleDocMassDownload.md +++ b/test/GoogleDocMassDownload.md @@ -13,16 +13,16 @@ text to separate lists 1. now with numbers 2. the prisoner 1. not an _italic number_ - 2. a **bold human** being + 2. a **bold human**  being 3. end **bold** _italic_ ` def func(x):` -` if x < 1:` -` return 'a'` -` return 'b'` +`   if x < 1:` +`     return 'a'` +`   return 'b'` -Some ` fixed width text` here +Some ` fixed width text`  here _` italic fixed width text`_ diff --git a/test/GoogleDocSaved.md b/test/GoogleDocSaved.md index bdd82885..b4dcdeab 100644 --- a/test/GoogleDocSaved.md +++ b/test/GoogleDocSaved.md @@ -13,16 +13,16 @@ text to separate lists 1. now with numbers 2. the prisoner 1. not an _italic number_ - 2. a **bold human** being + 2. a **bold human**  being 3. end **bold** _italic_ ` def func(x):` -` if x < 1:` -` return 'a'` -` return 'b'` +`   if x < 1:` +`     return 'a'` +`   return 'b'` -Some ` fixed width text` here +Some ` fixed width text`  here _` italic fixed width text`_ diff --git a/test/nbsp.html b/test/nbsp.html index 9cab9015..356d9712 100644 --- a/test/nbsp.html +++ b/test/nbsp.html @@ -5,7 +5,7 @@

NBSP handling test #1

-

In this test all NBSPs will be replaced with ordinary spaces (unicode_snob = False).

+

In this test all NBSP entities will be preserved (unicode_snob = False).

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, @@ -17,4 +17,3 @@

NBSP handling test #1

proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

- diff --git a/test/nbsp.md b/test/nbsp.md index 16c9ee38..716fe5d7 100644 --- a/test/nbsp.md +++ b/test/nbsp.md @@ -1,14 +1,14 @@ # NBSP handling test #1 -In this test all NBSPs will be replaced with ordinary spaces (unicode_snob = -False). +In this test all NBSP entities will be preserved (unicode_snob = False). -Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod -tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, -quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo -consequat. +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim +ad minim veniam, quis nostrud exercitation ullamco laboris nisi +ut aliquip ex ea commodo consequat. -Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore -eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt -in culpa qui officia deserunt mollit anim id est laborum. +Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat +non proident, sunt in culpa qui officia deserunt mollit anim id est +laborum. diff --git a/test/normal.html b/test/normal.html index 8156a57c..b8a472ec 100644 --- a/test/normal.html +++ b/test/normal.html @@ -143,5 +143,14 @@

A common entity is &copy;
3 < 6 && "z" > "a"

+ +

+ foo   bar +

+ +
foo   bar
+ + foo   bar + diff --git a/test/normal.md b/test/normal.md index 6d52b662..503f71ba 100644 --- a/test/normal.md +++ b/test/normal.md @@ -57,3 +57,10 @@ c:\tmp, \\\server\path, \\_/, foo\bar, #\\#, \\\\# A common entity is &copy; 3 < 6 && "z" > "a" +foo   bar + + + foo   bar + +`foo   bar` + diff --git a/test/run_tests.py b/test/run_tests.py index 7ebfd394..1cecc4df 100644 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -43,7 +43,7 @@ def test_command(fn, *args): cmd += [fn] result = get_baseline(fn) - actual = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout.read() + actual = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout.read().decode('utf-8') if os.name == 'nt': # Fix the unwanted CR to CRCRLF replacement