diff --git a/.travis.yml b/.travis.yml
index 1269a55..2105946 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,6 +5,11 @@ branches:
   only:
     - master
 
+addons:
+  apt:
+    packages:
+      - libarchive13
+
 install:
   - sqlite3 --version
   - pip install tox
diff --git a/README.md b/README.md
index 664cdfc..7c9774b 100644
--- a/README.md
+++ b/README.md
@@ -73,6 +73,22 @@ Le module `normalize` corrige les titres de textes qui ne sont pas parfaitement
 La "factorisation" connecte entre elles les différentes version d'un même texte.
 La base LEGI n'a pas d'identifiant qui remplisse réellement ce rôle.
 
+### Nettoyage des contenus
+
+Le module `html` permet de nettoyer les contenus des textes. Il supprime :
+
+- les espaces redondantes (*whitespace collapse*), sauf à l'intérieur des `<pre>`
+- les attributs inutiles, par exemple `id` et `dir="ltr"`
+- les éléments inutiles, par exemple un `<span>` sans attributs
+- les éléments vides, sauf `<td>` et `<th>`
+
+En février 2018, il détecte 78 millions de caractères inutiles dans LEGI.
+
+Cette fonctionnalité n'est pas activée par défaut car elle est « destructrice »
+et récente. Vous pouvez nettoyer tout le HTML d'une base en exécutant la commande
+`python -m legi.html clean legi.sqlite` (les modifications ne sont enregistrées
+que si vous entrez `y` à la fin).
+
 ### Détection d'anomalies
 
 Le module `anomalies` est conçu pour détecter les incohérences dans les données afin de les signaler à la DILA. Le résultat est visible sur [anomalies.legilibre.fr][anomalies]. (`cron/anomalies-cron.sh` est le script qui génère ce mini-site.)
diff --git a/legi/html.py b/legi/html.py
new file mode 100644
index 0000000..e86ce88
--- /dev/null
+++ b/legi/html.py
@@ -0,0 +1,512 @@
+# encoding: utf8
+
+"""
+This module handles the HTML provided in LEGI.
+"""
+
+from __future__ import division, print_function, unicode_literals
+
+from argparse import ArgumentParser
+from collections import namedtuple
+from difflib import ndiff
+import json
+import re
+from xml.parsers import expat
+
+from lxml import etree
+
+try:
+    from tqdm import tqdm
+except ImportError:
+    print('[warning] tqdm is not installed, the progress bar is disabled')
+    tqdm = lambda x: x
+
+from .utils import connect_db, group_by_2, input, ascii_spaces_re
+
+
+# An immutable type representing the opening of an HTML element:
+# - `tag`: the element's name
+# - `void`: whether the element is in VOID_ELEMENTS (output as `<tag/>`)
+# - `style`: dict of the styles in effect inside the element (inherited ones
+#   plus those set by the element's own attributes)
+# - `dropped`: whether the start tag was left out of the output
+# - `parent`: the StartTag of the enclosing element
+StartTag = namedtuple('StartTag', 'tag void style dropped parent')
+
+# String of ascii whitespace, used to strip text and to test whether a text
+# node contains anything other than spaces
+ASCII_SPACES = ' \t\n\r\f\v'
+
+# Set of HTML block tags
+# https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
+BLOCK_ELEMENTS = set('''
+    address article aside blockquote canvas dd div dl dt fieldset figcaption
+    figure footer form h1 h2 h3 h4 h5 h6 header hgroup hr li main nav noscript
+    ol output p pre section table tfoot ul video
+'''.split())
+
+# Set of HTML tags around which we want to trim whitespace: the block elements
+# plus a few tags that also delimit text segments
+TRIM_AROUND_ELEMENTS = BLOCK_ELEMENTS | set('body br td th tr'.split())
+
+# Map of color names to hexadecimal values, used to normalize the values of
+# color attributes before comparing them to the inherited style
+COLORS_MAP = {
+    'black': '#000000',
+    'white': '#ffffff',
+}
+
+# Default styles, used to detect redundant attributes: an attribute whose
+# normalized value equals the inherited one is dropped.
+# '.collapse-spaces' is not an HTML attribute, it's an internal flag which is
+# switched off inside `<pre>` elements.
+# The `--font-size preserve` command line option removes 'size' from this dict.
+DEFAULT_STYLE = {
+    '.collapse-spaces': True,
+    'align': 'left',
+    'bgcolor': '#ffffff',
+    'clear': 'none',
+    'color': '#000000',
+    'dir': 'ltr',
+    'size': '3',  # https://developer.mozilla.org/docs/Web/HTML/Element/font
+    'valign': 'baseline',
+}
+
+# Set of elements that should not be dropped even if they're completely empty
+# (they are output in self-closed form instead)
+KEEP_EMPTY = {'body', 'br', 'hr', 'td', 'th'}
+
+# A fake StartTag which holds the default styles; it is the initial "current
+# tag" of an HTMLCleaner, i.e. the parent of the outermost element
+INVISIBLE_ROOT_TAG = StartTag(None, None, DEFAULT_STYLE, True, None)
+
+# Set of attributes that should always be dropped
+# (the `--font-size drop` command line option adds 'size' to this set)
+USELESS_ATTRIBUTES = {'charoff', 'face', 'id'}
+
+# Set of elements that should be dropped if they don't have any attributes
+# left after the useless and redundant ones have been removed
+USELESS_WITHOUT_ATTRIBUTES = {'font', 'span'}
+
+# http://w3c.github.io/html/syntax.html#void-elements
+# Only two void tags are actually used in LEGI
+VOID_ELEMENTS = {'br', 'hr'}
+
+
+# Replacement tables used by `escape()` and `quoteattr()`. The order matters:
+# '&' comes first so that the ampersands introduced by the following
+# replacements aren't escaped a second time.
+ESCAPE_TABLE = [('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;')]
+ESCAPE_ATTR_TABLE = ESCAPE_TABLE + [('"', '&#34;')]
+
+
+def escape(s, table=ESCAPE_TABLE):
+    """Escape &, <, and > in a string of data.
+    """
+    for c, r in table:
+        if c in s:
+            s = s.replace(c, r)
+    return s
+
+
+def unescape(s):
+    """Unescape &amp;, &lt;, and &gt; in a string of data.
+    """
+    if '&' not in s:
+        return s
+    return s.replace('&gt;', '>').replace('&lt;', '<').replace('&amp;', '&')
+
+
+def quoteattr(s, table=ESCAPE_ATTR_TABLE):
+    """Escape and quote an attribute value.
+    """
+    for c, r in table:
+        if c in s:
+            s = s.replace(c, r)
+    return '"%s"' % s
+
+
+bad_space_re = re.compile(r"[dl]['’] \w| [,.]", re.I | re.U)
+
+
+def drop_bad_space(m):
+    return m.group(0).replace(' ', '')
+
+
+def is_start_of(s, tag):
+    x = tag.__len__()
+    return s[0] == '<' and s[1:x+1] == tag and s[x+1] in ' >' and s[-2] != '/'
+
+
+class HTMLCleaner(object):
+    """A parser target which returns cleaned HTML (as a string, not a tree).
+
+    Doc: http://lxml.de/parsing.html#the-target-parser-interface
+    """
+
+    def __init__(self):
+        self.at_segment_start = True
+        self.drop_line_breaks = True
+        self.last_trimmable_node = None
+        self.out = []
+        self.current_tag = INVISIBLE_ROOT_TAG
+        self.text_chunks = []
+
+    def start(self, tag, attrs):
+        # Add start tag to stack and output
+        void = tag in VOID_ELEMENTS
+        attrs_str = ''
+        parent = self.current_tag
+        parent_styles = parent.style
+        new_styles = {}
+        if attrs:
+            is_list_tag = tag in {'ul', 'ol'}
+            for k, v in group_by_2(attrs):
+                # Skip useless attributes
+                if k in USELESS_ATTRIBUTES:
+                    continue
+                # Skip obsolete list style attribute
+                if is_list_tag and k == 'type':
+                    continue
+                # Normalize the value
+                v = v.strip()
+                if k[-5:] == 'color':
+                    v = v.lower()
+                    if v[:4] == 'rgb(':
+                        v = '#%02x%02x%02x' % tuple(int(s.strip()) for s in v[4:-1].split(','))
+                    elif v.__len__() == 6 and v.isdigit():
+                        v = '#' + v
+                    else:
+                        v = COLORS_MAP.get(v, v)
+                # Skip redundant styles
+                parent_style = parent_styles.get(k)
+                if parent_style == v:
+                    continue
+                if parent_style:
+                    if k == 'size':
+                        size = int(v)
+                        # Skip 0 (invalid) and 4 through 7 (enlarged text)
+                        if size == 0 or size > 3:
+                            continue
+                    new_styles[k] = v
+                # Add to output
+                attrs_str += ' %s=%s' % (k, quoteattr(v))
+        if tag == 'pre':
+            new_styles['.collapse-spaces'] = False
+        styles = dict(parent_styles, **new_styles) if new_styles else parent_styles
+        if self.drop_line_breaks and ''.join(self.text_chunks).strip(ASCII_SPACES):
+            self.drop_line_breaks = False
+        dropped = (
+            not attrs_str and tag in USELESS_WITHOUT_ATTRIBUTES or
+            tag == 'br' and self.drop_line_breaks
+        )
+        start_tag = StartTag(tag, void, styles, dropped, parent)
+        if not dropped:
+            # Process queued text chunks
+            if self.text_chunks:
+                self.handle_text(next_tag=start_tag)
+            # Add start tag to output
+            self.out.append('<' + tag + attrs_str + ('/>' if void else '>'))
+        self.current_tag = start_tag
+
+    def end(self, tag):
+        start_tag = self.current_tag
+        # Don't add an end tag if the start tag was self-closed or skipped
+        if start_tag.void or start_tag.dropped:
+            self.current_tag = start_tag.parent
+            return
+        # Clean up empty elements
+        collapsed = False
+        if is_start_of(self.out[-1], tag):
+            if not ''.join(self.text_chunks).strip(ASCII_SPACES):
+                tag_has_attributes = self.out[-1].__len__() > tag.__len__() + 2
+                if tag_has_attributes or tag in KEEP_EMPTY:
+                    # Drop the whitespace chunks, if any
+                    self.text_chunks = []
+                    # Collapse the element
+                    self.out[-1] = self.out[-1][:-1] + '/>'
+                    collapsed = True
+                else:
+                    # Drop the element entirely
+                    self.out.pop()
+                    if self.out and self.out[-1][0] != '<':
+                        # Previous output element was a text node, put it back
+                        # in the chunks queue
+                        self.text_chunks.insert(0, unescape(self.out.pop()))
+                        # Reset last_trimmable_node (we don't need to restore
+                        # its previous value)
+                        self.last_trimmable_node = None
+                    self.current_tag = start_tag.parent
+                    return
+        # Process queued text chunks
+        if self.text_chunks:
+            self.handle_text()
+        # Handle whitespace collapsing
+        if tag in TRIM_AROUND_ELEMENTS:
+            # Drop tail space
+            if self.last_trimmable_node:
+                i, self.last_trimmable_node = self.last_trimmable_node, None
+                self.out[i] = self.out[i][:-1]
+            # Enable dropping the next space
+            self.at_segment_start = True
+        # Update current_tag
+        self.current_tag = start_tag.parent
+        # Add end tag to output
+        if not collapsed:
+            self.out.append('</%s>' % tag)
+
+    def data(self, text):
+        # We can't always get a single string for a text node, so we store
+        # chunks in a list and assemble them when we're ready
+        self.text_chunks.append(text)
+
+    def handle_text(self, next_tag=None):
+        text = ''.join(self.text_chunks)
+        self.text_chunks = []
+        if not text:
+            return
+        # Collapse spaces, unless we're inside a <pre>
+        # https://www.w3.org/TR/css-text-3/#white-space-processing
+        if self.current_tag.style['.collapse-spaces']:
+            text = ascii_spaces_re.sub(' ', text)
+            # Handle spaces around closing tags
+            i = self.last_trimmable_node
+            if i and not next_tag and self.out[i - 1][:2] == '</':
+                # `</i> foo </b>bar` → `</i> foo</b> bar`
+                trimmed = self.out[i][:-1]
+                if trimmed:
+                    self.out[i] = trimmed
+                else:
+                    self.out.pop(i)
+                i = self.last_trimmable_node = None
+                if text[0] != ' ':
+                    text = ' ' + text
+            # Drop leading space if the previous text node has a trailing space
+            # or if we're at the beginning of a "segment"
+            if text[0] == ' ' and (self.last_trimmable_node or self.at_segment_start):
+                text = text[1:]
+                if not text:
+                    return
+            # French-specific dropping of bad spaces, e.g. "l' article" → "l'article"
+            text = bad_space_re.sub(drop_bad_space, text)
+            # Are we about to open a new non-inline element?
+            if next_tag and next_tag.tag in TRIM_AROUND_ELEMENTS:
+                self.at_segment_start = True
+                # Drop tail space
+                if text[-1] == ' ':
+                    text = text[:-1]
+                    if not text:
+                        return
+            else:
+                self.at_segment_start = False
+            # Does the trimmable text node we're adding have a tail space?
+            if text[-1] == ' ':
+                self.last_trimmable_node = self.out.__len__()
+            else:
+                self.last_trimmable_node = None
+        else:
+            self.last_trimmable_node = None
+        # Stop dropping <br> tags
+        if self.drop_line_breaks:
+            self.drop_line_breaks = False
+        # Add to output
+        self.out.append(escape(text))
+
+    def close(self):
+        if self.text_chunks:
+            self.out.append(''.join(self.text_chunks).rstrip(ASCII_SPACES))
+        # Join the output into a single string, then reset the parser before
+        # returning so that it can be reused
+        r = ''.join(self.out)
+        self.__init__()
+        return r
+
+
+def clean_html(html, cleaner=HTMLCleaner()):
+    """Returns cleaned HTML
+
+    Warning: this function is not thread safe unless you provide your own
+    thread-local `cleaner` instance.
+    """
+    p = expat.ParserCreate()
+    p.buffer_text = True
+    p.ordered_attributes = True
+    p.StartElementHandler = cleaner.start
+    p.EndElementHandler = cleaner.end
+    p.CharacterDataHandler = cleaner.data
+    p.Parse('<body>')
+    p.Parse(html)
+    p.Parse('</body>', 1)
+    return cleaner.close()[6:-7]
+
+
+strip_re = re.compile(r"<.+?>|[ \t\n\r\f\v]+", re.S)
+
+
+def clean_all_html_in_db(db, check=True):
+    stats = {'cleaned': 0, 'delta': 0, 'total': 0}
+
+    def clean_row(table, row):
+        row_id = row.pop('id')
+        update = {}
+        for col, html in row.items():
+            stats['total'] += 1
+            if not html:
+                continue
+            html_c = clean_html(html)
+            if html_c == html:
+                continue
+            update[col] = html_c
+            stats['cleaned'] += 1
+            delta = html_c.__len__() - html.__len__()
+            stats['delta'] += delta
+            if not check:
+                continue
+            # Check lengths
+            if delta > 0:
+                print()
+                print("=" * 70)
+                print((
+                    "Warning: cleaning column '%s' of row '%s' increased the "
+                    "length from %i to %i. Diff:"
+                ) % (col, row_id, len(html), len(html_c)))
+                print(diff_html(html, html_c))
+            # Check that no meaningfull text content was lost
+            html_s, html_c_s = strip_re.sub('', html), strip_re.sub('', html_c)
+            if html_s != html_c_s:
+                print()
+                print("=" * 70)
+                print("Cleaning column '%s' of row '%s' resulted in content loss. Diff:" %
+                      (col, row_id))
+                print(*ndiff([html_s], [html_c_s], None, None), sep='\n')
+            # Check that cleaning a second time does not alter the result
+            try:
+                html_c_2 = clean_html(html_c)
+            except Exception:
+                print()
+                print("Cleaning a second time failed for column '%s' of row '%s'. Diff:" %
+                      (col, row_id))
+                print(diff_html(html, html_c))
+                raise
+            if html_c_2 != html_c:
+                print()
+                print("=" * 70)
+                print("Inconsistent output for column '%s' of row '%s'." % (col, row_id))
+                print("*" * 5, "Original data:", "*" * 5)
+                print(html)
+                print("*" * 5, "Second run diff:", "*" * 5)
+                print(diff_html(html_c, html_c_2))
+        if update:
+            db.update(table, dict(id=row_id), update)
+
+    # Articles
+    print("Cleaning articles...")
+    q = db.all("SELECT id, bloc_textuel, nota FROM articles", to_dict=True)
+    for row in tqdm(q):
+        clean_row('articles', row)
+    # Textes
+    print("Cleaning textes_versions...")
+    q = db.all("""
+        SELECT id, visas, signataires, tp, nota, abro, rect
+          FROM textes_versions
+    """, to_dict=True)
+    for row in tqdm(q):
+        clean_row('textes_versions', row)
+
+    # Print stats
+    print("Done.")
+    print("Cleaned %(cleaned)i HTML fragments, out of %(total)i. Char delta = %(delta)i." % stats)
+
+
+def split_html_into_lines(html):
+    """Splits an HTML document into lines based on element boundaries.
+    """
+    tags = '(?:%s)' % ('|'.join(TRIM_AROUND_ELEMENTS))
+    html_split_re = re.compile(r"</{0}>".format(tags), re.I)
+    return html_split_re.sub('\\0\n', html.replace('\n', '\\n')).split('\n')
+
+
+def diff_html(html_a, html_b):
+    """Diff two HTML documents.
+    """
+    a, b = split_html_into_lines(html_a), split_html_into_lines(html_b)
+    return '\n'.join(ndiff(a, b, None, None))
+
+
+class StatsCollector(object):
+    """Collects stats about the HTML tags and attributes used in LEGI
+    """
+
+    def __init__(self):
+        self.stats = {}
+
+    def start(self, tag, attrs):
+        try:
+            tag_stats = self.stats[tag]
+        except KeyError:
+            tag_stats = self.stats[tag] = {'count': 0, 'attrs': {}}
+        tag_stats['count'] += 1
+        tag_stats_attrs = tag_stats['attrs']
+        for attr in attrs.items():
+            if attr[0] == 'id':
+                attr = attr[0]
+            elif attr[1].lstrip('-').isdigit():
+                attr = "%s = <integer>" % attr[0]
+            elif attr[1][-1:] == '%' and attr[1][:-1].isdigit():
+                attr = "%s = <percentage>" % attr[0]
+            else:
+                attr = "%s = %s" % attr
+            try:
+                tag_stats_attrs[attr] += 1
+            except KeyError:
+                tag_stats_attrs[attr] = 1
+
+    def comment(self):
+        self.start('<!--', ())
+
+    def close(self):
+        r = self.stats
+        self.__init__()
+        return r
+
+
+def analyze(db):
+    parser = etree.XMLParser(target=StatsCollector())
+    parser.feed('<root>')
+    # Articles
+    q = db.all("""
+        SELECT id, bloc_textuel, nota
+          FROM articles
+    """)
+    for article_id, bloc_textuel, nota in q:
+        if bloc_textuel:
+            parser.feed(bloc_textuel)
+        if nota:
+            parser.feed(nota)
+    # Textes
+    q = db.all("""
+        SELECT id, visas, signataires, tp, nota, abro, rect
+          FROM textes_versions
+    """)
+    for row in q:
+        for text in row[1:]:
+            if text:
+                parser.feed(text)
+    # Result
+    parser.feed('</root>')
+    stats = parser.close()
+    if stats['root']['count'] == 1:
+        del stats['root']
+    else:
+        stats['root']['count'] -= 1
+    print(json.dumps(stats, indent=4, sort_keys=True))
+
+
+if __name__ == '__main__':
+    p = ArgumentParser()
+    p.add_argument('command', choices=['analyze', 'clean'])
+    p.add_argument('db')
+    p.add_argument('--font-size', default='keep-small', choices=['drop', 'keep-small', 'preserve'],
+                   help="what to do with the `size` attribute of `font` elements")
+    p.add_argument('--skip-checks', default=False, action='store_true',
+                   help="skips checking the result of HTML cleaning")
+    args = p.parse_args()
+
+    if args.font_size == 'drop':
+        USELESS_ATTRIBUTES.add('size')
+    elif args.font_size == 'preserve':
+        DEFAULT_STYLE.pop('size')
+
+    db = connect_db(args.db)
+    try:
+        with db:
+            if args.command == 'analyze':
+                analyze(db)
+            elif args.command == 'clean':
+                clean_all_html_in_db(db, check=(not args.skip_checks))
+                save = input('Save changes? (y/N) ')
+                if save.lower() != 'y':
+                    raise KeyboardInterrupt
+                db.insert('db_meta', dict(key='raw', value=False), replace=True)
+    except KeyboardInterrupt:
+        pass
diff --git a/legi/tar2sqlite.py b/legi/tar2sqlite.py
index 9f3a5e3..d2e6f92 100755
--- a/legi/tar2sqlite.py
+++ b/legi/tar2sqlite.py
@@ -33,8 +33,8 @@ def count(d, k, c):
 
 
 def innerHTML(e):
-    i = len(e.tag) + 2
-    return etree.tostring(e, encoding='unicode', with_tail=False)[i:-i-1]
+    # Serialize the element without its tail, then keep only what lies between
+    # the start tag (which ends at the first '>') and the end tag `</tag>`,
+    # whose length is len(e.tag) + 3
+    r = etree.tostring(e, encoding='unicode', with_tail=False)
+    return r[r.find('>')+1:-len(e.tag)-3]
 
 
 def scrape_tags(attrs, root, wanted_tags, unwrap=False):
@@ -476,18 +476,30 @@ def main():
     if not os.path.isdir(args.anomalies_dir):
         os.mkdir(args.anomalies_dir)
 
-    db = connect_db(args.db)
-    for pragma in args.pragma:
-        query = "PRAGMA " + pragma
-        result = db.one(query)
-        print("> Sent `%s` to SQLite, got `%s` as result" % (query, result))
+    db = connect_db(args.db, pragmas=args.pragma)
+    last_update = db.one("SELECT value FROM db_meta WHERE key = 'last_update'")
 
-    process_links = not args.skip_links
-    if args.skip_links:
+    # Check and record the data mode
+    db_meta_raw = db.one("SELECT value FROM db_meta WHERE key = 'raw'")
+    if args.raw:
+        versions_brutes = db.one("SELECT count(*) FROM textes_versions_brutes")
+        data_is_not_raw = versions_brutes > 0 or db_meta_raw is False
+        if data_is_not_raw:
+            print("!> Can't honor --raw option, the data has already been modified previously.")
+            raise SystemExit(1)
+    if db_meta_raw != args.raw:
+        db.insert('db_meta', dict(key='raw', value=args.raw))
+
+    # Handle the --skip-links option
+    links_count = db.one("SELECT count(*) FROM liens")
+    if not args.skip_links and links_count == 0 and last_update is not None:
+        args.skip_links = True
+        print("> Warning: links will not be processed because this DB was built with --skip-links.")
+    elif args.skip_links and links_count > 0:
+        print("> Deleting links...")
         db.run("DELETE FROM liens")
 
     # Look for new archives in the given directory
-    last_update = db.one("SELECT value FROM db_meta WHERE key = 'last_update'")
     print("> last_update is", last_update)
     archive_re = re.compile(r'(.+_)?legi(?P<global>_global)?_(?P<date>[0-9]{8}-[0-9]{6})\..+')
     skipped = 0
@@ -512,7 +524,7 @@ def main():
             skipped = 0
         print("> Processing %s..." % archive_name)
         with db:
-            process_archive(db, args.directory + '/' + archive_name, process_links)
+            process_archive(db, args.directory + '/' + archive_name, not args.skip_links)
             if last_update:
                 db.run("UPDATE db_meta SET value = ? WHERE key = 'last_update'", (archive_date,))
             else:
diff --git a/legi/utils.py b/legi/utils.py
index bec2330..75c70ec 100644
--- a/legi/utils.py
+++ b/legi/utils.py
@@ -18,6 +18,8 @@
 from unicodedata import combining, normalize
 
 
+PY2 = str is bytes
+
 input = getattr(builtins, 'raw_input', input)
 
 
@@ -61,7 +63,7 @@ def namedtuple_factory(cursor, row):
 }
 
 
-def connect_db(address, row_factory=None, create_schema=True, update_schema=True):
+def connect_db(address, row_factory=None, create_schema=True, update_schema=True, pragmas=()):
     db = DB(address)
     db.address = address
     if row_factory:
@@ -103,6 +105,11 @@ def one(*args, **kw):
         if r == '!RECREATE!':
             return connect_db(address, row_factory=row_factory, create_schema=True)
 
+    for pragma in pragmas:
+        query = "PRAGMA " + pragma
+        result = db.one(query)
+        print("> Sent `%s` to SQLite, got `%s` as result" % (query, result))
+
     return db
 
 
@@ -185,6 +192,18 @@ def run_migrations(db):
     return n - v
 
 
+def group_by_2(iterable):
+    iterable = iterable.__iter__()
+    next = iterable.next if PY2 else iterable.__next__
+    while True:
+        a = next()
+        try:
+            b = next()
+        except StopIteration:
+            raise ValueError("iterable returned an odd number of items")
+        yield (a, b)
+
+
 nonalphanum_re = re.compile(r'[^a-z0-9]')
 
 
@@ -220,6 +239,7 @@ def reconstruct_path(dossier, cid, sous_dossier, id):
     return '/'.join((prefix, dossier, id_to_path(cid), sous_dossier, id+'.xml'))
 
 
+ascii_spaces_re = re.compile(r'[ \t\n\r\f\v]+')
 nonword_re = re.compile(r'\W', re.U)
 spaces_re = re.compile(r'\s+', re.U)
 word_re = re.compile(r'\w{2,}', re.U)
diff --git a/requirements.txt b/requirements.txt
index 106c649..fe0512d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
 libarchive-c
 lxml
-tqdm
\ No newline at end of file
+tqdm
diff --git a/tests/test_html.py b/tests/test_html.py
new file mode 100644
index 0000000..f1252e1
--- /dev/null
+++ b/tests/test_html.py
@@ -0,0 +1,104 @@
+# coding: utf8
+from __future__ import division, print_function, unicode_literals
+
+from legi.html import clean_html
+
+
+def test_clean_html_on_empty_string():
+    r = clean_html('')
+    assert r == ''
+
+
+def test_clean_html_on_single_whitespace():
+    r = clean_html(' ')
+    assert r == ''
+
+
+def test_clean_html_collapses_spaces():
+    unclean = '<s> Lorem \r <b><i> ipsum</i> dolor\n\t</b>sit </s>'
+    cleaned = clean_html(unclean)
+    expected = '<s>Lorem <b><i>ipsum</i> dolor</b> sit</s>'
+    assert cleaned == expected
+
+
+def test_clean_html_drops_spaces_around_line_breaks():
+    # Basic
+    unclean = '<p>\t Lorem ipsum\n </p>'
+    cleaned = clean_html(unclean)
+    expected = '<p>Lorem ipsum</p>'
+    assert cleaned == expected
+    # Complex
+    unclean = '<p> <i> \nLorem <br/> ipsum\n </i> </p>'
+    cleaned = clean_html(unclean)
+    expected = '<p><i>Lorem<br/>ipsum</i></p>'
+    assert cleaned == expected
+
+
+def test_clean_html_drops_bad_spaces():
+    unclean = "L' <span>article 2</span>\n."
+    cleaned = clean_html(unclean)
+    expected = "L'article 2."
+    assert cleaned == expected
+
+
+def test_clean_html_drops_empty_elements_and_text_nodes():
+    unclean = '''
+        <p>Lorem ipsum</p>
+        <p> <pre> </pre> </p>
+    '''
+    cleaned = clean_html(unclean)
+    expected = '<p>Lorem ipsum</p>'
+    assert cleaned == expected
+
+
+def test_clean_html_drops_line_breaks_at_the_beginning():
+    unclean = ' <br/> <p> <br/> <br/> Text</p>'
+    cleaned = clean_html(unclean)
+    expected = '<p>Text</p>'
+    assert cleaned == expected
+
+
+def test_clean_html_does_not_drop_empty_table_cells():
+    unclean = '<tr><th></th><td> </td></tr><tr> </tr>'
+    cleaned = clean_html(unclean)
+    expected = '<tr><th/><td/></tr>'
+    assert cleaned == expected
+
+
+def test_clean_html_drops_useless_attributes_and_elements():
+    unclean = '''
+        <h1 align="center">Titre <font>1</font></h1>
+        <p id="foo"><span align="left"></span></p>
+    '''
+    cleaned = clean_html(unclean)
+    expected = '<h1 align="center">Titre 1</h1>'
+    assert cleaned == expected
+
+
+def test_clean_html_does_not_alter_clean_html():
+    expected = '<h1 align="center">Titre</h1><p>Lorem ipsum &amp;</p>'
+    actual = clean_html(expected)
+    assert actual == expected
+
+
+def test_clean_html_does_not_collapse_spaces_inside_pre():
+    unclean = '''
+        <pre>    print("&gt; Hello world")
+        </pre>
+    '''
+    actual = clean_html(unclean)
+    expected = unclean.strip()
+    assert actual == expected
+
+
+def test_clean_html_escapes_properly():
+    original = '<p attr="&quot;">&lt;p&gt;</p>'
+    actual = clean_html(original)
+    expected = '''<p attr="&#34;">&lt;p&gt;</p>'''
+    assert actual == expected
+
+
+def test_clean_html_preserves_attribute_order():
+    expected = '<h1 a="0" b="1" c="2" d="3" e="4">Titre</h1>'
+    actual = clean_html(expected)
+    assert actual == expected
diff --git a/tests/test_innerHTML.py b/tests/test_innerHTML.py
new file mode 100644
index 0000000..f9416de
--- /dev/null
+++ b/tests/test_innerHTML.py
@@ -0,0 +1,17 @@
+# coding: utf8
+from __future__ import division, print_function, unicode_literals
+
+from lxml import etree
+
+from legi.tar2sqlite import innerHTML
+
+
+def test_innerHTML():
+    el = etree.fromstring('<root></root>')
+    assert innerHTML(el) == ''
+    el = etree.fromstring('<root>text</root> ')
+    assert innerHTML(el) == 'text'
+    el = etree.fromstring('<root >text</root>')
+    assert innerHTML(el) == 'text'
+    el = etree.fromstring('<root attr="value"> </root>')
+    assert innerHTML(el) == ' '
diff --git a/tox.ini b/tox.ini
index 48d1049..9b7a7ab 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,10 +4,10 @@ skipsdist=True
 
 [testenv]
 commands=
+    pip install -q -r requirements.txt
     python -m pytest {toxinidir}/tests --cov legi --cov-report=term-missing {posargs}
     flake8 {toxinidir}
 deps=
-    -rrequirements.txt
     flake8
     pytest
     pytest-cov