From ff93a76da27d6bb22d8fb29e5ab992e5bdfd9acd Mon Sep 17 00:00:00 2001 From: B-Souty Date: Mon, 23 Jul 2018 15:11:58 +0200 Subject: [PATCH 01/18] Add docstrings --- html2dict.py | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 1 deletion(-) diff --git a/html2dict.py b/html2dict.py index ef98557..f1d6740 100755 --- a/html2dict.py +++ b/html2dict.py @@ -2,6 +2,36 @@ class Html2Dict(object): + """Html to dictionaries extractor class + + A simple html tables extractor. + + Args: + html_string (str): String representation of an html. + url (str): Url of the website you are parsing + + Attributes: + html_string (str): String representation of an html. + url (str): Url of the website you are parsing + _tree (HtmlElement): Html tree from the root of the provided html_string. + _table_presents (:obj:`list` of :obj:`dict`): List of tables present in the + html_string as html element. + tables (dict): dict of all the table present on the page (structure in Notes). + + Notes: + Structure of a 'tables' dict: + dict( + table_n: dict( + header_rows: list( + header_row (:obj:`HtmlElement`), + ), + data_rows: list( + data_row (:obj:`HtmlElement`), + ) + ) + ) + + """ def __init__(self, html_string, url=None): @@ -14,6 +44,15 @@ def __init__(self, html_string, url=None): self.tables = self._extract_tables() def _extract_tables(self): + """ + Hidden method to initialize the self.tables attribute. + + Iterates over the tables in self._table_presents and returns a dict of + the extracted header and data rows for each tables. + + Returns: + tables (dict): this populate the tables attribute. For the structure please + """ tables = {} @@ -38,6 +77,15 @@ def _extract_tables(self): @staticmethod def is_header(row): + """For a given html row , returns True if all cells are header cells . + + Args: + row (HtmlElement): Any html row . + + Returns: + bool: + + """ if not row.xpath('*'): return False @@ -51,7 +99,18 @@ def is_header(row): @staticmethod def get_text_content(cell, is_header=False): + """Get the text content of an html cell + + Extract the text content of a cell in a html table. If the cell is part of a + merged header, join its text with a "/" with the text of the cell below it. + Args: + cell (HtmlElement): Html cell or + is_header (:obj:`bool`, optional): Is the cell a header. Default to False. + + Returns: + _ (str): Text contect at the root of an html cell. + """ # base case colspan = int(cell.attrib.get('colspan', 1)) if (colspan > 1 or cell.attrib.get('Html2Dict_merged') == "True") and is_header: @@ -64,7 +123,31 @@ def get_text_content(cell, is_header=False): @staticmethod def basic_table(table): + """ Transform a raw table to a slightly more advanced table. + + Take a dict representation of a table with raw html elements as formatted in + self.tables (c.f. 'Notes' section of the class.) in input and returns a new + dict representation of it with the text content of those html elements. + Args: + table (dict): For the structure, please see the 'Notes' section of the class. + + Returns: + _ (dict): For the structure, please see the 'Notes' section of this method. + + Notes: + Structure of the returned dict: + dict( + headers: list( + headers (str) or None + ), + data_rows: list( + data_rows (:obj:`list` of :obj:`str`) + ) + ) + + + """ copy_table = table.copy() header_rows = copy_table['header_rows'] data_rows = copy_table['data_rows'] @@ -93,7 +176,24 @@ def basic_table(table): return {'headers': tmp_headers[0], 'data_rows': tmp_data_rows} def basic_tables(self): + """The most basic tables parser. + Returns: + my_basic_tables (dict): For the structure, please see the 'Notes' section + of this method. + + Notes: + Structure of my_basic_tables: + If headers are found a row is a dict {header: data}. Otherwise a row is + a list of data. + + dict( + table_n: tuple( + row (:obj:`dict` or :obj:`list`) + ) + ) + + """ my_basic_tables = {} for my_table in self.tables: try: @@ -113,7 +213,7 @@ def basic_tables(self): return my_basic_tables def rich_tables(self): - + """Coming soon.""" raise NotImplementedError('This feature is coming soon.') if __name__ == '__main__': From bb2fc1e12e934eaf5f11eeddde24b42331e72711 Mon Sep 17 00:00:00 2001 From: B-Souty Date: Mon, 23 Jul 2018 15:18:40 +0200 Subject: [PATCH 02/18] Fix formatting --- html2dict.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/html2dict.py b/html2dict.py index f1d6740..8cb9bb9 100755 --- a/html2dict.py +++ b/html2dict.py @@ -44,14 +44,14 @@ def __init__(self, html_string, url=None): self.tables = self._extract_tables() def _extract_tables(self): - """ - Hidden method to initialize the self.tables attribute. + """Hidden method to initialize the self.tables attribute. Iterates over the tables in self._table_presents and returns a dict of the extracted header and data rows for each tables. Returns: tables (dict): this populate the tables attribute. For the structure please + """ tables = {} @@ -110,6 +110,7 @@ def get_text_content(cell, is_header=False): Returns: _ (str): Text contect at the root of an html cell. + """ # base case colspan = int(cell.attrib.get('colspan', 1)) @@ -146,7 +147,6 @@ def basic_table(table): ) ) - """ copy_table = table.copy() header_rows = copy_table['header_rows'] From ee74655a4e2e08c20d8ea6d2c0a68a458ffb0678 Mon Sep 17 00:00:00 2001 From: B-Souty Date: Mon, 23 Jul 2018 20:00:00 +0200 Subject: [PATCH 03/18] Create CODE_OF_CONDUCT.md --- CODE_OF_CONDUCT.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..eb30787 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,5 @@ +## Code of conduct + +Be respectful with each other.. + +That shouldn't be too hard right :smiley: From 6de0dbfdd90fafa09864510d9a4d57b4c9ec5e6a Mon Sep 17 00:00:00 2001 From: B-Souty Date: Thu, 26 Jul 2018 23:10:43 +0200 Subject: [PATCH 04/18] Update Readme --- README.MD | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/README.MD b/README.MD index e65dd08..5f69e20 100644 --- a/README.MD +++ b/README.MD @@ -1,3 +1,6 @@ +#### ⚠️Warning: This script is not ready for production use.⚠️ +*Not all tables are parseable yet. Please refer to the "Capabilities" section for a list of supported table types.* + # Html2Dict Simple html tables extractor. @@ -12,6 +15,20 @@ Simple html tables extractor. 1. `pip install html2dict` +## Capabilities + +List of table types currently supported: + * Basic table without headers (returns a tuple of rows as simple list) + * Basic table with headers (returns a tuple of dictionaries) + * Complex tables with merged headers (returns a tuple of dictionaries) + +List of table types **not** currently supported: + * Any tables embedded in iframes. + * Tables with vertical headers (scope=“col”) + * Tables with new header row after first set of data. + +This project is still very new, if the type of table you are parsing is not in this, please let me know the outcome. + ## Usage * Start by instantiating the class with an html string. (I used requests in this example but opening an html file would work just fine) @@ -114,7 +131,4 @@ extractor = Html2Dict(html_string=my_website.text) ... 'Operating System': 'Windows', ... 'Version': 'Windows x86 web-based installer'})} - - - -``` \ No newline at end of file +``` From 601e70fdc893f89133a821753ef7ee343124df90 Mon Sep 17 00:00:00 2001 From: B-Souty Date: Thu, 26 Jul 2018 23:29:52 +0200 Subject: [PATCH 05/18] Improve docstring --- html2dict.py | 56 +++++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/html2dict.py b/html2dict.py index 8cb9bb9..fadde95 100755 --- a/html2dict.py +++ b/html2dict.py @@ -20,16 +20,18 @@ class Html2Dict(object): Notes: Structure of a 'tables' dict: - dict( - table_n: dict( - header_rows: list( - header_row (:obj:`HtmlElement`), - ), - data_rows: list( - data_row (:obj:`HtmlElement`), - ) - ) - ) + { + table_n: { + header_rows: [ + header_row, + ... + ], + data_rows: [ + data_row, + ... + ] + } + } """ @@ -50,7 +52,8 @@ def _extract_tables(self): the extracted header and data rows for each tables. Returns: - tables (dict): this populate the tables attribute. For the structure please + dict: this populate the tables attribute. For the structure please + refer to the "Notes" section of the class. """ @@ -83,7 +86,7 @@ def is_header(row): row (HtmlElement): Any html row . Returns: - bool: + True if the row is only made of 'header' cells (). """ @@ -109,7 +112,7 @@ def get_text_content(cell, is_header=False): is_header (:obj:`bool`, optional): Is the cell a header. Default to False. Returns: - _ (str): Text contect at the root of an html cell. + str: Text content at the root of an html cell. """ # base case @@ -134,18 +137,18 @@ def basic_table(table): table (dict): For the structure, please see the 'Notes' section of the class. Returns: - _ (dict): For the structure, please see the 'Notes' section of this method. + dict: For the structure, please see the 'Notes' section of this method. Notes: Structure of the returned dict: - dict( - headers: list( - headers (str) or None - ), - data_rows: list( - data_rows (:obj:`list` of :obj:`str`) - ) - ) + { + headers: [ + headers or None + ], + data_rows: [ + data_rows + ] + } """ copy_table = table.copy() @@ -179,19 +182,18 @@ def basic_tables(self): """The most basic tables parser. Returns: - my_basic_tables (dict): For the structure, please see the 'Notes' section - of this method. + dict: For the structure, please see the 'Notes' section of this method. Notes: Structure of my_basic_tables: If headers are found a row is a dict {header: data}. Otherwise a row is a list of data. - dict( + { table_n: tuple( - row (:obj:`dict` or :obj:`list`) + row (:obj:`dict` or :obj:`list`), ) - ) + } """ my_basic_tables = {} From c32aa2f4536a8ba672da9a0d19031808cd2950c3 Mon Sep 17 00:00:00 2001 From: B-Souty Date: Fri, 27 Jul 2018 00:52:39 +0200 Subject: [PATCH 06/18] Add feature #4 If a 'caption' tag is present in the table use this as the key name ine the dictionary. Otherwise, use the default 'table_n' naming convention. --- html2dict.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/html2dict.py b/html2dict.py index 7989c81..fcb3c58 100755 --- a/html2dict.py +++ b/html2dict.py @@ -61,6 +61,12 @@ def _extract_tables(self): for ind_table, table in enumerate(self._table_presents): + if table.xpath('caption'): + caption = table.xpath('caption')[0] + table_name = self.get_text_content(caption) + else: + table_name = "table_{}".format(ind_table) + my_header_rows = [] my_data_rows = [] t_body = table.xpath('*//tr') or table.xpath('tr') @@ -72,7 +78,7 @@ def _extract_tables(self): else: my_data_rows.append(row) - tables["table_{}".format(ind_table)] = { + tables[table_name] = { "header_rows" : my_header_rows, "data_rows": my_data_rows, } From 323cb649a02f8806a6642f6e1f11bb744e13f30e Mon Sep 17 00:00:00 2001 From: B-Souty Date: Fri, 27 Jul 2018 01:29:45 +0200 Subject: [PATCH 07/18] Update README.MD --- README.MD | 1 + 1 file changed, 1 insertion(+) diff --git a/README.MD b/README.MD index 5f69e20..28a72f4 100644 --- a/README.MD +++ b/README.MD @@ -26,6 +26,7 @@ List of table types **not** currently supported: * Any tables embedded in iframes. * Tables with vertical headers (scope=“col”) * Tables with new header row after first set of data. + * Tables with merged tables accross multiple levels This project is still very new, if the type of table you are parsing is not in this, please let me know the outcome. From d86f0a078e462bc22a39aefdad613e430333364d Mon Sep 17 00:00:00 2001 From: B-Souty Date: Sun, 12 Aug 2018 03:14:54 +0200 Subject: [PATCH 08/18] Complete code refactor --- html2dict.py | 405 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 293 insertions(+), 112 deletions(-) mode change 100755 => 100644 html2dict.py diff --git a/html2dict.py b/html2dict.py old mode 100755 new mode 100644 index fcb3c58..71429ce --- a/html2dict.py +++ b/html2dict.py @@ -1,41 +1,142 @@ from lxml import html +import requests -class Html2Dict(object): - """Html to dictionaries extractor class +class Table(object): + """Base table object. - A simple html tables extractor. + A Table object holds information about a table, including its name, + headers row and data rows. - Args: - html_string (str): String representation of an html. - url (str): Url of the website you are parsing + Attributes: + name (str): Name of the table. + header_rows (list): A list of headers. If the table doesn't + contains headers, default ones will be generated. + data_rows (list): Data rows of your table represented as a list + of dictionary. + rows (dict): Headers and data rows together in a dictionary. + + """ + + def __init__(self, data_rows: list, header_rows: list, name=None): + """__init__ method. + + Args: + name (str, optional): Name of the table. Default to None. + header_rows (list): A list of headers. + data_rows (list): Data rows of your table represented as a + list of dictionary. + + """ + + self.name = name + self.header_rows = header_rows + self.data_rows = data_rows + self.rows = { + "headers": self.header_rows, + "data": self.data_rows, + } + + @classmethod + def from_html_element(cls, table, table_name=None, caption_name_overwrite=False): + """Classmethod to extract a table from a HTML element. + + This clasmethod is used by the Extractor class to extract the + tables on a webpage. + + Args: + table (:obj:`lxml.html.HtmlElement`): A
HTML element. + table_name (str, optional): A table name. Defaults to None. + caption_name_overwrite (bool, optional): If True, if a table + name is provided but a table caption is found, the table + caption will be used as the name instead. + + Returns: + Table: A Table object + + """ + + header_rows = [] + data_rows = [] + + if table.xpath('caption'): + caption = table.xpath('caption')[0] + table_name = TableExtractor.get_text_content(caption) + elif table_name and not caption_name_overwrite: + table_name = table_name + + t_body = table.xpath('*//tr') or table.xpath('tr') + + for row in t_body: + + if TableExtractor.is_header(row): + header_rows.append(row) + else: + data_rows.append(row) + + return cls( + name=table_name, + data_rows=data_rows, + header_rows=header_rows, + ) + + def search(self, query, column=False): + """Search a value in your data rows. + + Search if a value is present anywhere in your table or in a + specific column. + + Args: + query : Value to search + column (str, optional): Column name. Search only in this + column. Default to None. + + Returns: + list: Rows containing the searched value. + + """ + + if column: + + try: + return [row for row in self.data_rows if query == row[column]] + except KeyError as e: + raise KeyError( + f"'{column}' is not a valid header. Valid headers are {self.header_rows}" + ) + + return [row for row in self.data_rows if query in row.values()] + + +class TableExtractor(object): + """Html to dictionaries extractor class + + This is the skeleton Extractor class. Attributes: html_string (str): String representation of an html. url (str): Url of the website you are parsing - _tree (HtmlElement): Html tree from the root of the provided html_string. - _table_presents (:obj:`list` of :obj:`dict`): List of tables present in the - html_string as html element. - tables (dict): dict of all the table present on the page (structure in Notes). - - Notes: - Structure of a 'tables' dict: - { - table_n: { - header_rows: [ - header_row, - ... - ], - data_rows: [ - data_row, - ... - ] - } - } + raw_tables (:obj:`dict` of :obj: `Table`): dict of all the tables + present on the page as raw HTML data and headers (
& ). + _tree (:obj:`HtmlElement`): Html tree from the root of the + provided html_string. + _table_presents (:obj:`list` of :obj:`dict`): List of tables + present in the html_string as html element . """ - def __init__(self, html_string, url=None): + def __init__(self, html_string: str, url=None): + """__init__ method. + + Args: + html_string (str): String representation of an html. + url (str, optional): Url of the website you are parsing. + + Notes: + It is not recommended to instantiate a class manually. Use + instead one of the clasmethod provided. + + """ self.html_string = html_string self._tree = html.fromstring(self.html_string) @@ -43,53 +144,90 @@ def __init__(self, html_string, url=None): if not self.url and self._tree.xpath('//link[@rel="canonical"]'): self.url = self._tree.xpath('//link[@rel="canonical"]')[0].get('href') self._table_presents = self._tree.xpath('//table') - self.tables = self._extract_tables() + self.raw_tables = self._extract_raw_tables() - def _extract_tables(self): - """Hidden method to initialize the self.tables attribute. + def _extract_raw_tables(self): + """Hidden method to initialize the self.raw_tables attribute. - Iterates over the tables in self._table_presents and returns a dict of - the extracted header and data rows for each tables. + Iterates over the tables in self._table_presents and returns a + dict of the extracted tables. Returns: - dict: this populate the tables attribute. For the structure please - refer to the "Notes" section of the class. + dict: All the tables found in the html string as a dictionary + of Table object with raw HTML elements for data and + headers. """ tables = {} for ind_table, table in enumerate(self._table_presents): + my_table = Table.from_html_element( + table=table, + table_name=f"table_{ind_table}", + caption_name_overwrite=True, + ) - if table.xpath('caption'): - caption = table.xpath('caption')[0] - table_name = self.get_text_content(caption) - else: - table_name = "table_{}".format(ind_table) + tables[my_table.name] = my_table - my_header_rows = [] - my_data_rows = [] - t_body = table.xpath('*//tr') or table.xpath('tr') + return tables - for row in t_body: + @classmethod + def from_html_string(cls, html_string, url=None): + """Instantiate an object from an html string. - if Html2Dict.is_header(row): - my_header_rows.append(row) - else: - my_data_rows.append(row) + Args: + html_string (str): String representation of an html + url (str, optional): Url of the website the string is coming + from. Default to None - tables[table_name] = { - "header_rows" : my_header_rows, - "data_rows": my_data_rows, - } - return tables + Returns: + TableExtractor: The newly created TableExtractor + + """ + + return cls(html_string=html_string, url=url) + + @classmethod + def from_html_file(cls, html_file, url=None): + """Instantiate an object from an html file. + + Args: + html_file (str): relative filepath to an html file. + url (str, optional): Url of the website the file is coming + from. Default to None. + + Returns: + TableExtractor: The newly created TableExtractor + + """ + + with open(html_file, 'r') as infile: + html_string = infile.read() + + return cls(html_string=html_string, url=url) + + @classmethod + def from_url(cls, url, **kwargs): + """Instantiate an object from a url. + + Args: + url (str): Url of the website you are parsing. + + Returns: + TableExtractor: The newly created TableExtractor + + """ + + html_string = requests.get(url=url, **kwargs).text + return cls(html_string=html_string, url=url) @staticmethod def is_header(row): - """For a given html row , returns True if all cells are header cells . + row (HtmlElement): An html row . Returns: True if the row is only made of 'header' cells (
. + """Check if an html row is a header. Args: - row (HtmlElement): Any html row
). @@ -115,7 +253,7 @@ def get_text_content(cell, is_header=False): Args: cell (HtmlElement): Html cell or - is_header (:obj:`bool`, optional): Is the cell a header. Default to False. + is_header (bool, optional): Is the cell a header. Default to False. Returns: str: Text content at the root of an html cell. @@ -125,48 +263,67 @@ def get_text_content(cell, is_header=False): colspan = int(cell.attrib.get('colspan', 1)) # is_header = True if cell.tag == 'th' else False if (colspan > 1 or cell.attrib.get('Html2Dict_merged') == "True") and is_header: - cell.attrib['Html2Dict_merged'] = "True" - cell.attrib['colspan'] = str(colspan - 1) - next_cell_below = cell.getparent().getnext()[0] - cell.getparent().getnext().remove(next_cell_below) - cell_text = " ".join([i for i in cell.itertext() if i not in ('\\n',)]).strip() or "n/a" - cell_text = "/".join([ - cell_text, - Html2Dict.get_text_content(cell=next_cell_below, is_header=True) - ]) - return cell_text + cell.attrib['Html2Dict_merged'] = "True" + cell.attrib['colspan'] = str(colspan - 1) + next_cell_below = cell.getparent().getnext()[0] + cell.getparent().getnext().remove(next_cell_below) + cell_text = " ".join( + [i for i in cell.itertext() if i not in ('\\n',)]).strip() or "n/a" + cell_text = "/".join([ + cell_text, + TableExtractor.get_text_content(cell=next_cell_below, is_header=True) + ]) + return cell_text return " ".join([i for i in cell.itertext() if i not in ('\\n',)]).strip() or "n/a" - @staticmethod - def basic_table(table): + +class BasicTableExtractor(TableExtractor): + """Basic tables extractor. + + Attributes: + html_string (str): String representation of an html. + url (str): Url of the website you are parsing + raw_tables (:obj:`dict` of :obj: `Table`): dict of all the tables + present on the page as raw HTML data and headers ( & ). + basic_tables (:obj:`dict` of :obj:`Table`): dict of all the tables + present on the page as plaintext. + _tree (:obj:`HtmlElement`): Html tree from the root of the + provided html_string. + _table_presents (:obj:`list` of :obj:`dict`): List of tables + present in the html_string as html element . + + """ + + def __init__(self, html_string, url=None): + """__init_ method. + + Args: + html_string (str): String representation of an html. + url (str, optional): Url of the website you are parsing. + + """ + + super(BasicTableExtractor, self).__init__(html_string, url) + self.basic_tables = self.extract_basic_tables() + + def basic_table_parser(self, table: Table): """ Transform a raw table to a slightly more advanced table. - Take a dict representation of a table with raw html elements as formatted in - self.tables (c.f. 'Notes' section of the class.) in input and returns a new - dict representation of it with the text content of those html elements. + Take a Table object containing raw HTML elements and extract + basic text data from it. Args: - table (dict): For the structure, please see the 'Notes' section of the class. + table (:obj:`Table`): A Table object containing data as HTML + elements. Returns: - dict: For the structure, please see the 'Notes' section of this method. - - Notes: - Structure of the returned dict: - { - headers: [ - headers or None - ], - data_rows: [ - data_rows - ] - } + Table: A new Table object with its data represented in + plaintext. """ - copy_table = table.copy() - header_rows = copy_table['header_rows'] - data_rows = copy_table['data_rows'] - tmp_headers = [] + header_rows = table.header_rows + data_rows = table.data_rows + tmp_header_rows = [] tmp_data_rows = [] for row in header_rows + data_rows: @@ -176,42 +333,53 @@ def basic_table(table): colspan = int(cell.attrib.get('colspan', 1)) for _ in range(colspan): + if row in header_rows: - cell_text = Html2Dict.get_text_content(cell=cell, is_header=True) + cell_text = self.get_text_content(cell=cell, is_header=True) else: - cell_text = Html2Dict.get_text_content(cell=cell) + cell_text = self.get_text_content(cell=cell) tmp_row.append(cell_text) + if row in header_rows: - tmp_headers.append(tmp_row) + tmp_header_rows.append(tmp_row) else: tmp_data_rows.append(tmp_row) - if not tmp_headers: - tmp_headers = [None] - return {'headers': tmp_headers[0], 'data_rows': tmp_data_rows} - def basic_tables(self): - """The most basic tables parser. + if not tmp_header_rows: - Returns: - dict: For the structure, please see the 'Notes' section of this method. + tmp_data_rows = [ + {f"col_{ind}": item for ind, item in enumerate(row)} + for row in tmp_data_rows + ] + tmp_header_rows = sorted( + {header for row in tmp_data_rows for header in row} + ) - Notes: - Structure of my_basic_tables: - If headers are found a row is a dict {header: data}. Otherwise a row is - a list of data. + else: + tmp_data_rows = [ + dict(zip(tmp_header_rows[0], row)) + for row in tmp_data_rows + ] + + return Table(data_rows=tmp_data_rows, header_rows=tmp_header_rows) - { - table_n: tuple( - row (:obj:`dict` or :obj:`list`), - ) - } + def extract_basic_tables(self): + """Basic tables parser. + + Loop over the extracted raw_tables and pass them through the + basic_table_parser. + + Returns: + dict: All the tables found in the html string as a dictionary + of Table object with data and headers as plaintext. """ + my_basic_tables = {} - for my_table in self.tables: + for my_table in self.raw_tables: try: - my_table_basic = Html2Dict.basic_table(self.tables[my_table]) + basic_table = self.basic_table_parser(self.raw_tables[my_table]) except Exception as e: error = """ An error occured with {0}: @@ -221,14 +389,27 @@ def basic_tables(self): """.format(my_table, e) print(error) continue - headers = my_table_basic.get('headers') - my_basic_tables[my_table] = tuple(dict(zip(headers, row)) if headers else row for row in my_table_basic.get('data_rows')) + + my_basic_tables[my_table] = basic_table return my_basic_tables - def rich_tables(self): - """Coming soon.""" - raise NotImplementedError('This feature is coming soon.') + +class RichTableExtractor(TableExtractor): + """ Rich tables extractor. + + Notes: + This class is not implemented yet but I am working on it. + The goal of it is to return more than just plaintext data. + For example if a cell contains an HTML list
  • , I should + retrieve it as a Python list or if a cell has a link, I should + retrieve something like [some_text](my_link). + + """ + + def __init__(self): + raise NotImplementedError("Placeholder class. Feature coming soon..") + if __name__ == '__main__': pass From bb556f3d73b2a1017b7ca9519de4643775fd0976 Mon Sep 17 00:00:00 2001 From: B-Souty Date: Sun, 12 Aug 2018 03:17:43 +0200 Subject: [PATCH 09/18] Include requests to dependencies --- requirements.txt | 3 ++- setup.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 86c871e..7cdc7ea 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ -lxml \ No newline at end of file +lxml +requests \ No newline at end of file diff --git a/setup.py b/setup.py index 87903e2..ed1abb0 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,7 @@ REQUIRED = [ 'lxml', + 'requests' ] try: From 680abd2e51a167624592b647a1c15da83e9abe68 Mon Sep 17 00:00:00 2001 From: B-Souty Date: Sun, 12 Aug 2018 03:18:26 +0200 Subject: [PATCH 10/18] Update README --- README.MD | 204 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 107 insertions(+), 97 deletions(-) diff --git a/README.MD b/README.MD index 28a72f4..a15e677 100644 --- a/README.MD +++ b/README.MD @@ -1,4 +1,4 @@ -#### ⚠️Warning: This script is not ready for production use.⚠️ +#### ⚠Warning: This script is not ready for production use.⚠ *Not all tables are parseable yet. Please refer to the "Capabilities" section for a list of supported table types.* # Html2Dict @@ -10,17 +10,19 @@ Simple html tables extractor. * Python 3.6+ * Python module: * [lxml](https://lxml.de/) + * [requests](http://docs.python-requests.org/en/master/) ## Installing -1. `pip install html2dict` +Create and activate a new Python virtual environment then install this dev branch with: + * `pip3 install git+https://github.com/B-Souty/html2dict@wip/issue2/main` ## Capabilities List of table types currently supported: - * Basic table without headers (returns a tuple of rows as simple list) - * Basic table with headers (returns a tuple of dictionaries) - * Complex tables with merged headers (returns a tuple of dictionaries) + * Basic table without headers. + * Basic table with headers. + * Complex tables with merged headers. List of table types **not** currently supported: * Any tables embedded in iframes. @@ -28,108 +30,116 @@ List of table types **not** currently supported: * Tables with new header row after first set of data. * Tables with merged tables accross multiple levels -This project is still very new, if the type of table you are parsing is not in this, please let me know the outcome. +This project is still very new, if the type of table you are parsing is not in this list, please let me know the outcome. ## Usage -* Start by instantiating the class with an html string. (I used requests in this example but opening an html file would work just fine) +Start by importing the desired type of extractor. (Only one available currently). ```Python -from html2dict import Html2Dict -import requests +from html2dict import BasicTableExtractor +``` + +Then instantiate an object with one of the 3 constructors provided +```python +my_extractor = BasicTableExtractor.from_html_string(html_string=) + +# or + +my_extractor = BasicTableExtractor.from_html_file(html_file=) + +# or -my_website = requests.get(url="https://www.python.org/downloads/release/python-370/") -extractor = Html2Dict(html_string=my_website.text) +my_extractor = BasicTableExtractor.from_url(url=) ``` -* The object starts with an attribute 'tables' containing all the tables in the html provided as raw html elements. +You can access the extracted tables from the basic_tables attribute. + +For example for https://www.python.org/downloads/release/python-370/ ```python ->>> extractor.tables - -...{'table_0': {'data_rows': [, -... , -... , -... , -... , -... , -... , -... , -... , -... , -... ], -... 'header_rows': []}} +my_extractor.basic_tables + +{'table_0': } ``` - * The only table extractor method implemented so far is 'basic_tables'. It returns a dict of table where each table is a tuple of dict if the base table had headers otherwise it is a simple list. - - ```python ->>> extractor.basic_tables() - -...{'table_0': ({'Description': 'n/a', -... 'File Size': '22745726', -... 'GPG': 'SIG', -... 'MD5 Sum': '41b6595deb4147a1ed517a7d9a580271', -... 'Operating System': 'Source release', -... 'Version': 'Gzipped source tarball'}, -... {'Description': 'n/a', -... 'File Size': '16922100', -... 'GPG': 'SIG', -... 'MD5 Sum': 'eb8c2a6b1447d50813c02714af4681f3', -... 'Operating System': 'Source release', -... 'Version': 'XZ compressed source tarball'}, -... {'Description': 'for Mac OS X 10.6 and later', -... 'File Size': '34274481', -... 'GPG': 'SIG', -... 'MD5 Sum': 'ca3eb84092d0ff6d02e42f63a734338e', -... 'Operating System': 'Mac OS X', -... 'Version': 'macOS 64-bit/32-bit installer'}, -... {'Description': 'for OS X 10.9 and later', -... 'File Size': '27651276', -... 'GPG': 'SIG', -... 'MD5 Sum': 'ae0717a02efea3b0eb34aadc680dc498', -... 'Operating System': 'Mac OS X', -... 'Version': 'macOS 64-bit installer'}, -... {'Description': 'n/a', -... 'File Size': '8547689', -... 'GPG': 'SIG', -... 'MD5 Sum': '46562af86c2049dd0cc7680348180dca', -... 'Operating System': 'Windows', -... 'Version': 'Windows help file'}, -... {'Description': 'for AMD64/EM64T/x64', -... 'File Size': '6946082', -... 'GPG': 'SIG', -... 'MD5 Sum': 'cb8b4f0d979a36258f73ed541def10a5', -... 'Operating System': 'Windows', -... 'Version': 'Windows x86-64 embeddable zip file'}, -... {'Description': 'for AMD64/EM64T/x64', -... 'File Size': '26262280', -... 'GPG': 'SIG', -... 'MD5 Sum': '531c3fc821ce0a4107b6d2c6a129be3e', -... 'Operating System': 'Windows', -... 'Version': 'Windows x86-64 executable installer'}, -... {'Description': 'for AMD64/EM64T/x64', -... 'File Size': '1327160', -... 'GPG': 'SIG', -... 'MD5 Sum': '3cfdaf4c8d3b0475aaec12ba402d04d2', -... 'Operating System': 'Windows', -... 'Version': 'Windows x86-64 web-based installer'}, -... {'Description': 'n/a', -... 'File Size': '6395982', -... 'GPG': 'SIG', -... 'MD5 Sum': 'ed9a1c028c1e99f5323b9c20723d7d6f', -... 'Operating System': 'Windows', -... 'Version': 'Windows x86 embeddable zip file'}, -... {'Description': 'n/a', -... 'File Size': '25506832', -... 'GPG': 'SIG', -... 'MD5 Sum': 'ebb6444c284c1447e902e87381afeff0', -... 'Operating System': 'Windows', -... 'Version': 'Windows x86 executable installer'}, -... {'Description': 'n/a', -... 'File Size': '1298280', -... 'GPG': 'SIG', -... 'MD5 Sum': '779c4085464eb3ee5b1a4fffd0eabca4', -... 'Operating System': 'Windows', -... 'Version': 'Windows x86 web-based installer'})} +Finally, the data of the table can be accessed from the attributes data_rows or rows. + +```python +my_extractor.basic_tables['table_0'] + +pprint(test_html3.basic_tables['table_0'].rows) +{'data': [{'Description': 'n/a', + 'File Size': '22745726', + 'GPG': 'SIG', + 'MD5 Sum': '41b6595deb4147a1ed517a7d9a580271', + 'Operating System': 'Source release', + 'Version': 'Gzipped source tarball'}, + {'Description': 'n/a', + 'File Size': '16922100', + 'GPG': 'SIG', + 'MD5 Sum': 'eb8c2a6b1447d50813c02714af4681f3', + 'Operating System': 'Source release', + 'Version': 'XZ compressed source tarball'}, + {'Description': 'for Mac OS X 10.6 and later', + 'File Size': '34274481', + 'GPG': 'SIG', + 'MD5 Sum': 'ca3eb84092d0ff6d02e42f63a734338e', + 'Operating System': 'Mac OS X', + 'Version': 'macOS 64-bit/32-bit installer'}, + {'Description': 'for OS X 10.9 and later', + 'File Size': '27651276', + 'GPG': 'SIG', + 'MD5 Sum': 'ae0717a02efea3b0eb34aadc680dc498', + 'Operating System': 'Mac OS X', + 'Version': 'macOS 64-bit installer'}, + {'Description': 'n/a', + 'File Size': '8547689', + 'GPG': 'SIG', + 'MD5 Sum': '46562af86c2049dd0cc7680348180dca', + 'Operating System': 'Windows', + 'Version': 'Windows help file'}, + {'Description': 'for AMD64/EM64T/x64', + 'File Size': '6946082', + 'GPG': 'SIG', + 'MD5 Sum': 'cb8b4f0d979a36258f73ed541def10a5', + 'Operating System': 'Windows', + 'Version': 'Windows x86-64 embeddable zip file'}, + {'Description': 'for AMD64/EM64T/x64', + 'File Size': '26262280', + 'GPG': 'SIG', + 'MD5 Sum': '531c3fc821ce0a4107b6d2c6a129be3e', + 'Operating System': 'Windows', + 'Version': 'Windows x86-64 executable installer'}, + {'Description': 'for AMD64/EM64T/x64', + 'File Size': '1327160', + 'GPG': 'SIG', + 'MD5 Sum': '3cfdaf4c8d3b0475aaec12ba402d04d2', + 'Operating System': 'Windows', + 'Version': 'Windows x86-64 web-based installer'}, + {'Description': 'n/a', + 'File Size': '6395982', + 'GPG': 'SIG', + 'MD5 Sum': 'ed9a1c028c1e99f5323b9c20723d7d6f', + 'Operating System': 'Windows', + 'Version': 'Windows x86 embeddable zip file'}, + {'Description': 'n/a', + 'File Size': '25506832', + 'GPG': 'SIG', + 'MD5 Sum': 'ebb6444c284c1447e902e87381afeff0', + 'Operating System': 'Windows', + 'Version': 'Windows x86 executable installer'}, + {'Description': 'n/a', + 'File Size': '1298280', + 'GPG': 'SIG', + 'MD5 Sum': '779c4085464eb3ee5b1a4fffd0eabca4', + 'Operating System': 'Windows', + 'Version': 'Windows x86 web-based installer'}], + 'headers': [['Version', + 'Operating System', + 'Description', + 'MD5 Sum', + 'File Size', + 'GPG']]} + ``` From a989edc85eff940c74df33b49fc0967b9d982881 Mon Sep 17 00:00:00 2001 From: B-Souty Date: Sun, 12 Aug 2018 12:26:01 +0200 Subject: [PATCH 11/18] Split module in files in a package. --- html2dict.py | 415 ------------------------------------ html2dict/__init__.py | 0 html2dict/base_extractor.py | 126 +++++++++++ html2dict/extractors.py | 132 ++++++++++++ html2dict/resources.py | 156 ++++++++++++++ setup.py | 2 +- 6 files changed, 415 insertions(+), 416 deletions(-) delete mode 100644 html2dict.py create mode 100644 html2dict/__init__.py create mode 100644 html2dict/base_extractor.py create mode 100644 html2dict/extractors.py create mode 100644 html2dict/resources.py diff --git a/html2dict.py b/html2dict.py deleted file mode 100644 index 71429ce..0000000 --- a/html2dict.py +++ /dev/null @@ -1,415 +0,0 @@ -from lxml import html -import requests - - -class Table(object): - """Base table object. - - A Table object holds information about a table, including its name, - headers row and data rows. - - Attributes: - name (str): Name of the table. - header_rows (list): A list of headers. If the table doesn't - contains headers, default ones will be generated. - data_rows (list): Data rows of your table represented as a list - of dictionary. - rows (dict): Headers and data rows together in a dictionary. - - """ - - def __init__(self, data_rows: list, header_rows: list, name=None): - """__init__ method. - - Args: - name (str, optional): Name of the table. Default to None. - header_rows (list): A list of headers. - data_rows (list): Data rows of your table represented as a - list of dictionary. - - """ - - self.name = name - self.header_rows = header_rows - self.data_rows = data_rows - self.rows = { - "headers": self.header_rows, - "data": self.data_rows, - } - - @classmethod - def from_html_element(cls, table, table_name=None, caption_name_overwrite=False): - """Classmethod to extract a table from a
  • HTML element. - - This clasmethod is used by the Extractor class to extract the - tables on a webpage. - - Args: - table (:obj:`lxml.html.HtmlElement`): A
    HTML element. - table_name (str, optional): A table name. Defaults to None. - caption_name_overwrite (bool, optional): If True, if a table - name is provided but a table caption is found, the table - caption will be used as the name instead. - - Returns: - Table: A Table object - - """ - - header_rows = [] - data_rows = [] - - if table.xpath('caption'): - caption = table.xpath('caption')[0] - table_name = TableExtractor.get_text_content(caption) - elif table_name and not caption_name_overwrite: - table_name = table_name - - t_body = table.xpath('*//tr') or table.xpath('tr') - - for row in t_body: - - if TableExtractor.is_header(row): - header_rows.append(row) - else: - data_rows.append(row) - - return cls( - name=table_name, - data_rows=data_rows, - header_rows=header_rows, - ) - - def search(self, query, column=False): - """Search a value in your data rows. - - Search if a value is present anywhere in your table or in a - specific column. - - Args: - query : Value to search - column (str, optional): Column name. Search only in this - column. Default to None. - - Returns: - list: Rows containing the searched value. - - """ - - if column: - - try: - return [row for row in self.data_rows if query == row[column]] - except KeyError as e: - raise KeyError( - f"'{column}' is not a valid header. Valid headers are {self.header_rows}" - ) - - return [row for row in self.data_rows if query in row.values()] - - -class TableExtractor(object): - """Html to dictionaries extractor class - - This is the skeleton Extractor class. - - Attributes: - html_string (str): String representation of an html. - url (str): Url of the website you are parsing - raw_tables (:obj:`dict` of :obj: `Table`): dict of all the tables - present on the page as raw HTML data and headers (
    & ). - _tree (:obj:`HtmlElement`): Html tree from the root of the - provided html_string. - _table_presents (:obj:`list` of :obj:`dict`): List of tables - present in the html_string as html element . - - """ - - def __init__(self, html_string: str, url=None): - """__init__ method. - - Args: - html_string (str): String representation of an html. - url (str, optional): Url of the website you are parsing. - - Notes: - It is not recommended to instantiate a class manually. Use - instead one of the clasmethod provided. - - """ - - self.html_string = html_string - self._tree = html.fromstring(self.html_string) - self.url = url - if not self.url and self._tree.xpath('//link[@rel="canonical"]'): - self.url = self._tree.xpath('//link[@rel="canonical"]')[0].get('href') - self._table_presents = self._tree.xpath('//table') - self.raw_tables = self._extract_raw_tables() - - def _extract_raw_tables(self): - """Hidden method to initialize the self.raw_tables attribute. - - Iterates over the tables in self._table_presents and returns a - dict of the extracted tables. - - Returns: - dict: All the tables found in the html string as a dictionary - of Table object with raw HTML elements for data and - headers. - - """ - - tables = {} - - for ind_table, table in enumerate(self._table_presents): - my_table = Table.from_html_element( - table=table, - table_name=f"table_{ind_table}", - caption_name_overwrite=True, - ) - - tables[my_table.name] = my_table - - return tables - - @classmethod - def from_html_string(cls, html_string, url=None): - """Instantiate an object from an html string. - - Args: - html_string (str): String representation of an html - url (str, optional): Url of the website the string is coming - from. Default to None - - Returns: - TableExtractor: The newly created TableExtractor - - """ - - return cls(html_string=html_string, url=url) - - @classmethod - def from_html_file(cls, html_file, url=None): - """Instantiate an object from an html file. - - Args: - html_file (str): relative filepath to an html file. - url (str, optional): Url of the website the file is coming - from. Default to None. - - Returns: - TableExtractor: The newly created TableExtractor - - """ - - with open(html_file, 'r') as infile: - html_string = infile.read() - - return cls(html_string=html_string, url=url) - - @classmethod - def from_url(cls, url, **kwargs): - """Instantiate an object from a url. - - Args: - url (str): Url of the website you are parsing. - - Returns: - TableExtractor: The newly created TableExtractor - - """ - - html_string = requests.get(url=url, **kwargs).text - return cls(html_string=html_string, url=url) - - @staticmethod - def is_header(row): - """Check if an html row is a header. - - Args: - row (HtmlElement): An html row . - - Returns: - True if the row is only made of 'header' cells (
    ). - - """ - - if not row.xpath('*'): - return False - - for elem in row.xpath('*'): - - if not elem.tag == 'th': - return False - - return True - - @staticmethod - def get_text_content(cell, is_header=False): - """Get the text content of an html cell - - Extract the text content of a cell in a html table. If the cell is part of a - merged header, join its text with a "/" with the text of the cell below it. - - Args: - cell (HtmlElement): Html cell or - is_header (bool, optional): Is the cell a header. Default to False. - - Returns: - str: Text content at the root of an html cell. - - """ - # base case - colspan = int(cell.attrib.get('colspan', 1)) - # is_header = True if cell.tag == 'th' else False - if (colspan > 1 or cell.attrib.get('Html2Dict_merged') == "True") and is_header: - cell.attrib['Html2Dict_merged'] = "True" - cell.attrib['colspan'] = str(colspan - 1) - next_cell_below = cell.getparent().getnext()[0] - cell.getparent().getnext().remove(next_cell_below) - cell_text = " ".join( - [i for i in cell.itertext() if i not in ('\\n',)]).strip() or "n/a" - cell_text = "/".join([ - cell_text, - TableExtractor.get_text_content(cell=next_cell_below, is_header=True) - ]) - return cell_text - return " ".join([i for i in cell.itertext() if i not in ('\\n',)]).strip() or "n/a" - - -class BasicTableExtractor(TableExtractor): - """Basic tables extractor. - - Attributes: - html_string (str): String representation of an html. - url (str): Url of the website you are parsing - raw_tables (:obj:`dict` of :obj: `Table`): dict of all the tables - present on the page as raw HTML data and headers ( & ). - basic_tables (:obj:`dict` of :obj:`Table`): dict of all the tables - present on the page as plaintext. - _tree (:obj:`HtmlElement`): Html tree from the root of the - provided html_string. - _table_presents (:obj:`list` of :obj:`dict`): List of tables - present in the html_string as html element . - - """ - - def __init__(self, html_string, url=None): - """__init_ method. - - Args: - html_string (str): String representation of an html. - url (str, optional): Url of the website you are parsing. - - """ - - super(BasicTableExtractor, self).__init__(html_string, url) - self.basic_tables = self.extract_basic_tables() - - def basic_table_parser(self, table: Table): - """ Transform a raw table to a slightly more advanced table. - - Take a Table object containing raw HTML elements and extract - basic text data from it. - - Args: - table (:obj:`Table`): A Table object containing data as HTML - elements. - - Returns: - Table: A new Table object with its data represented in - plaintext. - - """ - header_rows = table.header_rows - data_rows = table.data_rows - tmp_header_rows = [] - tmp_data_rows = [] - - for row in header_rows + data_rows: - - tmp_row = [] - for cell in row: - - colspan = int(cell.attrib.get('colspan', 1)) - for _ in range(colspan): - - if row in header_rows: - cell_text = self.get_text_content(cell=cell, is_header=True) - else: - cell_text = self.get_text_content(cell=cell) - - tmp_row.append(cell_text) - - if row in header_rows: - tmp_header_rows.append(tmp_row) - else: - tmp_data_rows.append(tmp_row) - - if not tmp_header_rows: - - tmp_data_rows = [ - {f"col_{ind}": item for ind, item in enumerate(row)} - for row in tmp_data_rows - ] - tmp_header_rows = sorted( - {header for row in tmp_data_rows for header in row} - ) - - else: - tmp_data_rows = [ - dict(zip(tmp_header_rows[0], row)) - for row in tmp_data_rows - ] - - return Table(data_rows=tmp_data_rows, header_rows=tmp_header_rows) - - def extract_basic_tables(self): - """Basic tables parser. - - Loop over the extracted raw_tables and pass them through the - basic_table_parser. - - Returns: - dict: All the tables found in the html string as a dictionary - of Table object with data and headers as plaintext. - - """ - - my_basic_tables = {} - for my_table in self.raw_tables: - try: - basic_table = self.basic_table_parser(self.raw_tables[my_table]) - except Exception as e: - error = """ - An error occured with {0}: - {1} - ***************** - Proceeding with next table - """.format(my_table, e) - print(error) - continue - - my_basic_tables[my_table] = basic_table - - return my_basic_tables - - -class RichTableExtractor(TableExtractor): - """ Rich tables extractor. - - Notes: - This class is not implemented yet but I am working on it. - The goal of it is to return more than just plaintext data. - For example if a cell contains an HTML list
  • , I should - retrieve it as a Python list or if a cell has a link, I should - retrieve something like [some_text](my_link). - - """ - - def __init__(self): - raise NotImplementedError("Placeholder class. Feature coming soon..") - - -if __name__ == '__main__': - pass diff --git a/html2dict/__init__.py b/html2dict/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/html2dict/base_extractor.py b/html2dict/base_extractor.py new file mode 100644 index 0000000..79b5715 --- /dev/null +++ b/html2dict/base_extractor.py @@ -0,0 +1,126 @@ +from lxml import html +import requests +from html2dict.resources import * + + +__all__ = [ + 'TableExtractor', + 'Table', + 'get_text_content', + 'is_header' +] + +class TableExtractor(object): + """Html to dictionaries extractor class + + This is the skeleton Extractor class. + + Attributes: + html_string (str): String representation of an html. + url (str): Url of the website you are parsing + raw_tables (:obj:`dict` of :obj: `Table`): dict of all the tables + present on the page as raw HTML data and headers (
  • & ). + _tree (:obj:`HtmlElement`): Html tree from the root of the + provided html_string. + _table_presents (:obj:`list` of :obj:`dict`): List of tables + present in the html_string as html element . + + """ + + def __init__(self, html_string: str, url=None): + """__init__ method. + + Args: + html_string (str): String representation of an html. + url (str, optional): Url of the website you are parsing. + + Notes: + It is not recommended to instantiate a class manually. Use + instead one of the clasmethod provided. + + """ + + self.html_string = html_string + self._tree = html.fromstring(self.html_string) + self.url = url + if not self.url and self._tree.xpath('//link[@rel="canonical"]'): + self.url = self._tree.xpath('//link[@rel="canonical"]')[0].get('href') + self._table_presents = self._tree.xpath('//table') + self.raw_tables = self._extract_raw_tables() + + def _extract_raw_tables(self): + """Hidden method to initialize the self.raw_tables attribute. + + Iterates over the tables in self._table_presents and returns a + dict of the extracted tables. + + Returns: + dict: All the tables found in the html string as a dictionary + of Table object with raw HTML elements for data and + headers. + + """ + + tables = {} + + for ind_table, table in enumerate(self._table_presents): + my_table = Table.from_html_element( + table=table, + table_name=f"table_{ind_table}", + caption_name_overwrite=True, + ) + + tables[my_table.name] = my_table + + return tables + + @classmethod + def from_html_string(cls, html_string, url=None): + """Instantiate an object from an html string. + + Args: + html_string (str): String representation of an html + url (str, optional): Url of the website the string is coming + from. Default to None + + Returns: + TableExtractor: The newly created TableExtractor + + """ + + return cls(html_string=html_string, url=url) + + @classmethod + def from_html_file(cls, html_file, url=None): + """Instantiate an object from an html file. + + Args: + html_file (str): relative filepath to an html file. + url (str, optional): Url of the website the file is coming + from. Default to None. + + Returns: + TableExtractor: The newly created TableExtractor + + """ + + with open(html_file, 'r') as infile: + html_string = infile.read() + + return cls(html_string=html_string, url=url) + + @classmethod + def from_url(cls, url, **kwargs): + """Instantiate an object from a url. + + Args: + url (str): Url of the website you are parsing. + + Returns: + TableExtractor: The newly created TableExtractor + + """ + + html_string = requests.get(url=url, **kwargs).text + return cls(html_string=html_string, url=url) + diff --git a/html2dict/extractors.py b/html2dict/extractors.py new file mode 100644 index 0000000..5ba5a93 --- /dev/null +++ b/html2dict/extractors.py @@ -0,0 +1,132 @@ +from html2dict.base_extractor import * + + +class BasicTableExtractor(TableExtractor): + """Basic tables extractor. + + Attributes: + html_string (str): String representation of an html. + url (str): Url of the website you are parsing + raw_tables (:obj:`dict` of :obj: `Table`): dict of all the tables + present on the page as raw HTML data and headers (
    & ). + basic_tables (:obj:`dict` of :obj:`Table`): dict of all the tables + present on the page as plaintext. + _tree (:obj:`HtmlElement`): Html tree from the root of the + provided html_string. + _table_presents (:obj:`list` of :obj:`dict`): List of tables + present in the html_string as html element . + + """ + + def __init__(self, html_string, url=None): + """__init_ method. + + Args: + html_string (str): String representation of an html. + url (str, optional): Url of the website you are parsing. + + """ + + super(BasicTableExtractor, self).__init__(html_string, url) + self.basic_tables = self.extract_basic_tables() + + @staticmethod + def basic_table_parser(table: Table): + """ Transform a raw table to a slightly more advanced table. + + Take a Table object containing raw HTML elements and extract + basic text data from it. + + Args: + table (:obj:`Table`): A Table object containing data as HTML + elements. + + Returns: + Table: A new Table object with its data represented in + plaintext. + + """ + header_rows = table.header_rows + data_rows = table.data_rows + tmp_header_rows = [] + tmp_data_rows = [] + + for row in header_rows + data_rows: + + tmp_row = [] + for cell in row: + + colspan = int(cell.attrib.get('colspan', 1)) + for _ in range(colspan): + + cell_text = get_text_content(cell=cell) + tmp_row.append(cell_text) + + if row in header_rows: + tmp_header_rows.append(tmp_row) + else: + tmp_data_rows.append(tmp_row) + + if not tmp_header_rows: + + tmp_data_rows = [ + {f"col_{ind}": item for ind, item in enumerate(row)} + for row in tmp_data_rows + ] + tmp_header_rows = sorted( + {header for row in tmp_data_rows for header in row} + ) + + else: + tmp_data_rows = [ + dict(zip(tmp_header_rows[0], row)) + for row in tmp_data_rows + ] + + return Table(data_rows=tmp_data_rows, header_rows=tmp_header_rows) + + def extract_basic_tables(self): + """Basic tables parser. + + Loop over the extracted raw_tables and pass them through the + basic_table_parser. + + Returns: + dict: All the tables found in the html string as a dictionary + of Table object with data and headers as plaintext. + + """ + + my_basic_tables = {} + for my_table in self.raw_tables: + try: + basic_table = BasicTableExtractor.basic_table_parser(self.raw_tables[my_table]) + except Exception as e: + error = """ + An error occured with {0}: + {1} + ***************** + Proceeding with next table + """.format(my_table, e) + print(error) + continue + + my_basic_tables[my_table] = basic_table + + return my_basic_tables + + +class RichTableExtractor(TableExtractor): + """ Rich tables extractor. + + Notes: + This class is not implemented yet but I am working on it. + The goal of it is to return more than just plaintext data. + For example if a cell contains an HTML list
  • , I should + retrieve it as a Python list or if a cell has a link, I should + retrieve something like [some_text](my_link). + + """ + + def __init__(self): + raise NotImplementedError("Placeholder class. Feature coming soon..") diff --git a/html2dict/resources.py b/html2dict/resources.py new file mode 100644 index 0000000..c62f9d4 --- /dev/null +++ b/html2dict/resources.py @@ -0,0 +1,156 @@ +class Table(object): + """Base table object. + + A Table object holds information about a table, including its name, + headers row and data rows. + + Attributes: + name (str): Name of the table. + header_rows (list): A list of headers. If the table doesn't + contains headers, default ones will be generated. + data_rows (list): Data rows of your table represented as a list + of dictionary. + rows (dict): Headers and data rows together in a dictionary. + + """ + + def __init__(self, data_rows: list, header_rows: list, name=None): + """__init__ method. + + Args: + name (str, optional): Name of the table. Default to None. + header_rows (list): A list of headers. + data_rows (list): Data rows of your table represented as a + list of dictionary. + + """ + + self.name = name + self.header_rows = header_rows + self.data_rows = data_rows + self.rows = { + "headers": self.header_rows, + "data": self.data_rows, + } + + @classmethod + def from_html_element(cls, table, table_name=None, caption_name_overwrite=False): + """Classmethod to extract a table from a
  • HTML element. + + This clasmethod is used by the Extractor class to extract the + tables on a webpage. + + Args: + table (:obj:`lxml.html.HtmlElement`): A
    HTML element. + table_name (str, optional): A table name. Defaults to None. + caption_name_overwrite (bool, optional): If True, if a table + name is provided but a table caption is found, the table + caption will be used as the name instead. + + Returns: + Table: A Table object + + """ + + header_rows = [] + data_rows = [] + + if table.xpath('caption'): + caption = table.xpath('caption')[0] + table_name = get_text_content(caption) + elif table_name and not caption_name_overwrite: + table_name = table_name + + t_body = table.xpath('*//tr') or table.xpath('tr') + + for row in t_body: + + if is_header(row): + header_rows.append(row) + else: + data_rows.append(row) + + return cls( + name=table_name, + data_rows=data_rows, + header_rows=header_rows, + ) + + def search(self, query, column=False): + """Search a value in your data rows. + + Search if a value is present anywhere in your table or in a + specific column. + + Args: + query : Value to search + column (str, optional): Column name. Search only in this + column. Default to None. + + Returns: + list: Rows containing the searched value. + + """ + + if column: + + try: + return [row for row in self.data_rows if query == row[column]] + except KeyError: + raise KeyError( + f"'{column}' is not a valid header. Valid headers are {self.header_rows}" + ) + + return [row for row in self.data_rows if query in row.values()] + + +def is_header(row): + """Check if an html row is a header. + + Args: + row (HtmlElement): An html row . + + Returns: + True if the row is only made of 'header' cells (
    ). + + """ + + return all([True if elem.tag == 'th' else False for elem in row.xpath('*')] or False) + + +def get_text_content(cell): + """Get the text content of an html cell + + Extract the text content of a cell in a html table. If the cell is part of a + merged header, join its text with a "/" with the text of the cell below it. + + Args: + cell (HtmlElement): Html cell or + cell_is_header (bool, optional): Is the cell a header. Default to False. + + Returns: + str: Text content at the root of an html cell. + + """ + + colspan = int(cell.attrib.get('colspan', 1)) + + cell_is_header = True if cell.tag == 'th' else False + cell_text = " ".join( + [i for i in cell.itertext() if i not in ('\\n',)]).strip() or "n/a" + + if (colspan > 1 or cell.attrib.get('Html2Dict_merged') == "True") and cell_is_header: + + cell.attrib['Html2Dict_merged'] = "True" + cell.attrib['colspan'] = str(colspan - 1) + next_cell_below = cell.getparent().getnext()[0] + cell.getparent().getnext().remove(next_cell_below) + + cell_text = "/".join([ + cell_text, + get_text_content(cell=next_cell_below) + ]) + + return cell_text + + return cell_text diff --git a/setup.py b/setup.py index ed1abb0..0a1be92 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ author_email=EMAIL, python_requires=REQUIRES_PYTHON, url=URL, - py_modules=['html2dict'], + packages=['html2dict'], install_requires=REQUIRED, license='MIT', classifiers=[ From 04c3fd71cdb4025a8c6dc0f6fdfe30ae37efe435 Mon Sep 17 00:00:00 2001 From: B-Souty Date: Sun, 12 Aug 2018 12:28:02 +0200 Subject: [PATCH 12/18] Update Docstring --- html2dict/resources.py | 1 - 1 file changed, 1 deletion(-) diff --git a/html2dict/resources.py b/html2dict/resources.py index c62f9d4..45b0d11 100644 --- a/html2dict/resources.py +++ b/html2dict/resources.py @@ -126,7 +126,6 @@ def get_text_content(cell): Args: cell (HtmlElement): Html cell or - cell_is_header (bool, optional): Is the cell a header. Default to False. Returns: str: Text content at the root of an html cell. From 71eef99073d3fde2c6a55141f965f17b099b4727 Mon Sep 17 00:00:00 2001 From: B-Souty Date: Sun, 12 Aug 2018 12:35:09 +0200 Subject: [PATCH 13/18] update README --- README.MD | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/README.MD b/README.MD index a15e677..0d0d065 100644 --- a/README.MD +++ b/README.MD @@ -54,20 +54,26 @@ my_extractor = BasicTableExtractor.from_url(url=) You can access the extracted tables from the basic_tables attribute. -For example for https://www.python.org/downloads/release/python-370/ - ```python my_extractor.basic_tables - -{'table_0': } ``` Finally, the data of the table can be accessed from the attributes data_rows or rows. ```python -my_extractor.basic_tables['table_0'] +my_extractor.basic_tables[].rows +``` + +For example for https://www.python.org/downloads/release/python-370/ + +```python +my_extractor = BasicTableExtractor.from_url(url="https://www.python.org/downloads/release/python-370/") +my_extractor.basic_tables + +{'table_0': } + +pprint(my_extractor.basic_tables['table_0'].rows) -pprint(test_html3.basic_tables['table_0'].rows) {'data': [{'Description': 'n/a', 'File Size': '22745726', 'GPG': 'SIG', @@ -141,5 +147,4 @@ pprint(test_html3.basic_tables['table_0'].rows) 'File Size', 'GPG']]} - ``` From db1dc8a88a70514fa06249a321b193856519c3e7 Mon Sep 17 00:00:00 2001 From: B-Souty Date: Sun, 12 Aug 2018 14:53:53 +0200 Subject: [PATCH 14/18] Add tests --- tests/__init__.py | 0 tests/simple_server.py | 40 ++++++++++ tests/test_data.json | 1 + tests/test_html2dict.py | 39 ++++++++++ tests/test_tables.html | 161 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 241 insertions(+) create mode 100644 tests/__init__.py create mode 100755 tests/simple_server.py create mode 100644 tests/test_data.json create mode 100644 tests/test_html2dict.py create mode 100644 tests/test_tables.html diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/simple_server.py b/tests/simple_server.py new file mode 100755 index 0000000..49dd543 --- /dev/null +++ b/tests/simple_server.py @@ -0,0 +1,40 @@ +#! /usr/bin/env python3 + +import sys +from http.server import BaseHTTPRequestHandler, HTTPServer + + +TEST_HTML_FILE = sys.argv[1] +TEST_HTML_STRING = open(TEST_HTML_FILE, 'r').read() + + +class HTTPServer_RequestHandler(BaseHTTPRequestHandler): + + # GET + def do_GET(self): + # Send response status code + self.send_response(200) + + # Send headers + self.send_header('Content-type', 'text/html') + self.end_headers() + + # Send message back to client + message = TEST_HTML_STRING + # Write content as utf-8 data + self.wfile.write(bytes(message, "utf8")) + return + + +def start_server(): + + print('starting server...') + server_address = ('127.0.0.1', 8081) + httpd = HTTPServer(server_address, HTTPServer_RequestHandler) + print('running server...') + httpd.serve_forever() + + + +if __name__ == "__main__": + start_server() diff --git a/tests/test_data.json b/tests/test_data.json new file mode 100644 index 0000000..c4711ab --- /dev/null +++ b/tests/test_data.json @@ -0,0 +1 @@ +[[{"col_0": "a", "col_1": "b", "col_2": "c"}, {"col_0": "1", "col_1": "2", "col_2": "3"}, {"col_0": "x", "col_1": "y", "col_2": "z"}], [{"col_0": "Fruit", "col_1": "Color", "col_2": "Taste"}, {"col_0": "Strawberry", "col_1": "Red", "col_2": "Good"}, {"col_0": "Pear", "col_1": "Green", "col_2": "Bad"}], [{"Fruit": "Strawberry", "Color": "Red"}, {"Fruit": "Pear", "Color": "Green"}], [{"Fruit/Name": "Strawberry", "Fruit/Color": "Red"}, {"Fruit/Name": "Pear", "Fruit/Color": "Green"}], [{"Fruit/Name": "Strawberry", "Fruit/Color": "Red", "Vegetable/Name": "Brocoli", "Vegetable/Color": "Green", "Nut": "Cashew"}, {"Fruit/Name": "Pear", "Fruit/Color": "Green", "Vegetable/Name": "Radish", "Vegetable/Color": "Red", "Nut": "Peanut"}], [{"col_0": "a", "col_1": "b", "col_2": "c"}, {"col_0": "1", "col_1": "2", "col_2": "3"}]] \ No newline at end of file diff --git a/tests/test_html2dict.py b/tests/test_html2dict.py new file mode 100644 index 0000000..b8f9fd3 --- /dev/null +++ b/tests/test_html2dict.py @@ -0,0 +1,39 @@ +from html2dict.extractors import BasicTableExtractor +import subprocess +import json +import os + + +TEST_DATA_FOLDER = "tests" +TEST_HTML_FILE = os.path.join(TEST_DATA_FOLDER, "test_tables.html") +TEST_HTML_STRING = open(TEST_HTML_FILE, 'r').read() + +SIMPLE_SERVER = os.path.join(TEST_DATA_FOLDER, 'simple_server.py') +subprocess.Popen([SIMPLE_SERVER, TEST_HTML_FILE]) + +VALIDATION_FILE = os.path.join(TEST_DATA_FOLDER, 'test_data.json') +VALIDATION_DATA = json.load(open(VALIDATION_FILE, 'r')) + + +def test_basic_table_from_string(): + + test_html = BasicTableExtractor.from_html_string(TEST_HTML_STRING) + data_rows = [test_html.basic_tables[table].data_rows for table in test_html.basic_tables] + + assert data_rows == VALIDATION_DATA + + +def test_basic_table_from_file(): + + test_html = BasicTableExtractor.from_html_file(TEST_HTML_FILE) + data_rows = [test_html.basic_tables[table].data_rows for table in test_html.basic_tables] + + assert data_rows == VALIDATION_DATA + + +def test_basic_table_from_url(): + + test_html = BasicTableExtractor.from_url(url="http://127.0.0.1:8081") + data_rows = [test_html.basic_tables[table].data_rows for table in test_html.basic_tables] + + assert data_rows == VALIDATION_DATA diff --git a/tests/test_tables.html b/tests/test_tables.html new file mode 100644 index 0000000..94eb41b --- /dev/null +++ b/tests/test_tables.html @@ -0,0 +1,161 @@ + + + + + Title + + + +# most simple table, no caption + + + + + + + + + + + + + + + + +
    abc
    123
    xyz
    + + + + + + + + + + + + + + + + + + +
    Basic table, NO headers
    FruitColorTaste
    StrawberryRedGood
    PearGreenBad
    + + + + + + + + + + + + + + + + +
    Basic table, With headers
    FruitColor
    StrawberryRed
    PearGreen
    + + + + + + + + + + + + + + + + + + + + +
    Basic table, with merged headers
    Fruit
    NameColor
    StrawberryRed
    PearGreen
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Complex table with multiple merged headers
    FruitVegetableNut
    NameColorNameColor
    StrawberryRedBrocoliGreenCashew
    PearGreenRadishRedPeanut
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Complex table, with multiple merged headers on multiple levels.
    FoodSimple unmerged header
    FruitVegetable
    NameColorNameColor
    StrawberryRedBrocoliGreen'row #1'
    PearGreenRadishRed'row #2'
    + + + + + + + + + + + + +
    abc
    123
    + + + + \ No newline at end of file From 92e278c715c37ea7db45f42be83becbd2674083a Mon Sep 17 00:00:00 2001 From: B-Souty Date: Mon, 13 Aug 2018 23:42:19 +0200 Subject: [PATCH 15/18] Update README.MD --- README.MD | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.MD b/README.MD index 0d0d065..622a89c 100644 --- a/README.MD +++ b/README.MD @@ -36,7 +36,7 @@ This project is still very new, if the type of table you are parsing is not in t Start by importing the desired type of extractor. (Only one available currently). ```Python -from html2dict import BasicTableExtractor +from html2dict.exctractors import BasicTableExtractor ``` Then instantiate an object with one of the 3 constructors provided @@ -64,7 +64,9 @@ Finally, the data of the table can be accessed from the attributes data_rows or my_extractor.basic_tables[].rows ``` -For example for https://www.python.org/downloads/release/python-370/ +## Examples + +* for https://www.python.org/downloads/release/python-370/ ```python my_extractor = BasicTableExtractor.from_url(url="https://www.python.org/downloads/release/python-370/") From 7db4a247a505831096334decbdd6f93e993b9da7 Mon Sep 17 00:00:00 2001 From: B-Souty Date: Sat, 18 Aug 2018 20:30:08 +0200 Subject: [PATCH 16/18] Update README.MD Fix typo --- README.MD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.MD b/README.MD index 622a89c..130a026 100644 --- a/README.MD +++ b/README.MD @@ -36,7 +36,7 @@ This project is still very new, if the type of table you are parsing is not in t Start by importing the desired type of extractor. (Only one available currently). ```Python -from html2dict.exctractors import BasicTableExtractor +from html2dict.extractors import BasicTableExtractor ``` Then instantiate an object with one of the 3 constructors provided From 90b6d0b1874e96848080d2fddae3f483348ac911 Mon Sep 17 00:00:00 2001 From: B-Souty Date: Sat, 18 Aug 2018 20:43:14 +0200 Subject: [PATCH 17/18] Update version to 0.2 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0a1be92..495633f 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ EMAIL = 'benjamin.souty@gmail.com' AUTHOR = 'B-Souty' REQUIRES_PYTHON = '>=3.6.0' -VERSION = '0.1.1' +VERSION = '0.2' REQUIRED = [ 'lxml', From 833b8d2912ad02dc5833803295727dc327ad1b65 Mon Sep 17 00:00:00 2001 From: B-Souty Date: Sat, 18 Aug 2018 20:49:55 +0200 Subject: [PATCH 18/18] Update README.MD --- README.MD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.MD b/README.MD index 130a026..7889b34 100644 --- a/README.MD +++ b/README.MD @@ -15,7 +15,7 @@ Simple html tables extractor. ## Installing Create and activate a new Python virtual environment then install this dev branch with: - * `pip3 install git+https://github.com/B-Souty/html2dict@wip/issue2/main` + * `pip3 install html2dict` ## Capabilities