diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b632fa5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+data/*
+files/*
+!data/test_data.jsonl
+
+.vscode/
+bak/
+test_scripts/
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..f2f7908
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 TU/e and EPFL
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1d81ddb
--- /dev/null
+++ b/README.md
@@ -0,0 +1,63 @@
+# IRIS Virtual Patent Marking Pages Classifier
+Tool that helps a human classify a list of potential virtual patent marking (VPM) pages into one of several categories. Part of the IRIS project.
+
+The classifier is written in Python, using the PyQt5 library.
+
+It creates a GUI browser that shows the detected pages one at a time.
+
+You can interact with the browser with the mouse, and you can also use the numeric keypad to select one of the categories.
+
+Once you have chosen the right category for a page, the software moves to the next one.
+
+## Set up the classifier
+The recommended steps are:
+1. Install [Git](https://git-scm.com/)
+2. Clone this repository with ``git clone https://gitlab.tue.nl/iris/iris-vpm-pages-classifier.git``
+3. Install [Miniconda](https://docs.conda.io/en/latest/miniconda.html)
+4. Create an environment with
+   * ``conda create -n iris-vpm-pages-classifier python=3.9``
+   * ``conda activate iris-vpm-pages-classifier``
+   * ``pip install -r requirements.txt``
+   * ``pip install git+https://gitlab.tue.nl/iris/iris-utils.git``
+5. If you need to use the pre-classifier, you must also install a headless browser with the following command
+ ``playwright install chromium``
+ Note: the code has been tested with Chromium v857950, but the latest version of the browser will be installed
+
+### GUI classifier on WSL2
+1. Install ``qt5-default`` on the WSL2 distro
+2. Install X410 on Windows (the free alternatives did not work for me) and select ``Allow Public Access`` from its menu
+3. Add the following line to the ``~/.bashrc`` file of the WSL2 distro (before the Conda initialization block)
+``export DISPLAY=$(awk '/nameserver / {print $2; exit}' /etc/resolv.conf 2>/dev/null):0.0``
+Instead, do not add ``export LIBGL_ALWAYS_INDIRECT=1``, as advised in many online guides.
+
+## Pre-processing
+Before you start classifying the pages by hand, you must run ``pre-classify.py`` to classify some pages automatically.
+This script will create a file with five main categories: cases that are (a) very likely true positives; (b) very likely false positives; (c) maybe positive; (d) maybe negative; (e) unknown.
+
+The first two cases are classified automatically. For the next two, a hint is provided and the person must decide whether the page is actually a VPM page or not. The last case is left to the person, without any hint.
+
+To use it you need software that is easy to install on GNU/Linux but hard to set up on MS-Windows. The advice, therefore, is to use a GNU/Linux machine (the instructions that follow are for Debian GNU/Linux) or WSL2 (running the GUI classifier from WSL2 is not trivial but possible; see the "GUI classifier on WSL2" section above). A short sketch of how these tools are used follows the list.
+1. Install [Tesseract](https://tesseract-ocr.github.io/) with
+``sudo apt install tesseract-ocr`` +2. Install [Poppler](https://poppler.freedesktop.org/)
+``sudo apt install poppler-utils``
+
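+These two packages back the ``pdf2image`` and ``pytesseract`` calls that the scripts use to OCR PDF documents. As a minimal sketch of that OCR path (``files/example.pdf`` is a hypothetical input file):
+
+```python
+from pdf2image import convert_from_path  # needs poppler-utils
+import pytesseract                       # needs tesseract-ocr
+
+# Render each PDF page to a grayscale image and OCR it, roughly as post-classify.py does
+pages = convert_from_path('files/example.pdf', grayscale=True)
+text = ''.join(pytesseract.image_to_string(page, lang='eng') for page in pages)
+print(text[:500])
+```
+
+To run the automatic classifier, please run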
+``python pre-classify.py -I data/scraping_results.jsonl data/websites_to_exclude.txt -o data/pre_classified.jsonl``
+
+## Populate the database
+Once the data have been analyzed by the pre-classifier, you must use its output to populate the database that will be used by the classifier.
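+
+The database is a [Flata](https://github.com/harryho/flata) JSON store with a table named ``iris_vpm_pages_classifier``; once populated, it can be inspected from Python. Here is a minimal sketch (assuming the database has been written to ``data/database.json``, as in the commands below):
+
+```python
+from flata import Flata, Query, JSONStorage
+
+# Open the database created by write-database.py
+db = Flata('data/database.json', storage=JSONStorage)
+table = db.table('iris_vpm_pages_classifier')
+
+# Count the pages that still have to be classified by hand
+to_classify = (Query().vpm_page_classification == None) & (Query().vpm_page != None)
+print(table.count(to_classify), 'URLs left to classify')
+```
+
+To populate it, please run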
+``python write-database.py -I data/scraping_results.jsonl data/pre_classified.jsonl -o data/database.json``
+
+If you want to split the data into sub-databases, so that each person has their own data to classify, you can run
+``python write-database.py -I data/scraping_results.jsonl data/pre_classified.jsonl -o data/database.json -O N``
+where ``N`` is the number of files that you want to generate. + +Note: you cannot overwrite the database once created (you can only update it, if not using the specific commands of [Flata](https://github.com/harryho/flata)). If you want to do so, you must delete the written files and re-run the script. + +## Run the classifier +1. Remember, each time, to activate the conda environment created in the setup phase with ``conda activate iris-vpm-pages-classifier`` +2. Run ``python classify.py -i data/database.json`` + +## Acknowledgements +The authors thank the EuroTech Universities Alliance for sponsoring this work. Carlo Bottai was supported by the European Union's Marie Skłodowska-Curie programme for the project Insights on the "Real Impact" of Science (H2020 MSCA-COFUND-2016 Action, Grant Agreement No 754462). diff --git a/classify.py b/classify.py new file mode 100644 index 0000000..b818e89 --- /dev/null +++ b/classify.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python + +""" +Tool to help a human being to classify the scraped VPM pages + into several categories + +It creates a GUI browser that shows sequentially one of the detected pages. + You can interact with the browser with the mouse and you can also use the + numerical pad of the keyboard to select one of the categories. Once you + have chosen the right category for a page the software moves to the next. + +Author: Carlo Bottai +Copyright (c) 2020 - TU/e and EPFL +License: See the LICENSE file. +Date: 2020-10-16 + +""" + +from PyQt5.QtCore import * +from PyQt5.QtWidgets import * +from PyQt5.QtGui import * +from PyQt5.QtWebEngineWidgets import * +import qtawesome as qta +import sys +import webbrowser +from flata import Flata, Query, JSONStorage +import requests +from iris_utils.parse_args import parse_io + + +USER_AGENT = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) ' + 'Gecko/2009021910 Firefox/3.0.7') + + +class MainWindow(QMainWindow): + def __init__(self, *args, **kwargs): + super(MainWindow, self).__init__(*args, **kwargs) + + args = parse_io() + self.f_in = args.input + + self.read_data() + + self.view = QWebEngineView() + self.view.settings() \ + .setAttribute(QWebEngineSettings.PluginsEnabled, True) + self.setCentralWidget(self.view) + + self.status = QStatusBar() + self.setStatusBar(self.status) + + navtb = QToolBar('Navigation') + self.addToolBar(navtb) + + back_btn = QAction(qta.icon('fa5s.arrow-left'), 'Back', self) + back_btn.triggered.connect(lambda: self.view.back()) + navtb.addAction(back_btn) + + next_btn = QAction(qta.icon('fa5s.arrow-right'), 'Forward', self) + next_btn.triggered.connect(lambda: self.view.forward()) + navtb.addAction(next_btn) + + navtb.addSeparator() + + self.urlbar = QLineEdit() + self.urlbar.returnPressed.connect(self.go_to_url) + navtb.addWidget(self.urlbar) + + navtb.addSeparator() + + reload_btn = QAction(qta.icon('fa5s.redo'), 'Reload', self) + reload_btn.triggered.connect(lambda: self.view.reload()) + navtb.addAction(reload_btn) + + stop_btn = QAction(qta.icon('fa5s.stop'), 'Stop', self) + stop_btn.triggered.connect(lambda: self.view.stop()) + navtb.addAction(stop_btn) + + open_btn = QAction( + qta.icon('fa5s.external-link-square-alt'), 'Open', self) + open_btn.triggered.connect(lambda: \ + webbrowser.open_new_tab(self.urlbar.text())) + navtb.addAction(open_btn) + + labtb = QToolBar('Labeling') + self.addToolBar(Qt.RightToolBarArea, labtb) + + for name, idx in [ + ('VPM page | True patent-product link', 1), + ('Brochure or description of the product | True patent-product link', 
2), + ('Hybrid document | True patent-product link', 3), + ('List of patents or metadata of a patent | False patent-product link', 4), + ('A scientific publication | False patent-product link', 5), + ('News about the patent | False patent-product link', 6), + ('CV/resume | False patent-product link', 7), + ('Something else in a website to keep | False patent-product link', 8), + ('Something else in a website to exclude | False patent-product link', 9), + ('The document is unreachable | False patent-product link', 0)]: + label = QAction(f'{name} ({idx})', self) + label.setShortcut(str(idx)) + label.triggered.connect(lambda checked, lbl=name: self.label_page(lbl)) + labtb.addAction(label) + + #labtb.addSeparator() + + urls_len_lbl = f'{self.data_to_classify_len} URLs left to classify' + self.status.showMessage(urls_len_lbl) + + self.open_next_page() + + self.show() + + self.setWindowTitle('VPM pages handmade classifier') + + def read_data(self): + DB = Flata(self.f_in, storage=JSONStorage) + self.database = DB.table('iris_vpm_pages_classifier') + + to_classify = \ + (Query().vpm_page_classification==None) & \ + (Query().vpm_page!=None) + self.data_to_classify = iter(self.database.search(to_classify)) + self.data_to_classify_len = self.database.count(to_classify) + + def go_to_url(self, url=None): + if url is None: + url = self.urlbar.text() + else: + self.urlbar.setText(url) + self.urlbar.setCursorPosition(0) + + try: + response = requests.head( + url, + headers={'User-Agent': USER_AGENT}, + verify=False, + allow_redirects=True, + timeout=10) + headers = response.headers + content_type = headers['Content-Type'] + if 'Content-Disposition' in headers: + content_disposition = headers['Content-Disposition'] + else: + content_disposition = '' + if not (content_type.startswith('text/html') or \ + content_type.startswith('application/pdf') or \ + content_type.startswith('text/plain')) or \ + content_disposition.startswith('attachment'): + self.msgBox = QMessageBox.about( + self, + 'Additional information (DOWNLOAD)', + ('It is possible that it is needed to download the next ' + 'document.\nIf you do not see the page changing, try to ' + 'open the page in a browser by clicking on ' + 'the appropriate button')) + except: + pass + + url = QUrl(url) + + if url.scheme() == '': + url.setScheme('https') + + self.view.setUrl(url) + + def open_next_page(self): + try: + self.current_data = next(self.data_to_classify) + while self.current_data['vpm_page_classification']: + self.current_data = next(self.data_to_classify) + + INFO_MSG = { + 'COPYRIGHT': + ('The information about the patent(s) has been ' + 'detected close to the copyright information ' + 'at the bottom of the document.\n' + 'Please, confirm whether or not there is a link ' + 'between a patent and a product in this document'), + 'NOCORPUS': + ('No information about any of the patents has been ' + 'detected in the document.\nPlease, confirm whether ' + 'or not there is a link between a patent and a ' + 'product in this document'), + 'NOCORPUS+IMG': + ('The only information about the patent(s) ' + 'has been detected in one of the pictures ' + 'of the document.\nPlease, confirm whether ' + 'or not there is a link between a patent ' + 'and a product in this document'), + 'NOCORPUS+PATNUMINURL': + ('The only information about the patent(s) ' + 'has been detected in the URL ' + 'of the document.\nPlease, confirm whether ' + 'or not there is a link between a patent ' + 'and a product in this document')} + vpm_page_automatic_classification = 
self.current_data[ + 'vpm_page_automatic_classification'] + vpm_page_automatic_classification_info = \ + vpm_page_automatic_classification \ + .split(' | ')[1] + if vpm_page_automatic_classification_info in INFO_MSG.keys(): + vpm_page_automatic_classification_msg = INFO_MSG[ + vpm_page_automatic_classification_info] + self.msgBox = QMessageBox.about( + self, + f'Additional information ({vpm_page_automatic_classification_info})', + vpm_page_automatic_classification_msg) + + print('\n+++++++++++++++++++++++++++') + print(f"Patent assignee: {self.current_data['patent_assignee']}") + try: + print(f"Award recipient: {self.current_data['award_recipient']}") + except Exception: + pass + print(f"Patents: {self.current_data['patent_id']}") + print('+++++++++++++++++++++++++++\n') + + url = self.current_data['vpm_page'] + self.go_to_url(url) + + except: + print('\n+++++++++++++++++++++++++++') + print('No other pages left. Well done!') + print('+++++++++++++++++++++++++++\n') + self.close() + + def label_page(self, label): + updated_info = self.database.update( + {'vpm_page_classification': label}, + Query().vpm_page==self.current_data['vpm_page']) + updated_ids = updated_info[0] + + # Reduce the number of pages left by one + # and show this information in the status bar + self.data_to_classify_len -= len(updated_ids) + urls_len_lbl = f'{self.data_to_classify_len} URLs left to classify' + self.status.showMessage(urls_len_lbl) + + self.open_next_page() + self.update() + +if __name__ == "__main__": + app = QApplication(sys.argv) + app.setApplicationName('VPM pages handmade classifier') + window = MainWindow() + app.exec_() + diff --git a/data/test_data.jsonl b/data/test_data.jsonl new file mode 100644 index 0000000..e69de29 diff --git a/post-classify.py b/post-classify.py new file mode 100644 index 0000000..3ff25b1 --- /dev/null +++ b/post-classify.py @@ -0,0 +1,671 @@ +#!/usr/bin/env python + +""" +Tool to post-process the output of the classification phase. + +For each page identified as a "true VPM page" the script checks + which patents, among the possible ones for that specific page, + are actually present in the page and which not. +It returns, for each entry of the database, a JSON line of the type + {'vpm_page': 'URL_OF_THE_PAGE', + 'is_true_vpm_page': true/false, + 'is_patent_in_page': [(PATENT_NUMBER: true/false), + (PATENT_NUMBER: true/false)]} + +Author: Carlo Bottai +Copyright (c) 2021 - TU/e and EPFL +License: See the LICENSE file. 
+Date: 2021-05-08 + +""" + + +## LIBRARIES ## + +import numpy as np +import pandas as pd +import os +import pathlib +from io import BytesIO +from hashlib import md5 +from nltk.tokenize import sent_tokenize +import re +from urllib.parse import urlparse +from os.path import splitext +from datetime import datetime +import json + +from flata import Flata, JSONStorage + +from bs4 import BeautifulSoup as beautiful_soup +import html5lib + +import pdfminer.high_level as pdfminer +from pdfminer.pdfparser import PDFParser +from pdfminer.pdfdocument import PDFDocument +import pdf2image +import pytesseract + +from striprtf.striprtf import rtf_to_text + +import asyncio +import aiofiles + +from aiohttp import ClientSession, BadContentDispositionHeader + +from tqdm.asyncio import tqdm as aio_tqdm +import warnings + +from iris_utils.parse_args import parse_io + + +## TYPE HINTS ## + +from typing import List, Tuple, Set, TypedDict +from pathlib import PosixPath +from flata.database import Table as fa_Table +class LineDict(TypedDict): + db_id: int + vpm_page: str + patent_ids: int + + +## WARNINGS SUPPRESSION ## + +# Suppress PDF text extraction not allowed warning +# and any other warning from the `pdfminer` module +warnings.filterwarnings('ignore', module = 'pdfminer') + +# Suppress BadContentDispositionHeader warning +# from the `aiohttp` module +warnings.simplefilter('ignore', BadContentDispositionHeader) + + +################# +# SETTINGS # +################# + +# Name of the filder where the local copy of the pages have been saved +files_folder = 'files' + +# User agent +# Useful for both the type of documents (HTML and others) considered in the script +USER_AGENT = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) ' + 'Gecko/2009021910 Firefox/3.0.7') + +# Length of the texts in the corpus of each document +# Number of characters, extracted from the full text of each document, +# before and after the keywords defined afterward +CONTEXT_SPAN = 500 + +# Choose the name of the log file where eventual errors will be reported +# The file will have a name like pre_classify_%Y_%m_%d_%H_%M.log +LOG_FILE = 'post_classify' + +## ASYNCIO SETTINGS ## + +# Run no more than 25 tasks at a time +NUM_CONCURRENT_TASKS = 25 +SEMAPHORE = asyncio.Semaphore(NUM_CONCURRENT_TASKS) + +################ +# REGEX # +################ + +# Punctuation characters that will be removed +PUNCT_RE = re.compile(r'[\n\f\r\t\x0A\x0C\x0D\x09\s]+') + +# Regular expressions used to convert a URL into a file name +HTTPWWW_RE = re.compile(r'^(.*:\/\/)?(www\.)?', flags = re.IGNORECASE) +NOALPHA_RE = re.compile(r'\W') + +# Regular expression used to remove the sentences about cookie or privacy policy +# Useful to remove useless portions of the headers and footers +COOKIE_RE = re.compile(r'(cookie)|(privacy policy)', flags=re.IGNORECASE) + +# Regular expression +PATNUM_RE = re.compile(r'\d{1,2},?\d{3},?\d{3}') + + +################# +# FUNCTIONS # +################# + +def read_input(f_in: str) -> fa_Table: + """ + Read the input file + """ + + DB = Flata(f_in, storage=JSONStorage) + database = DB.table('iris_vpm_pages_classifier') + + return database + +def generate_file_name(url: str) -> str: + """ + Given the URL provided, return a standardized file name + """ + + # Remove 'https://', 'ftp://' and similar things, and remove 'www' + file_name = HTTPWWW_RE.sub('', url) + + # Replace any non-alphanumeric chars with '_' + file_name = NOALPHA_RE.sub('_', file_name) + + # If the generated filename is longer than 250 bytes + # (i.e., 
about the lenght-limit for an ext4 file system), + # then use as name an hash hexdigest string + if len(file_name.encode()) >= 250: + file_name = md5(file_name.encode()).hexdigest() + + return file_name + +def which_content_type_exists(file_path: str) -> str: + """ + Returns the content type based on which file exists locally + Returns None if no file exists for the document of interest + """ + for content_type in ['html', 'txt', 'rtf', 'pdf', 'other']: + # NB PDF must always be the last one, since also HTML contents + # have a PDF version (and potentialy other types + # will do the same in the future) + type_path = file_path.replace('.pdf', f'.{content_type}') + if os.path.exists(type_path): + return content_type.upper() + return None + +async def get_content_type(url: str, file_path: str, requests_session: ClientSession) -> str: + """ + Determine the type of content returned by a GET request to the URL provided + The possible answers are: + - HTML, PDF, TXT (documents handled by the script) + - OTHER (documents unhandled by the script) + - FAILED (generic error while connecting with the remote source) + """ + + local_content_type = which_content_type_exists( + file_path = file_path) + if local_content_type: + return local_content_type + + # If the URL names a file that ends in *.pdf (*.txt) its a PDF (TXT) + url_path = urlparse(url).path + url_root, url_ext = splitext(url_path.lower()) + if url_ext.endswith('pdf'): + return 'PDF' + if url_ext.endswith('txt'): + return 'TXT' + + try: + # Require the HEAD for the URL + response = await requests_session.request( + method = 'HEAD', + url = url, + headers = {'User-Agent': USER_AGENT}, + allow_redirects = True, + ssl = False) + + # assert response.status in [200, 403] + + # Take the content-type from the HEAD + remote_content_type = response.content_type + + except: + return 'FAILED' + + # Is the content-type a PDF? + if remote_content_type and remote_content_type.startswith('application/pdf'): + return 'PDF' + + # Is the content-type an RTF? + if remote_content_type and remote_content_type.startswith('application/rtf'): + return 'RTF' + + # Is the content-type a plain text? + if remote_content_type and remote_content_type.startswith('text/plain'): + return 'TXT' + + # Is the content-type a stream of data? + if remote_content_type and remote_content_type.startswith('application/octet-stream'): + try: + # Take the content-disposition from the HEAD + content_disposition = response.content_disposition + # Take the filename field from the content-disposition + content_disposition = re.search(r'filename = "(.*)"', content_disposition) + except: + return 'FAILED' + # Is the file a PDF? + if content_disposition and \ + any([splitext(group.lower())[1].endswith('pdf') \ + for group in content_disposition.groups()]): + return 'PDF' + if content_disposition and \ + any([splitext(group.lower())[1].endswith('rtf') \ + for group in content_disposition.groups()]): + return 'RTF' + # Is the file a TXT? + if content_disposition and \ + any([splitext(group.lower())[1].endswith('txt') \ + for group in content_disposition.groups()]): + return 'TXT' + # Is the file something else? + else: + return 'OTHER' + + # Is the content-type an HTML? + if remote_content_type and remote_content_type.startswith('text/html'): + return 'HTML' + + # Is the content-type something else? 
+ return 'OTHER' + +async def get_content_from_url(url: str, requests_session: ClientSession) -> bytes: + """ + Download the document from the URL provided, store it locally and return it + """ + + try: + # Download the content from the URL + response = await requests_session.request( + method = 'GET', + url = url, + headers = {'User-Agent': USER_AGENT}, + allow_redirects = True, + ssl = False) + assert response.status == 200 + except: + text_bytes = b'' + else: + # Read the downloaded content + try: + text_bytes = await response.read() + except: + text_bytes = b'' + + # Return the content + return text_bytes + +async def get_text_from_txt(url: str, file_path: str, requests_session: ClientSession) -> str: + """ + Extract the text from the TXT file provided (or downloaded from the URL provided) + """ + + if os.path.exists(file_path): + with open(file_path, 'rb') as f_in: + text_bytes = f_in.read() + else: + text_bytes = await get_content_from_url( + url = url, + requests_session = requests_session) + text = text_bytes.decode(errors='ignore') + + return text + +async def get_text_from_pdf(url: str, file_path: str, requests_session: ClientSession, use_ocr = False) -> str: + """ + Extract the text from the PDF file provided (or downloaded from the URL provided) + """ + + if os.path.exists(file_path): + with open(file_path, 'rb') as f_in: + text_bytes = f_in.read() + else: + text_bytes = await get_content_from_url( + url = url, + requests_session = requests_session) + + if use_ocr: + try: + pdf_parser = PDFParser(BytesIO(text_bytes)) + pdf = PDFDocument(pdf_parser) + n_pages = pdf.catalog['Pages'].resolve()['Count'] + # Analyze the document only if it is shorter than 30 pages + if n_pages<30: + pages = pdf2image.convert_from_bytes(text_bytes, grayscale = True) + + text = '' + for page in pages: + page = pytesseract.image_to_string(page, lang = 'eng') + text += page + except: + text = '' + else: + try: + text = pdfminer.extract_text(BytesIO(text_bytes)) + except: + text = '' + + return text + +async def get_text_from_rtf(url: str, file_path: str, requests_session: ClientSession) -> str: + """ + Extract the text from the RTF file provided (or downloaded from the URL provided) + """ + + if os.path.exists(file_path): + with open(file_path, 'rb') as f_in: + text_bytes = f_in.read() + else: + text_bytes = await get_content_from_url( + url = url, + requests_session = requests_session) + + try: + text = text_bytes.decode(errors='ignore') + text = rtf_to_text(text) + except: + text = '' + + return text + +async def get_text_from_html(url: str, file_path: str, requests_session: ClientSession) -> Tuple[str, List[str]]: + """ + Extract the text from the body of the document, + using the local version of the website (or try to create one) + """ + + html_path = file_path.replace('.pdf', '.html') + + # Use the, previously stored, local HTML version of the URL, if exists + try: + if html_path and os.path.exists(html_path): + with open(html_path, 'r') as f_in: + html_soup = beautiful_soup(f_in, 'html5lib') + else: + text_bytes = await get_content_from_url( + url = url, + requests_session = requests_session) + html_soup = beautiful_soup(text_bytes, 'html5lib') + except: + text = '' + else: + try: + # Remove