diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b632fa5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+data/*
+files/*
+!data/test_data.jsonl
+
+.vscode/
+bak/
+test_scripts/
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..f2f7908
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 TU/e and EPFL
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1d81ddb
--- /dev/null
+++ b/README.md
@@ -0,0 +1,63 @@
+# IRIS Virtual Patent Marking Pages Classifier
+Tool to help a human classify a list of potential VPM (Virtual Patent Marking) pages into several categories. Part of the IRIS project.
+
+The classifier is written in Python, using the PyQt5 library.
+
+It creates a GUI browser that shows the detected pages one at a time.
+
+You can interact with the browser with the mouse, and you can also use the numeric keypad to select one of the categories.
+
+Once you have chosen the right category for a page, the software moves on to the next one.
+
+## Set up the classifier
+The recommended steps are:
+1. Install [Git](https://git-scm.com/)
+2. Clone this repository with ``git clone https://gitlab.tue.nl/iris/iris-vpm-pages-classifier.git``
+3. Install [Miniconda](https://docs.conda.io/en/latest/miniconda.html)
+4. Create an environment with
+ * ``conda create -n iris-vpm-pages-classifier python=3.9``
+ * ``conda activate iris-vpm-pages-classifier``
+ * ``pip install -r requirements.txt``
+ * ``pip install git+https://gitlab.tue.nl/iris/iris-utils.git``
+5. If you need to use the pre-classifier, you must also install a headless browser with the following command
+ ``playwright install chromium``
+ Note: the code has been tested with Chromium v857950, but the latest version of the browser will be installed
+
+### GUI classifier on WSL2
+1. Install ``qt5-default`` on the WSL2 distro
+2. Install X410 on Windows (the free alternatives did not work for me) and select ``Allow Public Access`` from its menu
+3. Add the following line to the ``~/.bashrc`` file of the WSL2 distro (before the Conda initialization block)
+``export DISPLAY=$(awk '/nameserver / {print $2; exit}' /etc/resolv.conf 2>/dev/null):0.0``
+However, do not add ``export LIBGL_ALWAYS_INDIRECT=1``, even though many online guides advise it.
+
+## Pre-processing
+Before you start classifying the pages by hand, you must run ``pre-classify.py`` to automatically classify some of them.
+This script will create a file with five main categories: cases that are (a) very likely true positives; (b) very likely false positives; (c) maybe positive; (d) maybe negative; (e) unknown.
+
+The first two cases are classified automatically. For the next two, a hint is provided and the person is asked to decide whether the page is actually a VPM page or not. The last case is left entirely to the person, without any hint.
+
+To use it you need software that is as easy to install on GNU/Linux as it is hard to get working on MS-Windows. The advice, therefore, is to use a GNU/Linux machine (the instructions that follow are for Debian GNU/Linux) or WSL2 (running the GUI classifier from WSL2 is not trivial but possible; follow the instructions above).
+1. Install [Tesseract](https://tesseract-ocr.github.io/) with
+``sudo apt install tesseract-ocr``
+2. Install [Poppler](https://poppler.freedesktop.org/) with
+``sudo apt install poppler-utils``
+
+To run the automatic classifier, please run
+``python pre-classify.py -I data/scraping_results.jsonl data/websites_to_exclude.txt -o data/pre_classified.jsonl``
+
+## Populate the database
+Once the data have been analyzed by the pre-classifier, you must use its output to populate a database that will be used by the classifier. To do so, please run
+``python write-database.py -I data/scraping_results.jsonl data/pre_classified.jsonl -o data/database.json``
+
+If you want to split the data into sub-databases, so that more than one person can have her/his own data to classify, you can run
+``python write-database.py -I data/scraping_results.jsonl data/pre_classified.jsonl -o data/database.json -O N``
+where ``N`` is the number of files that you want to generate.
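+For example, ``-O 3`` splits the data into three sub-databases.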
+
+Note: you cannot overwrite the database once it has been created (you can only update it, unless you use the specific commands of [Flata](https://github.com/harryho/flata) directly). If you want to rebuild it, you must delete the written files and re-run the script.
+
+## Run the classifier
+1. Each time, remember to activate the conda environment created during setup with ``conda activate iris-vpm-pages-classifier``
+2. Run ``python classify.py -i data/database.json``
+
+## Acknowledgements
+The authors thank the EuroTech Universities Alliance for sponsoring this work. Carlo Bottai was supported by the European Union's Marie Skłodowska-Curie programme for the project Insights on the "Real Impact" of Science (H2020 MSCA-COFUND-2016 Action, Grant Agreement No 754462).
diff --git a/classify.py b/classify.py
new file mode 100644
index 0000000..b818e89
--- /dev/null
+++ b/classify.py
@@ -0,0 +1,247 @@
+#!/usr/bin/env python
+
+"""
+Tool to help a human classify the scraped VPM pages
+ into several categories
+
+It creates a GUI browser that shows the detected pages one at a time.
+ You can interact with the browser with the mouse, and you can also use the
+ numeric keypad to select one of the categories. Once you have chosen the
+ right category for a page, the software moves on to the next one.
+
+Author: Carlo Bottai
+Copyright (c) 2020 - TU/e and EPFL
+License: See the LICENSE file.
+Date: 2020-10-16
+
+"""
+
+from PyQt5.QtCore import *
+from PyQt5.QtWidgets import *
+from PyQt5.QtGui import *
+from PyQt5.QtWebEngineWidgets import *
+import qtawesome as qta
+import sys
+import webbrowser
+from flata import Flata, Query, JSONStorage
+import requests
+from iris_utils.parse_args import parse_io
+
+
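+# User agent used for the HEAD requests sent before loading a page,
+# so that the probe looks like a regular browser visit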
+USER_AGENT = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) '
+ 'Gecko/2009021910 Firefox/3.0.7')
+
+
+class MainWindow(QMainWindow):
+ def __init__(self, *args, **kwargs):
+ super(MainWindow, self).__init__(*args, **kwargs)
+
+ args = parse_io()
+ self.f_in = args.input
+
+ self.read_data()
+
+ self.view = QWebEngineView()
+ self.view.settings() \
+ .setAttribute(QWebEngineSettings.PluginsEnabled, True)
+ self.setCentralWidget(self.view)
+
+ self.status = QStatusBar()
+ self.setStatusBar(self.status)
+
+ navtb = QToolBar('Navigation')
+ self.addToolBar(navtb)
+
+ back_btn = QAction(qta.icon('fa5s.arrow-left'), 'Back', self)
+ back_btn.triggered.connect(lambda: self.view.back())
+ navtb.addAction(back_btn)
+
+ next_btn = QAction(qta.icon('fa5s.arrow-right'), 'Forward', self)
+ next_btn.triggered.connect(lambda: self.view.forward())
+ navtb.addAction(next_btn)
+
+ navtb.addSeparator()
+
+ self.urlbar = QLineEdit()
+ self.urlbar.returnPressed.connect(self.go_to_url)
+ navtb.addWidget(self.urlbar)
+
+ navtb.addSeparator()
+
+ reload_btn = QAction(qta.icon('fa5s.redo'), 'Reload', self)
+ reload_btn.triggered.connect(lambda: self.view.reload())
+ navtb.addAction(reload_btn)
+
+ stop_btn = QAction(qta.icon('fa5s.stop'), 'Stop', self)
+ stop_btn.triggered.connect(lambda: self.view.stop())
+ navtb.addAction(stop_btn)
+
+ open_btn = QAction(
+ qta.icon('fa5s.external-link-square-alt'), 'Open', self)
+ open_btn.triggered.connect(lambda: \
+ webbrowser.open_new_tab(self.urlbar.text()))
+ navtb.addAction(open_btn)
+
+ labtb = QToolBar('Labeling')
+ self.addToolBar(Qt.RightToolBarArea, labtb)
+
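+        # One action per category; the number in parentheses is also the
+        # keyboard shortcut that applies that label to the current page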
+ for name, idx in [
+ ('VPM page | True patent-product link', 1),
+ ('Brochure or description of the product | True patent-product link', 2),
+ ('Hybrid document | True patent-product link', 3),
+ ('List of patents or metadata of a patent | False patent-product link', 4),
+ ('A scientific publication | False patent-product link', 5),
+ ('News about the patent | False patent-product link', 6),
+ ('CV/resume | False patent-product link', 7),
+ ('Something else in a website to keep | False patent-product link', 8),
+ ('Something else in a website to exclude | False patent-product link', 9),
+ ('The document is unreachable | False patent-product link', 0)]:
+ label = QAction(f'{name} ({idx})', self)
+ label.setShortcut(str(idx))
+ label.triggered.connect(lambda checked, lbl=name: self.label_page(lbl))
+ labtb.addAction(label)
+
+ #labtb.addSeparator()
+
+ urls_len_lbl = f'{self.data_to_classify_len} URLs left to classify'
+ self.status.showMessage(urls_len_lbl)
+
+ self.open_next_page()
+
+ self.show()
+
+ self.setWindowTitle('VPM pages handmade classifier')
+
+ def read_data(self):
+ DB = Flata(self.f_in, storage=JSONStorage)
+ self.database = DB.table('iris_vpm_pages_classifier')
+
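+        # Select the entries that still need a manual label: those with a
+        # 'vpm_page' set but no 'vpm_page_classification' yet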
+ to_classify = \
+ (Query().vpm_page_classification==None) & \
+ (Query().vpm_page!=None)
+ self.data_to_classify = iter(self.database.search(to_classify))
+ self.data_to_classify_len = self.database.count(to_classify)
+
+ def go_to_url(self, url=None):
+ if url is None:
+ url = self.urlbar.text()
+ else:
+ self.urlbar.setText(url)
+ self.urlbar.setCursorPosition(0)
+
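+        # Probe the URL with a HEAD request: if the response does not look
+        # like something the embedded browser can display (HTML, PDF or
+        # plain text), warn the user that the document may be downloaded
+        # instead of shown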
+ try:
+ response = requests.head(
+ url,
+ headers={'User-Agent': USER_AGENT},
+ verify=False,
+ allow_redirects=True,
+ timeout=10)
+ headers = response.headers
+ content_type = headers['Content-Type']
+ if 'Content-Disposition' in headers:
+ content_disposition = headers['Content-Disposition']
+ else:
+ content_disposition = ''
+ if not (content_type.startswith('text/html') or \
+ content_type.startswith('application/pdf') or \
+ content_type.startswith('text/plain')) or \
+ content_disposition.startswith('attachment'):
+ self.msgBox = QMessageBox.about(
+ self,
+ 'Additional information (DOWNLOAD)',
+                    ('The next document may need to be downloaded.\n'
+                     'If you do not see the page change, try opening it '
+                     'in a browser by clicking the appropriate button'))
+ except:
+ pass
+
+ url = QUrl(url)
+
+ if url.scheme() == '':
+ url.setScheme('https')
+
+ self.view.setUrl(url)
+
+ def open_next_page(self):
+ try:
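+            # Take the next entry, skipping any page that has already
+            # been classified in the meantime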
+ self.current_data = next(self.data_to_classify)
+ while self.current_data['vpm_page_classification']:
+ self.current_data = next(self.data_to_classify)
+
+ INFO_MSG = {
+ 'COPYRIGHT':
+ ('The information about the patent(s) has been '
+ 'detected close to the copyright information '
+ 'at the bottom of the document.\n'
+ 'Please, confirm whether or not there is a link '
+ 'between a patent and a product in this document'),
+ 'NOCORPUS':
+ ('No information about any of the patents has been '
+ 'detected in the document.\nPlease, confirm whether '
+ 'or not there is a link between a patent and a '
+ 'product in this document'),
+ 'NOCORPUS+IMG':
+ ('The only information about the patent(s) '
+ 'has been detected in one of the pictures '
+ 'of the document.\nPlease, confirm whether '
+ 'or not there is a link between a patent '
+ 'and a product in this document'),
+ 'NOCORPUS+PATNUMINURL':
+ ('The only information about the patent(s) '
+ 'has been detected in the URL '
+ 'of the document.\nPlease, confirm whether '
+ 'or not there is a link between a patent '
+ 'and a product in this document')}
+ vpm_page_automatic_classification = self.current_data[
+ 'vpm_page_automatic_classification']
+ vpm_page_automatic_classification_info = \
+ vpm_page_automatic_classification \
+ .split(' | ')[1]
+ if vpm_page_automatic_classification_info in INFO_MSG.keys():
+ vpm_page_automatic_classification_msg = INFO_MSG[
+ vpm_page_automatic_classification_info]
+ self.msgBox = QMessageBox.about(
+ self,
+ f'Additional information ({vpm_page_automatic_classification_info})',
+ vpm_page_automatic_classification_msg)
+
+ print('\n+++++++++++++++++++++++++++')
+ print(f"Patent assignee: {self.current_data['patent_assignee']}")
+ try:
+ print(f"Award recipient: {self.current_data['award_recipient']}")
+ except Exception:
+ pass
+ print(f"Patents: {self.current_data['patent_id']}")
+ print('+++++++++++++++++++++++++++\n')
+
+ url = self.current_data['vpm_page']
+ self.go_to_url(url)
+
+ except:
+ print('\n+++++++++++++++++++++++++++')
+ print('No other pages left. Well done!')
+ print('+++++++++++++++++++++++++++\n')
+ self.close()
+
+ def label_page(self, label):
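+        # Store the chosen label for the current page in the database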
+ updated_info = self.database.update(
+ {'vpm_page_classification': label},
+ Query().vpm_page==self.current_data['vpm_page'])
+ updated_ids = updated_info[0]
+
+ # Reduce the number of pages left by one
+ # and show this information in the status bar
+ self.data_to_classify_len -= len(updated_ids)
+ urls_len_lbl = f'{self.data_to_classify_len} URLs left to classify'
+ self.status.showMessage(urls_len_lbl)
+
+ self.open_next_page()
+ self.update()
+
+if __name__ == "__main__":
+ app = QApplication(sys.argv)
+ app.setApplicationName('VPM pages handmade classifier')
+ window = MainWindow()
+ app.exec_()
+
diff --git a/data/test_data.jsonl b/data/test_data.jsonl
new file mode 100644
index 0000000..e69de29
diff --git a/post-classify.py b/post-classify.py
new file mode 100644
index 0000000..3ff25b1
--- /dev/null
+++ b/post-classify.py
@@ -0,0 +1,671 @@
+#!/usr/bin/env python
+
+"""
+Tool to post-process the output of the classification phase.
+
+For each page identified as a "true VPM page" the script checks
+ which patents, among the possible ones for that specific page,
+ are actually present in the page and which are not.
+It returns, for each entry of the database, a JSON line of the type
+ {'vpm_page': 'URL_OF_THE_PAGE',
+ 'is_true_vpm_page': true/false,
+ 'is_patent_in_page': [(PATENT_NUMBER: true/false),
+ (PATENT_NUMBER: true/false)]}
+
+Author: Carlo Bottai
+Copyright (c) 2021 - TU/e and EPFL
+License: See the LICENSE file.
+Date: 2021-05-08
+
+"""
+
+
+## LIBRARIES ##
+
+import numpy as np
+import pandas as pd
+import os
+import pathlib
+from io import BytesIO
+from hashlib import md5
+from nltk.tokenize import sent_tokenize
+import re
+from urllib.parse import urlparse
+from os.path import splitext
+from datetime import datetime
+import json
+
+from flata import Flata, JSONStorage
+
+from bs4 import BeautifulSoup as beautiful_soup
+import html5lib
+
+import pdfminer.high_level as pdfminer
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
+import pdf2image
+import pytesseract
+
+from striprtf.striprtf import rtf_to_text
+
+import asyncio
+import aiofiles
+
+from aiohttp import ClientSession, BadContentDispositionHeader
+
+from tqdm.asyncio import tqdm as aio_tqdm
+import warnings
+
+from iris_utils.parse_args import parse_io
+
+
+## TYPE HINTS ##
+
+from typing import List, Tuple, Set, TypedDict
+from pathlib import PosixPath
+from flata.database import Table as fa_Table
+class LineDict(TypedDict):
+ db_id: int
+ vpm_page: str
+ patent_ids: int
+
+
+## WARNINGS SUPPRESSION ##
+
+# Suppress PDF text extraction not allowed warning
+# and any other warning from the `pdfminer` module
+warnings.filterwarnings('ignore', module = 'pdfminer')
+
+# Suppress BadContentDispositionHeader warning
+# from the `aiohttp` module
+warnings.simplefilter('ignore', BadContentDispositionHeader)
+
+
+#################
+# SETTINGS #
+#################
+
+# Name of the folder where the local copies of the pages have been saved
+files_folder = 'files'
+
+# User agent
+# Useful for both types of documents (HTML and others) considered in the script
+USER_AGENT = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) '
+ 'Gecko/2009021910 Firefox/3.0.7')
+
+# Length of the text snippets in the corpus of each document:
+# number of characters, extracted from the full text of each document,
+# kept before and after the keywords defined later in the script
+CONTEXT_SPAN = 500
+
+# Choose the name of the log file where any errors will be reported
+# The file will have a name like post_classify_%Y_%m_%d_%H_%M.log
+LOG_FILE = 'post_classify'
+
+## ASYNCIO SETTINGS ##
+
+# Run no more than 25 tasks at a time
+NUM_CONCURRENT_TASKS = 25
+SEMAPHORE = asyncio.Semaphore(NUM_CONCURRENT_TASKS)
+
+################
+# REGEX #
+################
+
+# Whitespace and control characters (newlines, form feeds, tabs, etc.) that will be removed
+PUNCT_RE = re.compile(r'[\n\f\r\t\x0A\x0C\x0D\x09\s]+')
+
+# Regular expressions used to convert a URL into a file name
+HTTPWWW_RE = re.compile(r'^(.*:\/\/)?(www\.)?', flags = re.IGNORECASE)
+NOALPHA_RE = re.compile(r'\W')
+
+# Regular expression used to remove the sentences about cookie or privacy policy
+# Useful to remove useless portions of the headers and footers
+COOKIE_RE = re.compile(r'(cookie)|(privacy policy)', flags=re.IGNORECASE)
+
+# Regular expression used to detect patent numbers
+# (7- or 8-digit numbers, with optional thousands separators)
+PATNUM_RE = re.compile(r'\d{1,2},?\d{3},?\d{3}')
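+# Illustrative matches (hypothetical numbers): '7,654,321', '7654321', '10,123,456'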
+
+
+#################
+# FUNCTIONS #
+#################
+
+def read_input(f_in: str) -> fa_Table:
+ """
+ Read the input file
+ """
+
+ DB = Flata(f_in, storage=JSONStorage)
+ database = DB.table('iris_vpm_pages_classifier')
+
+ return database
+
+def generate_file_name(url: str) -> str:
+ """
+ Given the URL provided, return a standardized file name
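+    For instance (hypothetical URL), 'https://www.example.com/patents.pdf'
+     is turned into 'example_com_patents_pdf'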
+ """
+
+ # Remove 'https://', 'ftp://' and similar things, and remove 'www'
+ file_name = HTTPWWW_RE.sub('', url)
+
+ # Replace any non-alphanumeric chars with '_'
+ file_name = NOALPHA_RE.sub('_', file_name)
+
+    # If the generated filename is longer than 250 bytes
+    # (i.e., about the length limit for an ext4 file system),
+    # then use an MD5 hash hexdigest string as the name
+ if len(file_name.encode()) >= 250:
+ file_name = md5(file_name.encode()).hexdigest()
+
+ return file_name
+
+def which_content_type_exists(file_path: str) -> str:
+ """
+ Returns the content type based on which file exists locally
+ Returns None if no file exists for the document of interest
+ """
+ for content_type in ['html', 'txt', 'rtf', 'pdf', 'other']:
+        # NB: PDF must always be the last one, since HTML contents also
+        # have a PDF version (and potentially other types
+        # will do the same in the future)
+ type_path = file_path.replace('.pdf', f'.{content_type}')
+ if os.path.exists(type_path):
+ return content_type.upper()
+ return None
+
+async def get_content_type(url: str, file_path: str, requests_session: ClientSession) -> str:
+ """
+ Determine the type of content returned by a GET request to the URL provided
+ The possible answers are:
+     - HTML, PDF, RTF, TXT (documents handled by the script)
+ - OTHER (documents unhandled by the script)
+ - FAILED (generic error while connecting with the remote source)
+ """
+
+ local_content_type = which_content_type_exists(
+ file_path = file_path)
+ if local_content_type:
+ return local_content_type
+
+    # If the URL names a file that ends in *.pdf (*.txt), it's a PDF (TXT)
+ url_path = urlparse(url).path
+ url_root, url_ext = splitext(url_path.lower())
+ if url_ext.endswith('pdf'):
+ return 'PDF'
+ if url_ext.endswith('txt'):
+ return 'TXT'
+
+ try:
+ # Require the HEAD for the URL
+ response = await requests_session.request(
+ method = 'HEAD',
+ url = url,
+ headers = {'User-Agent': USER_AGENT},
+ allow_redirects = True,
+ ssl = False)
+
+ # assert response.status in [200, 403]
+
+ # Take the content-type from the HEAD
+ remote_content_type = response.content_type
+
+ except:
+ return 'FAILED'
+
+ # Is the content-type a PDF?
+ if remote_content_type and remote_content_type.startswith('application/pdf'):
+ return 'PDF'
+
+ # Is the content-type an RTF?
+ if remote_content_type and remote_content_type.startswith('application/rtf'):
+ return 'RTF'
+
+ # Is the content-type a plain text?
+ if remote_content_type and remote_content_type.startswith('text/plain'):
+ return 'TXT'
+
+ # Is the content-type a stream of data?
+ if remote_content_type and remote_content_type.startswith('application/octet-stream'):
+        try:
+            # Take the Content-Disposition header from the HEAD response
+            content_disposition = response.headers.get('Content-Disposition', '')
+            # Take the filename field from the Content-Disposition header
+            content_disposition = re.search(
+                r'filename\s*=\s*"(.*)"', content_disposition)
+        except:
+            return 'FAILED'
+ # Is the file a PDF?
+ if content_disposition and \
+ any([splitext(group.lower())[1].endswith('pdf') \
+ for group in content_disposition.groups()]):
+ return 'PDF'
+ if content_disposition and \
+ any([splitext(group.lower())[1].endswith('rtf') \
+ for group in content_disposition.groups()]):
+ return 'RTF'
+ # Is the file a TXT?
+ if content_disposition and \
+ any([splitext(group.lower())[1].endswith('txt') \
+ for group in content_disposition.groups()]):
+ return 'TXT'
+ # Is the file something else?
+ else:
+ return 'OTHER'
+
+ # Is the content-type an HTML?
+ if remote_content_type and remote_content_type.startswith('text/html'):
+ return 'HTML'
+
+ # Is the content-type something else?
+ return 'OTHER'
+
+async def get_content_from_url(url: str, requests_session: ClientSession) -> bytes:
+ """
+ Download the document from the URL provided, store it locally and return it
+ """
+
+ try:
+ # Download the content from the URL
+ response = await requests_session.request(
+ method = 'GET',
+ url = url,
+ headers = {'User-Agent': USER_AGENT},
+ allow_redirects = True,
+ ssl = False)
+ assert response.status == 200
+ except:
+ text_bytes = b''
+ else:
+ # Read the downloaded content
+ try:
+ text_bytes = await response.read()
+ except:
+ text_bytes = b''
+
+ # Return the content
+ return text_bytes
+
+async def get_text_from_txt(url: str, file_path: str, requests_session: ClientSession) -> str:
+ """
+ Extract the text from the TXT file provided (or downloaded from the URL provided)
+ """
+
+ if os.path.exists(file_path):
+ with open(file_path, 'rb') as f_in:
+ text_bytes = f_in.read()
+ else:
+ text_bytes = await get_content_from_url(
+ url = url,
+ requests_session = requests_session)
+ text = text_bytes.decode(errors='ignore')
+
+ return text
+
+async def get_text_from_pdf(url: str, file_path: str, requests_session: ClientSession, use_ocr = False) -> str:
+ """
+ Extract the text from the PDF file provided (or downloaded from the URL provided)
+ """
+
+ if os.path.exists(file_path):
+ with open(file_path, 'rb') as f_in:
+ text_bytes = f_in.read()
+ else:
+ text_bytes = await get_content_from_url(
+ url = url,
+ requests_session = requests_session)
+
+ if use_ocr:
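+        # OCR path: render each page as an image with pdf2image and
+        # extract its text with Tesseract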
+ try:
+ pdf_parser = PDFParser(BytesIO(text_bytes))
+ pdf = PDFDocument(pdf_parser)
+ n_pages = pdf.catalog['Pages'].resolve()['Count']
+ # Analyze the document only if it is shorter than 30 pages
+ if n_pages<30:
+ pages = pdf2image.convert_from_bytes(text_bytes, grayscale = True)
+
+ text = ''
+ for page in pages:
+ page = pytesseract.image_to_string(page, lang = 'eng')
+ text += page
+ except:
+ text = ''
+ else:
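+        # Standard path: extract the embedded text layer with pdfminer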
+ try:
+ text = pdfminer.extract_text(BytesIO(text_bytes))
+ except:
+ text = ''
+
+ return text
+
+async def get_text_from_rtf(url: str, file_path: str, requests_session: ClientSession) -> str:
+ """
+ Extract the text from the RTF file provided (or downloaded from the URL provided)
+ """
+
+ if os.path.exists(file_path):
+ with open(file_path, 'rb') as f_in:
+ text_bytes = f_in.read()
+ else:
+ text_bytes = await get_content_from_url(
+ url = url,
+ requests_session = requests_session)
+
+ try:
+ text = text_bytes.decode(errors='ignore')
+ text = rtf_to_text(text)
+ except:
+ text = ''
+
+ return text
+
+async def get_text_from_html(url: str, file_path: str, requests_session: ClientSession) -> Tuple[str, List[str]]:
+ """
+ Extract the text from the body of the document,
+ using the local version of the website (or try to create one)
+ """
+
+ html_path = file_path.replace('.pdf', '.html')
+
+ # Use the, previously stored, local HTML version of the URL, if exists
+ try:
+ if html_path and os.path.exists(html_path):
+ with open(html_path, 'r') as f_in:
+ html_soup = beautiful_soup(f_in, 'html5lib')
+ else:
+ text_bytes = await get_content_from_url(
+ url = url,
+ requests_session = requests_session)
+ html_soup = beautiful_soup(text_bytes, 'html5lib')
+ except:
+ text = ''
+ else:
+ try:
+ # Remove