
Merge pull request #19 from TheScienceMuseum/develop
v1.0.0
kdutia authored May 20, 2021
2 parents 415afab + 65f2824 commit 7447cd4
Showing 10 changed files with 68 additions and 36 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -96,7 +96,7 @@ ipython_config.py
 # pyenv
 # For a library or package, you might want to ignore these files since the code is
 # intended to run in multiple environments; otherwise, check them in:
-# .python-version
+.python-version
 
 # pipenv
 # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
@@ -1,15 +1,15 @@
 repos:
 - repo: https://github.com/ambv/black
-  rev: stable
+  rev: 21.5b1
   hooks:
   - id: black
-    language_version: python3.7
+    language_version: python3
 - repo: https://gitlab.com/pycqa/flake8
-  rev: 3.7.9
+  rev: 3.9.2
   hooks:
   - id: flake8
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v3.1.0
+  rev: v4.0.1
   hooks:
   - id: check-json
   - id: check-merge-conflict
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,10 @@

 All notable changes documented below.
 
+## 1.0.0
+- **enhancement (breaking change):** properties are now passed as a whitespace-separated list rather than a comma-separated one. They can also be supplied via a config file by giving the `--properties` option the path to an existing file.
+- **stability improvements:** `elasticsearch.helpers.streaming_bulk` is now used instead of `elasticsearch.helpers.parallel_bulk`, due to memory-usage issues with the latter. Bulk load now retries on timeout.
+
 ## 0.3.7
 - **fix:** reading from JSON dump forces utf-8
 ## 0.3.6
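
The stability change is easiest to see in isolation. The sketch below shows the `streaming_bulk` pattern this release adopts; it is illustrative only: the index name and documents are invented here, not taken from the package's code.

# Minimal sketch of the streaming_bulk pattern adopted in 1.0.0.
# Illustrative only: "wd-demo" and the documents are invented.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

es = Elasticsearch(max_retries=100, retry_on_timeout=True)

def generate_actions():
    # A lazy generator: documents are produced one at a time, so a whole
    # dump never has to sit in memory at once.
    for i in range(10_000):
        yield {"_id": i, "title": f"entity {i}"}

# streaming_bulk consumes the generator chunk by chunk and yields one
# (ok, result) pair per document; parallel_bulk instead fans chunks out to
# a thread pool and an internal queue, which is where the memory issues arose.
for ok, result in streaming_bulk(
    client=es,
    index="wd-demo",
    actions=generate_actions(),
    chunk_size=1000,
    max_retries=3,  # per-chunk retries, alongside the client-level retry_on_timeout
):
    if not ok:
        print(result)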
4 changes: 2 additions & 2 deletions README.md
@@ -9,7 +9,7 @@ Simple CLI tools to load a subset of Wikidata into Elasticsearch. Part of the [H
 - [Loading from Wikidata dump (.ndjson)](#loading-from-wikidata-dump-ndjson)
 - [Loading from SPARQL query](#loading-from-sparql-query)
 - [Temporary side effects](#temporary-side-effects)
 
+</br>
 
 ![PyPI - Downloads](https://img.shields.io/pypi/dm/elastic-wikidata)
@@ -62,7 +62,7 @@ A full list of options can be found with `ew --help`, but the following are like

 - `--index/-i`: the index name to push to. If not specified at runtime, elastic-wikidata will prompt for it
 - `--limit/-l`: limit the number of records pushed into ES. You might want to use this for a small trial run before importing the whole thing.
-- `--properties/-prop`: pass a comma-separated list of properties to include in the ES index. E.g. *p31,p21*.
+- `--properties/-prop`: a whitespace-separated list of properties to include in the ES index, e.g. *'p31 p21'*, or the path to a text file containing newline-separated properties, e.g. [this one](./pids.sample.cfg).
 - `--language/-lang`: [Wikimedia language code](https://www.wikidata.org/wiki/Help:Wikimedia_language_codes/lists/all). Only one supported at this time.
 
 ### Loading from Wikidata dump (.ndjson)
14 changes: 10 additions & 4 deletions cli.py
@@ -1,5 +1,6 @@
 from elastic_wikidata import dump_to_es, sparql_to_es
 from elastic_wikidata.config import runtime_config
+import os
 import click
 from configparser import ConfigParser
 
@@ -44,7 +45,7 @@
"--properties",
"-prop",
type=str,
help="One or more Wikidata property e.g. p31 or p31,p21. Not case-sensitive",
help="One or more Wikidata property e.g. 'p31' or 'p31 p21'. A path to a file containing newline-separated properties can also be passed. Not case-sensitive",
)
@click.option(
"--timeout",
@@ -56,7 +57,7 @@
 @click.option(
     "--disable_refresh/--no_disable_refresh",
     "-dr/-ndr",
-    help="Whether to disable CPU-intensive refresh on load. Defaults to True. Recommended to leave this on for low-resource machines or large datasets.",
+    help="Whether to disable Elasticsearch's (CPU-intensive) refresh during data load. Defaults to True. Recommended to leave this on for low-resource machines or large datasets.",
     default=True,
 )
 def main(
@@ -117,7 +118,12 @@ def main(
     if language:
        kwargs["lang"] = language
     if properties:
-        kwargs["properties"] = properties.split(",")
+        if os.path.exists(properties):
+            with open(properties, "r") as f:
+                kwargs["properties"] = f.read().splitlines()
+        else:
+            kwargs["properties"] = properties.split()
+
     if disable_refresh:
         kwargs["disable_refresh_on_index"] = disable_refresh

@@ -152,7 +158,7 @@ def load_from_sparql(path, es_credentials, index, limit, page_size=100, **kwargs
         query = f.read()
 
     # limit is used when getting list of entities
-    print(f"Getting entities from SPARQL query")
+    print("Getting entities from SPARQL query")
     entity_list = sparql_to_es.get_entities_from_query(
         query, page_size=100, limit=limit
     )
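
For reference, the new `--properties` handling can be exercised on its own. The helper below is a hedged sketch mirroring the branch added to `main()` above; `resolve_properties` is an illustrative name, not a function in the package.

# Hedged sketch mirroring the new --properties handling in cli.py above.
# resolve_properties is an illustrative name, not part of the package.
import os

def resolve_properties(value: str) -> list:
    """Return a list of property IDs from either a whitespace-separated
    string ('p31 p21') or the path to a file with one ID per line."""
    if os.path.exists(value):
        # e.g. --properties pids.sample.cfg
        with open(value, "r") as f:
            return f.read().splitlines()
    # e.g. --properties 'p31 p21'
    return value.split()

assert resolve_properties("p31 p21") == ["p31", "p21"]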
54 changes: 31 additions & 23 deletions elastic_wikidata/dump_to_es.py
@@ -2,7 +2,7 @@
 from itertools import islice
 from tqdm.auto import tqdm
 from elasticsearch import Elasticsearch
-from elasticsearch.helpers import parallel_bulk
+from elasticsearch.helpers import streaming_bulk
 from typing import Union
 from elastic_wikidata.wd_entities import (
     get_entities,
@@ -87,11 +87,16 @@ def start_elasticsearch(self):
                     self.es_credentials["ELASTICSEARCH_USER"],
                     self.es_credentials["ELASTICSEARCH_PASSWORD"],
                 ),
+                max_retries=100,
+                retry_on_timeout=True,
             )
         else:
             # run on localhost
             print("Connecting to Elasticsearch on localhost")
-            self.es = Elasticsearch()
+            self.es = Elasticsearch(
+                max_retries=100,
+                retry_on_timeout=True,
+            )
 
         mappings = {
             "mappings": {
@@ -122,28 +127,31 @@ def dump_to_es(self):
         elif self.entities:
             action_generator = self.generate_actions_from_entities()
 
-        for ok, action in tqdm(
-            parallel_bulk(
-                client=self.es,
-                index=self.index_name,
-                actions=action_generator,
-                chunk_size=self.config["chunk_size"],
-                queue_size=self.config["queue_size"],
-            ),
-        ):
-            if not ok:
-                print(action)
-                errors.append(action)
-            successes += ok
+        try:
+            for ok, action in tqdm(
+                streaming_bulk(
+                    client=self.es,
+                    index=self.index_name,
+                    actions=action_generator,
+                    chunk_size=self.config["chunk_size"],
+                    # queue_size=self.config["queue_size"],
+                    max_retries=3,
+                ),
+            ):
+                if not ok:
+                    print(action)
+                    errors.append(action)
+                successes += ok
 
-        if self.disable_refresh_on_index:
-            # reset back to default
-            print("Refresh interval set back to default of 1s.")
-            self.es.indices.put_settings({"index": {"refresh_interval": "1s"}})
+        finally:
+            if self.disable_refresh_on_index:
+                # reset back to default
+                print("Refresh interval set back to default of 1s.")
+                self.es.indices.put_settings({"index": {"refresh_interval": "1s"}})
 
     def process_doc(self, doc: dict) -> dict:
         """
-        Processes a single document from the JSON dump, returning a filtered version of that document. 
+        Processes a single document from the JSON dump, returning a filtered version of that document.
         """
 
         lang = self.wiki_options["lang"]
@@ -153,8 +161,8 @@ def process_doc(self, doc: dict) -> dict:

     def generate_actions_from_dump(self):
         """
-        Generator to yield a processed document from the Wikidata JSON dump. 
-        Each line of the Wikidata JSON dump is a separate document. 
+        Generator to yield a processed document from the Wikidata JSON dump.
+        Each line of the Wikidata JSON dump is a separate document.
         """
         with open(self.dump_path, "r", encoding="utf-8") as f:
             objects = (json.loads(line) for line in f)
@@ -170,7 +178,7 @@

     def generate_actions_from_entities(self):
         """
-        Generator to yield processed document from list of entities. Calls are made to 
+        Generator to yield processed document from list of entities. Calls are made to
         wbgetentities API with page size of 50 to retrieve documents.
         """

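
The try/finally wrapper added above guarantees the refresh interval is restored even if the bulk load fails part-way. In isolation the pattern looks like this: a sketch assuming a local cluster, with "wd-demo" as an illustrative index name.

# Sketch of the disable-refresh-while-loading pattern used above
# (assumes a local cluster; "wd-demo" is an illustrative index name).
from elasticsearch import Elasticsearch

es = Elasticsearch(max_retries=100, retry_on_timeout=True)

# Disabling refresh avoids CPU-intensive segment refreshes during the load.
es.indices.put_settings({"index": {"refresh_interval": "-1"}}, index="wd-demo")
try:
    ...  # bulk load goes here
finally:
    # Runs even if the load raises, matching the change above.
    es.indices.put_settings({"index": {"refresh_interval": "1s"}}, index="wd-demo")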
3 changes: 3 additions & 0 deletions pids.sample.cfg
@@ -0,0 +1,3 @@
+P31
+P279
+P18
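
Read the way cli.py now reads property files, the sample above resolves to a plain list of PIDs. A quick illustrative check, run from the repo root:

# Illustrative check: the sample file parses to a list of PIDs.
with open("pids.sample.cfg", "r") as f:
    print(f.read().splitlines())  # ['P31', 'P279', 'P18']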
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
+click==7.1.2
+elasticsearch==7.8.1
+SPARQLWrapper==1.8.5
+tqdm>=4.48.2
+requests==2.24.0
6 changes: 6 additions & 0 deletions requirements_dev.txt
@@ -0,0 +1,6 @@
+pre-commit
+black
+pytest
+pylint
+flake8
+jupyterlab
4 changes: 2 additions & 2 deletions setup.py
@@ -5,13 +5,13 @@

 setuptools.setup(
     name="elastic-wikidata",
-    version="0.3.7",
+    version="1.0.0",
     author="Science Museum Group",
     description="elastic-wikidata",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/TheScienceMuseum/elastic-wikidata",
-    download_url="https://github.com/TheScienceMuseum/elastic-wikidata/archive/v0.3.2.tar.gz",
+    download_url="https://github.com/TheScienceMuseum/elastic-wikidata/archive/v1.0.0.tar.gz",
 classifiers=[
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",
