
Merge pull request #19 from TheScienceMuseum/develop
v1.0.0
kdutia authored May 20, 2021
2 parents 415afab + 65f2824 commit 7447cd4
Showing 10 changed files with 68 additions and 36 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -96,7 +96,7 @@ ipython_config.py
 # pyenv
 # For a library or package, you might want to ignore these files since the code is
 # intended to run in multiple environments; otherwise, check them in:
-# .python-version
+.python-version
 
 # pipenv
 # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
@@ -1,15 +1,15 @@
 repos:
 - repo: https://github.com/ambv/black
-  rev: stable
+  rev: 21.5b1
   hooks:
   - id: black
-    language_version: python3.7
+    language_version: python3
 - repo: https://gitlab.com/pycqa/flake8
-  rev: 3.7.9
+  rev: 3.9.2
   hooks:
   - id: flake8
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v3.1.0
+  rev: v4.0.1
   hooks:
   - id: check-json
   - id: check-merge-conflict
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,10 @@

 All notable changes documented below.
 
+## 1.0.0
+- **enhancement (breaking change):** properties are now passed as a whitespace-separated list rather than a comma-separated one. They can also be supplied via a config file by giving the `--properties` option the path to an existing file.
+- **stability improvements:** `elasticsearch.helpers.streaming_bulk` is now used instead of `elasticsearch.helpers.parallel_bulk`, due to memory-usage issues with the latter. Bulk load now retries on timeout.
+
 ## 0.3.7
 - **fix:** reading from JSON dump forces utf-8
 ## 0.3.6
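
The stability change is easiest to see in isolation. The sketch below shows the `streaming_bulk` pattern this release adopts; it is illustrative only: the index name and documents are invented here, not taken from the package's code.

# Minimal sketch of the streaming_bulk pattern adopted in 1.0.0.
# Illustrative only: "wd-demo" and the documents are invented.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

es = Elasticsearch(max_retries=100, retry_on_timeout=True)

def generate_actions():
    # A lazy generator: documents are produced one at a time, so a whole
    # dump never has to sit in memory at once.
    for i in range(10_000):
        yield {"_id": i, "title": f"entity {i}"}

# streaming_bulk consumes the generator chunk by chunk and yields one
# (ok, result) pair per document; parallel_bulk instead fans chunks out to
# a thread pool and an internal queue, which is where the memory issues arose.
for ok, result in streaming_bulk(
    client=es,
    index="wd-demo",
    actions=generate_actions(),
    chunk_size=1000,
    max_retries=3,  # per-chunk retries, alongside the client-level retry_on_timeout
):
    if not ok:
        print(result)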
4 changes: 2 additions & 2 deletions README.md
@@ -9,7 +9,7 @@ Simple CLI tools to load a subset of Wikidata into Elasticsearch. Part of the [H
 - [Loading from Wikidata dump (.ndjson)](#loading-from-wikidata-dump-ndjson)
 - [Loading from SPARQL query](#loading-from-sparql-query)
 - [Temporary side effects](#temporary-side-effects)
 
+</br>
 
 ![PyPI - Downloads](https://img.shields.io/pypi/dm/elastic-wikidata)
@@ -62,7 +62,7 @@ A full list of options can be found with `ew --help`, but the following are like

 - `--index/-i`: the index name to push to. If not specified at runtime, elastic-wikidata will prompt for it
 - `--limit/-l`: limit the number of records pushed into ES. You might want to use this for a small trial run before importing the whole thing.
-- `--properties/-prop`: pass a comma-separated list of properties to include in the ES index. E.g. *p31,p21*.
+- `--properties/-prop`: a whitespace-separated list of properties to include in the ES index, e.g. *'p31 p21'*, or the path to a text file containing newline-separated properties, e.g. [this one](./pids.sample.cfg).
 - `--language/-lang`: [Wikimedia language code](https://www.wikidata.org/wiki/Help:Wikimedia_language_codes/lists/all). Only one supported at this time.
 
 ### Loading from Wikidata dump (.ndjson)
14 changes: 10 additions & 4 deletions cli.py
@@ -1,5 +1,6 @@
 from elastic_wikidata import dump_to_es, sparql_to_es
 from elastic_wikidata.config import runtime_config
+import os
 import click
 from configparser import ConfigParser
 
@@ -44,7 +45,7 @@
"--properties",
"-prop",
type=str,
help="One or more Wikidata property e.g. p31 or p31,p21. Not case-sensitive",
help="One or more Wikidata property e.g. 'p31' or 'p31 p21'. A path to a file containing newline-separated properties can also be passed. Not case-sensitive",
)
@click.option(
"--timeout",
@@ -56,7 +57,7 @@
 @click.option(
     "--disable_refresh/--no_disable_refresh",
     "-dr/-ndr",
-    help="Whether to disable CPU-intensive refresh on load. Defaults to True. Recommended to leave this on for low-resource machines or large datasets.",
+    help="Whether to disable Elasticsearch's (CPU-intensive) refresh during data load. Defaults to True. Recommended to leave this on for low-resource machines or large datasets.",
     default=True,
 )
 def main(
@@ -117,7 +118,12 @@ def main(
     if language:
        kwargs["lang"] = language
     if properties:
-        kwargs["properties"] = properties.split(",")
+        if os.path.exists(properties):
+            with open(properties, "r") as f:
+                kwargs["properties"] = f.read().splitlines()
+        else:
+            kwargs["properties"] = properties.split()
+
     if disable_refresh:
         kwargs["disable_refresh_on_index"] = disable_refresh

@@ -152,7 +158,7 @@ def load_from_sparql(path, es_credentials, index, limit, page_size=100, **kwargs
         query = f.read()
 
     # limit is used when getting list of entities
-    print(f"Getting entities from SPARQL query")
+    print("Getting entities from SPARQL query")
     entity_list = sparql_to_es.get_entities_from_query(
         query, page_size=100, limit=limit
     )
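
For reference, the new `--properties` handling can be exercised on its own. The helper below is a hedged sketch mirroring the branch added to `main()` above; `resolve_properties` is an illustrative name, not a function in the package.

# Hedged sketch mirroring the new --properties handling in cli.py above.
# resolve_properties is an illustrative name, not part of the package.
import os

def resolve_properties(value: str) -> list:
    """Return a list of property IDs from either a whitespace-separated
    string ('p31 p21') or the path to a file with one ID per line."""
    if os.path.exists(value):
        # e.g. --properties pids.sample.cfg
        with open(value, "r") as f:
            return f.read().splitlines()
    # e.g. --properties 'p31 p21'
    return value.split()

assert resolve_properties("p31 p21") == ["p31", "p21"]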
54 changes: 31 additions & 23 deletions elastic_wikidata/dump_to_es.py
@@ -2,7 +2,7 @@
 from itertools import islice
 from tqdm.auto import tqdm
 from elasticsearch import Elasticsearch
-from elasticsearch.helpers import parallel_bulk
+from elasticsearch.helpers import streaming_bulk
 from typing import Union
 from elastic_wikidata.wd_entities import (
     get_entities,
@@ -87,11 +87,16 @@ def start_elasticsearch(self):
                     self.es_credentials["ELASTICSEARCH_USER"],
                     self.es_credentials["ELASTICSEARCH_PASSWORD"],
                 ),
+                max_retries=100,
+                retry_on_timeout=True,
             )
         else:
             # run on localhost
             print("Connecting to Elasticsearch on localhost")
-            self.es = Elasticsearch()
+            self.es = Elasticsearch(
+                max_retries=100,
+                retry_on_timeout=True,
+            )
 
         mappings = {
             "mappings": {
@@ -122,28 +127,31 @@ def dump_to_es(self):
         elif self.entities:
             action_generator = self.generate_actions_from_entities()
 
-        for ok, action in tqdm(
-            parallel_bulk(
-                client=self.es,
-                index=self.index_name,
-                actions=action_generator,
-                chunk_size=self.config["chunk_size"],
-                queue_size=self.config["queue_size"],
-            ),
-        ):
-            if not ok:
-                print(action)
-                errors.append(action)
-            successes += ok
+        try:
+            for ok, action in tqdm(
+                streaming_bulk(
+                    client=self.es,
+                    index=self.index_name,
+                    actions=action_generator,
+                    chunk_size=self.config["chunk_size"],
+                    # queue_size=self.config["queue_size"],
+                    max_retries=3,
+                ),
+            ):
+                if not ok:
+                    print(action)
+                    errors.append(action)
+                successes += ok
 
-        if self.disable_refresh_on_index:
-            # reset back to default
-            print("Refresh interval set back to default of 1s.")
-            self.es.indices.put_settings({"index": {"refresh_interval": "1s"}})
+        finally:
+            if self.disable_refresh_on_index:
+                # reset back to default
+                print("Refresh interval set back to default of 1s.")
+                self.es.indices.put_settings({"index": {"refresh_interval": "1s"}})
 
     def process_doc(self, doc: dict) -> dict:
         """
-        Processes a single document from the JSON dump, returning a filtered version of that document. 
+        Processes a single document from the JSON dump, returning a filtered version of that document.
         """
 
         lang = self.wiki_options["lang"]
@@ -153,8 +161,8 @@ def process_doc(self, doc: dict) -> dict:

     def generate_actions_from_dump(self):
         """
-        Generator to yield a processed document from the Wikidata JSON dump. 
-        Each line of the Wikidata JSON dump is a separate document. 
+        Generator to yield a processed document from the Wikidata JSON dump.
+        Each line of the Wikidata JSON dump is a separate document.
         """
         with open(self.dump_path, "r", encoding="utf-8") as f:
             objects = (json.loads(line) for line in f)
@@ -170,7 +178,7 @@

     def generate_actions_from_entities(self):
         """
-        Generator to yield processed document from list of entities. Calls are made to 
+        Generator to yield processed document from list of entities. Calls are made to
         wbgetentities API with page size of 50 to retrieve documents.
         """

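
The try/finally wrapper added above guarantees the refresh interval is restored even if the bulk load fails part-way. In isolation the pattern looks like this: a sketch assuming a local cluster, with "wd-demo" as an illustrative index name.

# Sketch of the disable-refresh-while-loading pattern used above
# (assumes a local cluster; "wd-demo" is an illustrative index name).
from elasticsearch import Elasticsearch

es = Elasticsearch(max_retries=100, retry_on_timeout=True)

# Disabling refresh avoids CPU-intensive segment refreshes during the load.
es.indices.put_settings({"index": {"refresh_interval": "-1"}}, index="wd-demo")
try:
    ...  # bulk load goes here
finally:
    # Runs even if the load raises, matching the change above.
    es.indices.put_settings({"index": {"refresh_interval": "1s"}}, index="wd-demo")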
3 changes: 3 additions & 0 deletions pids.sample.cfg
@@ -0,0 +1,3 @@
+P31
+P279
+P18
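
Read the way cli.py now reads property files, the sample above resolves to a plain list of PIDs. A quick illustrative check, run from the repo root:

# Illustrative check: the sample file parses to a list of PIDs.
with open("pids.sample.cfg", "r") as f:
    print(f.read().splitlines())  # ['P31', 'P279', 'P18']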
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
+click==7.1.2
+elasticsearch==7.8.1
+SPARQLWrapper==1.8.5
+tqdm>=4.48.2
+requests==2.24.0
6 changes: 6 additions & 0 deletions requirements_dev.txt
@@ -0,0 +1,6 @@
+pre-commit
+black
+pytest
+pylint
+flake8
+jupyterlab
4 changes: 2 additions & 2 deletions setup.py
@@ -5,13 +5,13 @@

 setuptools.setup(
     name="elastic-wikidata",
-    version="0.3.7",
+    version="1.0.0",
     author="Science Museum Group",
     description="elastic-wikidata",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/TheScienceMuseum/elastic-wikidata",
-    download_url="https://github.com/TheScienceMuseum/elastic-wikidata/archive/v0.3.2.tar.gz",
+    download_url="https://github.com/TheScienceMuseum/elastic-wikidata/archive/v1.0.0.tar.gz",
 classifiers=[
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",
