
Merge Dev to Main: Updated Release for xDD Team Docker Review #44

Merged
merged 40 commits into main from dev
Jun 19, 2023
Changes from all commits (40 commits)
7225f38
feat: initial NER pipeline framework
tieandrews Jun 6, 2023
281ce72
docs: minor reformatting
tieandrews Jun 6, 2023
844b021
feat: initial data loading functions complete
tieandrews Jun 7, 2023
c86e244
feat: initial commit on functioning pipeline
tieandrews Jun 10, 2023
6b716df
docs: rename of file
tieandrews Jun 10, 2023
8234008
feat: initial commit for hf extraction module
tieandrews Jun 10, 2023
bbf44d4
docs: minor import cleanup
tieandrews Jun 10, 2023
5a5ce3c
tests: initial tests commit
tieandrews Jun 10, 2023
df971a6
tests: sample gdd data for testing
tieandrews Jun 10, 2023
deea64e
feat: shared logging module
tieandrews Jun 10, 2023
231b849
feat: updated log message format
tieandrews Jun 12, 2023
2d6d7df
feat: added max_articles argument for testing
tieandrews Jun 12, 2023
2d7423f
feat: cherry pick spacy entity extraction function
tieandrews Jun 12, 2023
f840409
feat: initial spacy integration with pipeline
tieandrews Jun 13, 2023
1700c2e
bug: spacy model load not working
tieandrews Jun 16, 2023
3905666
feat: added log file output and location definition
tieandrews Jun 16, 2023
319451f
docs: initial dockerignore file
tieandrews Jun 16, 2023
d532d56
bug: remove nltk stopwords download in script
tieandrews Jun 16, 2023
72ce0ee
feat: added env variable model selection
tieandrews Jun 16, 2023
d11229e
feat: initial functioning docker image
tieandrews Jun 17, 2023
e9d127d
feat: docker compose for entity extraction
tieandrews Jun 17, 2023
9d540cb
docs: added comments
tieandrews Jun 17, 2023
ee1c629
feat: initial entity extraction dockerfile
tieandrews Jun 19, 2023
ef4b2bf
feat: initial docker-compose file commit
tieandrews Jun 19, 2023
7b042e1
docs: entity-extraction pipeline README
tieandrews Jun 19, 2023
d64c8f2
feat: made pipeline run with folder of files
tieandrews Jun 19, 2023
aaa4875
Merge branch 'dev' into 30-build-ner-extraction-pipeline
tieandrews Jun 19, 2023
a9b3e10
bug: incorrect reference to df
tieandrews Jun 19, 2023
e301441
Merge branch '30-build-ner-extraction-pipeline' of github.com:Neotoma…
tieandrews Jun 19, 2023
c0626f7
bug: added dotenv to requirements
tieandrews Jun 19, 2023
4d9c87b
bug: tries to load model even if it's not there
tieandrews Jun 19, 2023
0215236
bug: added pytorch and transformers
tieandrews Jun 19, 2023
b87c012
bug: stopwords error, made download quiet
tieandrews Jun 19, 2023
4e49ec8
bug: load journal articles incorrect replace
tieandrews Jun 19, 2023
f96b3ef
feat: added non-root accessible file paths
tieandrews Jun 19, 2023
4a18909
feat: created non-root user permissions
tieandrews Jun 19, 2023
0abac2f
docs: added docker run sample details
tieandrews Jun 19, 2023
6947f16
bug: removed nltk download call
tieandrews Jun 19, 2023
9e12911
bug: move stopwords download inside function
tieandrews Jun 19, 2023
74b0600
Merge pull request #42 from NeotomaDB/30-build-ner-extraction-pipeline
tieandrews Jun 19, 2023
93 changes: 93 additions & 0 deletions .dockerignore
@@ -0,0 +1,93 @@
# Git
.git
.gitignore

# CI
.codeclimate.yml
.travis.yml
.taskcluster.yml

# Docker
docker-compose.yml
.docker

# Byte-compiled / optimized / DLL files
__pycache__/
*/__pycache__/
*/*/__pycache__/
*/*/*/__pycache__/
*.py[cod]
*/*.py[cod]
*/*/*.py[cod]
*/*/*/*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Virtual environment
.env/
.venv/
venv/

# PyCharm
.idea

# Python mode for VIM
.ropeproject
*/.ropeproject
*/*/.ropeproject
*/*/*/.ropeproject

# Vim swap files
*.swp
*/*.swp
*/*/*.swp
*/*/*/*.swp
23 changes: 23 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,23 @@
version: "0.0.1"
services:
entity-extraction-pipeline:
image: metaextractor-entity-extraction-pipeline:v0.0.2
build:
dockerfile: ./docker/entity-extraction-pipeline/Dockerfile
context: .
args:
HF_NER_MODEL_NAME: "roberta-finetuned-v3"
SPACY_NER_MODEL_NAME: "spacy-transformer-v3"
ports:
- "5000:5000"
volumes:
- ./data/entity-extraction/raw/original_files/:/inputs/
- ./data/entity-extraction/processed/processed_articles/:/outputs/
environment:
- USE_NER_MODEL_TYPE=huggingface
- LOG_OUTPUT_DIR=/outputs/
- MAX_SENTENCES=20
- MAX_ARTICLES=1
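Assuming this file sits at the repository root alongside the `data/` folders, the pipeline could then be started with a standard Compose invocation such as:

```bash
# Build the image if needed and run the pipeline service in the foreground
docker compose up --build entity-extraction-pipeline
```

Since the container runs a one-shot pipeline rather than a long-lived service, `docker compose run --rm entity-extraction-pipeline` is a reasonable alternative that removes the container on exit.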



47 changes: 47 additions & 0 deletions docker/entity-extraction-pipeline/Dockerfile
@@ -0,0 +1,47 @@
# Use the official Python 3.10 image as the base image
FROM python:3.10

# Set the working directory inside the container
WORKDIR /app/

# Copy the requirements file to the container
COPY docker/entity-extraction-pipeline/requirements.txt .

# Install the required Python packages
RUN pip install --no-cache-dir -r requirements.txt
RUN python -m nltk.downloader stopwords

# Copy the entire repository folder into the container
COPY src ./src

# Build args
ARG HF_NER_MODEL_NAME
ARG SPACY_NER_MODEL_NAME

# Set env variables for when running the container
ENV HF_NER_MODEL_NAME=${HF_NER_MODEL_NAME}
ENV SPACY_NER_MODEL_NAME=${SPACY_NER_MODEL_NAME}
ENV USE_NER_MODEL_TYPE=huggingface
ENV MAX_ARTICLES=-1
ENV MAX_SENTENCES=-1

# Copy in the models named by the HF_NER_MODEL_NAME and SPACY_NER_MODEL_NAME build args from the models folder
COPY models/ner/${HF_NER_MODEL_NAME} ./models/ner/${HF_NER_MODEL_NAME}
COPY models/ner/${SPACY_NER_MODEL_NAME} ./models/ner/${SPACY_NER_MODEL_NAME}

# non-root user control inspired from here: https://stackoverflow.com/questions/66349101/docker-non-root-user-does-not-have-writing-permissions-when-using-volumes
# Create a non-root user that owns the input/outputs directory by default
RUN useradd -r extraction-user # no specific user ID
RUN mkdir /inputs && chown extraction-user /inputs
RUN mkdir /outputs && chown extraction-user /outputs
# Mount the "inputs" and "outputs" folders as volumes
VOLUME ["/inputs", "/outputs"]

# Set the entry point and command to run the script
USER extraction-user
RUN ls -alp /app
ENTRYPOINT python src/pipeline/entity_extraction_pipeline.py \
--article_text_path /inputs/ \
--output_path /outputs/ \
--max_articles ${MAX_ARTICLES} \
--max_sentences ${MAX_SENTENCES}
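Assuming the build is run from the repository root (the `context: .` in the compose file), an equivalent manual build of this image might look like:

```bash
# Hypothetical manual build; the tag mirrors the one used in docker-compose.yml
docker build \
  -f docker/entity-extraction-pipeline/Dockerfile \
  --build-arg HF_NER_MODEL_NAME=roberta-finetuned-v3 \
  --build-arg SPACY_NER_MODEL_NAME=spacy-transformer-v3 \
  -t metaextractor-entity-extraction-pipeline:v0.0.2 \
  .
```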
46 changes: 46 additions & 0 deletions docker/entity-extraction-pipeline/README.md
@@ -0,0 +1,46 @@
# Meta Extractor Entity Extraction Pipeline Docker Image

This Docker image contains the models and code required to run entity extraction on research articles from the xDD system. It assumes the following:
1. The raw text is input in the `nlp352` TSV format, with either a single article per file or multiple articles denoted by GDD ID
 - for example, this sample data from xDD: [Link to Sample Data](https://github.com/UW-xDD/xdd-docker-recipe/tree/master/sample_data/nlp352)
2. The raw input data is mounted as a volume to the Docker folder `/app/inputs/`.
3. The expected output location is mounted as a volume to the Docker folder `/app/outputs/`.
4. A single JSON file per article is exported into the output folder, along with a `.log` file for the processing run.
5. The environment variable `LOG_OUTPUT_DIR` is set to the path of the output folder; it is used to write the log file. The default is the directory from which the Docker container is run.
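The exact nlp352 column layout is not documented here, but for illustration, a minimal sketch of splitting a multi-article TSV by GDD ID could look like the following (the assumption that the GDD ID sits in the first column is hypothetical, as is the function name):

```python
import csv
from collections import defaultdict

def group_rows_by_gdd_id(tsv_path):
    # Hypothetical sketch: assumes the first TSV column holds the GDD ID;
    # the real nlp352 column layout is not shown in this README
    articles = defaultdict(list)
    with open(tsv_path, newline="") as f:
        for row in csv.reader(f, delimiter="\t"):
            if row:
                articles[row[0]].append(row[1:])
    return dict(articles)
```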

## Additional Options Enabled by Environment Variables

The following environment variables can be set to change the behavior of the pipeline:
- `USE_NER_MODEL_TYPE`: set to `spacy` or `huggingface` to change the NER model used; the default is `huggingface`. This will be used to run batches with each model to evaluate final performance.
- `MAX_SENTENCES`: limits the number of sentences processed per article, which is useful for testing and debugging. The default is `-1`, meaning no limit.
- `MAX_ARTICLES`: limits the number of articles processed, which is useful for testing and debugging. The default is `-1`, meaning no limit.
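For example, to run a quick smoke test with the spaCy model instead of the default, these variables could be overridden at `docker run` time (the volume paths here mirror the compose file and are otherwise placeholders):

```bash
docker run -u $(id -u) \
  -v /${PWD}/data/entity-extraction/raw/original_files/:/inputs/ \
  -v /${PWD}/data/entity-extraction/processed/processed_articles/:/outputs/ \
  --env USE_NER_MODEL_TYPE=spacy \
  --env MAX_ARTICLES=1 \
  --env MAX_SENTENCES=20 \
  metaextractor-entity-extraction-pipeline:v0.0.2
```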

## Sample Docker Run & Compose Setup

Below is a sample `docker run` command for running the image:
- `$(id -u)` runs the container as the current user so that the output files are not owned by root
- `LOG_OUTPUT_DIR="../outputs/"` differs from the Docker Compose setup because it is relative to the working directory, which for `docker run` starts in the `app` folder
- on Git Bash for Windows, `/${PWD}` is used to get the current directory; the leading forward slash is important for producing the correct path
```bash
docker run -u $(id -u) -p 5000:5000 -v /${PWD}/data/entity-extraction/raw/original_files/:/inputs/ -v /${PWD}/data/entity-extraction/processed/processed_articles/:/outputs/ --env LOG_OUTPUT_DIR="../outputs/" metaextractor-entity-extraction-pipeline:v0.0.2
```

Below is a sample docker compose configuration for running the image:
```yaml
version: "0.0.1"
services:
entity-extraction-pipeline:
image: metaextractor-entity-extraction-pipeline:v0.0.1
build:
...
ports:
- "5000:5000"
volumes:
- ./data/raw/:/app/inputs/
- ./data/processed/:/app/outputs/
environment:
- USE_NER_MODEL_TYPE=huggingface
- LOG_OUTPUT_DIR=/app/outputs/
- MAX_SENTENCES=20
- MAX_ARTICLES=1
```
14 changes: 14 additions & 0 deletions docker/entity-extraction-pipeline/requirements.txt
@@ -0,0 +1,14 @@
# python version 3.10
pandas==2.0.1
pytest~=7.3
seaborn~=0.12
seqeval==1.2.2
nltk==3.8.1
spacy==3.5.3
docopt-ng~=0.8
transformers~=4.24
numpy~=1.23
python-dotenv~=1.0
tqdm~=4.65
torch~=1.12
spacy-transformers~=1.1
3 changes: 3 additions & 0 deletions requirements.txt
@@ -11,5 +11,8 @@ scikit-learn~=1.2
 pytest-cov~=4.0
 flake8~=6.0
 docopt-ng~=0.8
+python-dotenv~=1.0
+transformers~=4.24
+torch~=1.12
 # to use the spacy model for baseline NER
 https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl
8 changes: 4 additions & 4 deletions src/entity_extraction/baseline_entity_extraction.py
@@ -10,10 +10,6 @@
 import nltk
 import spacy
 
-# ensure stopwords are downloaded
-nltk.download("stopwords")
-
-from nltk.corpus import stopwords
 
 # ensure that the parent directory is on the path for relative imports
 sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
@@ -54,6 +50,10 @@ def clean_name(name):
         if len(word) > 2 and len(word) <= 25:
             all_taxa_words.append(word)
 
+    # ensure stopwords are downloaded
+    nltk.download("stopwords", quiet=True)
+    from nltk.corpus import stopwords
+
     stop = stopwords.words()
     all_taxa_words = list(set(all_taxa_words))
     all_taxa_words = [word for word in all_taxa_words if word not in stop]
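The diff above moves the NLTK stopwords download out of module import time and into the function that needs it, with `quiet=True` to suppress log noise. As a standalone sketch of that pattern (the function name and signature here are illustrative, not the project's actual API):

```python
def remove_stopwords(words, stop=None):
    # Lazily fetch the NLTK stopword list only when no stop set is supplied,
    # so importing this module never triggers a download
    if stop is None:
        import nltk
        nltk.download("stopwords", quiet=True)
        from nltk.corpus import stopwords
        stop = set(stopwords.words("english"))
    return [w for w in words if w.lower() not in stop]
```

Keeping the download inside the function also makes the dependency easy to bypass in tests by passing an explicit `stop` set.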