Commit

Merge branch 'dev' into kw-pipeline-notebook-merge

shaunhutch committed Jun 21, 2023
2 parents 39573c1 + 2aa5079 commit 55bf95d
Showing 64 changed files with 9,564 additions and 72 deletions.
93 changes: 93 additions & 0 deletions .dockerignore
@@ -0,0 +1,93 @@
# Git
.git
.gitignore

# CI
.codeclimate.yml
.travis.yml
.taskcluster.yml

# Docker
docker-compose.yml
.docker

# Byte-compiled / optimized / DLL files
__pycache__/
*/__pycache__/
*/*/__pycache__/
*/*/*/__pycache__/
*.py[cod]
*/*.py[cod]
*/*/*.py[cod]
*/*/*/*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Virtual environment
.env/
.venv/
venv/

# PyCharm
.idea

# Python mode for VIM
.ropeproject
*/.ropeproject
*/*/.ropeproject
*/*/*/.ropeproject

# Vim swap files
*.swp
*/*.swp
*/*/*.swp
*/*/*/*.swp
38 changes: 19 additions & 19 deletions .github/workflows/pull-request-testing.yml
@@ -17,22 +17,22 @@ jobs:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.10
        uses: actions/setup-python@v3
        with:
          python-version: "3.10"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install flake8 pytest
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with pytest
        run: |
          pytest
5 changes: 5 additions & 0 deletions .gitignore
@@ -1,3 +1,8 @@
# ignore files in models folder but keep .gitkeep
models/ner/*
results/ner/*
!.gitkeep

# exclude all txt files in data
data/**/*.txt
# include all json files in data
Binary file added assets/data-review-tool.png
Binary file added assets/ffossils-logo-text.png
Binary file added assets/hugging-face-metaextractor.png
Binary file added assets/project-flow-diagram.png
29 changes: 29 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,29 @@
version: "3.9"
services:
  data-review-tool:
    image: metaextractor-data-review-tool:v0.0.1
    build:
      dockerfile: ./docker/data-review-tool/Dockerfile
      context: .
    ports:
      - "8050:8050"
    volumes:
      - ./data/data-review-tool:/MetaExtractor/data/data-review-tool
  entity-extraction-pipeline:
    image: metaextractor-entity-extraction-pipeline:v0.0.2
    build:
      dockerfile: ./docker/entity-extraction-pipeline/Dockerfile
      context: .
      args:
        HF_NER_MODEL_NAME: "roberta-finetuned-v3"
        SPACY_NER_MODEL_NAME: "spacy-transformer-v3"
    ports:
      - "5000:5000"
    volumes:
      - ./data/entity-extraction/raw/original_files/:/inputs/
      - ./data/entity-extraction/processed/processed_articles/:/outputs/
    environment:
      - USE_NER_MODEL_TYPE=huggingface
      - LOG_OUTPUT_DIR=/outputs/
      - MAX_SENTENCES=20
      - MAX_ARTICLES=1
24 changes: 24 additions & 0 deletions docker/data-review-tool/Dockerfile
@@ -0,0 +1,24 @@
# Use the official Python base image with your desired version
FROM python:3.10

# Copy the requirements.txt file to the working directory
COPY ./docker/data-review-tool/requirements.txt .

# Install the Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

RUN git clone https://github.com/NeotomaDB/MetaExtractor

WORKDIR MetaExtractor/

RUN git switch dev

# Expose the port your Dash app is running on
EXPOSE 8050

# Set the entrypoint command to run your Dash app
#CMD ["python", "src/data_review_tool/app.py"]

ENTRYPOINT python src/data_review_tool/app.py

# VOLUME [ "/MetaExtractor/data/data-review-tool" ]
33 changes: 33 additions & 0 deletions docker/data-review-tool/README.md
@@ -0,0 +1,33 @@
# Finding Fossils - Data Review Tool Docker Image

This Docker image contains `Finding Fossils`, a data review tool built with Dash in Python. It is used to visualize the outputs of the models and verify the extracted entities for inclusion in the Neotoma Database.

## Docker Compose Setup

First build the Docker image, which installs the required dependencies; the tool can then be run using `docker-compose` as follows:
```bash
docker-compose build
docker-compose up data-review-tool
```

This is the basic docker compose configuration for running the image.

```yaml
version: "3.9"
services:
  data-review-tool:
    build:
      ...
    ports:
      - "8050:8050"
    volumes:
      - ./data/data-review-tool:/MetaExtractor/data/data-review-tool
```
### Input
The expected inputs are mounted into the newly created container as volumes and should be placed in the `data/data-review-tool` folder. The artifacts required by the data review tool to verify a batch of processed articles are listed below (a quick inspection sketch follows the list):
- A parquet file containing the outputs from the article relevance prediction component.
- A zipped file containing the outputs from the named entity extraction component.
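
As a quick sanity check before starting the tool, both artifacts can be inspected with pandas and the standard library. This is a minimal sketch; the file names under `data/data-review-tool/` are hypothetical placeholders for the actual outputs of your pipeline run:

```python
import zipfile

import pandas as pd

# Hypothetical file names for illustration; substitute the actual outputs
# produced by your pipeline run.
relevance = pd.read_parquet("data/data-review-tool/article_relevance.parquet")
print(relevance.shape)  # one row per processed article

# List the per-article entity extraction outputs bundled in the zip file.
with zipfile.ZipFile("data/data-review-tool/entity_extraction_output.zip") as zf:
    print(zf.namelist())
```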

### Output
Once the articles have been verified and the container has been destroyed, the same parquet file referenced in `Input` is updated with the extracted entities (predicted by the model) and the verified entities (corrected by the data steward).
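
After a review session, the same file can be re-read to confirm the update. A minimal sketch; the column filter below is purely illustrative, since the actual schema is defined by the pipeline:

```python
import pandas as pd

# Re-read the parquet file that the tool updated in place; the "entit" filter
# is an illustrative guess at column naming, not the project's actual schema.
reviewed = pd.read_parquet("data/data-review-tool/article_relevance.parquet")
print(reviewed.filter(like="entit").columns.tolist())
```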
8 changes: 8 additions & 0 deletions docker/data-review-tool/requirements.txt
@@ -0,0 +1,8 @@
dash==2.9.3
dash_bootstrap_components==1.4.1
dash_iconify==0.1.2
dash_mantine_components==0.12.1
numpy==1.24.3
pandas==1.5.3
plotly==5.14.1
seaborn==0.12.2
47 changes: 47 additions & 0 deletions docker/entity-extraction-pipeline/Dockerfile
@@ -0,0 +1,47 @@
# Use the official Python 3.10 image as the base image
FROM python:3.10

# Set the working directory inside the container
WORKDIR /app/

# Copy the requirements file to the container
COPY docker/entity-extraction-pipeline/requirements.txt .

# Install the required Python packages
RUN pip install --no-cache-dir -r requirements.txt
RUN python -m nltk.downloader stopwords

# Copy the entire repository folder into the container
COPY src ./src

# Build args
ARG HF_NER_MODEL_NAME
ARG SPACY_NER_MODEL_NAME

# Set env variables for when running the container
ENV HF_NER_MODEL_NAME=${HF_NER_MODEL_NAME}
ENV SPACY_NER_MODEL_NAME=${SPACY_NER_MODEL_NAME}
ENV USE_NER_MODEL_TYPE=huggingface
ENV MAX_ARTICLES=-1
ENV MAX_SENTENCES=-1

# Copy in the models defined by the HF_NER_MODEL_NAME and SPACY_NER_MODEL_NAME build args from the models folder
COPY models/ner/${HF_NER_MODEL_NAME} ./models/ner/${HF_NER_MODEL_NAME}
COPY models/ner/${SPACY_NER_MODEL_NAME} ./models/ner/${SPACY_NER_MODEL_NAME}

# non-root user control inspired from here: https://stackoverflow.com/questions/66349101/docker-non-root-user-does-not-have-writing-permissions-when-using-volumes
# Create a non-root user that owns the input/outputs directory by default
RUN useradd -r extraction-user # no specific user ID
RUN mkdir /inputs && chown extraction-user /inputs
RUN mkdir /outputs && chown extraction-user /outputs
# Mount the "inputs" and "outputs" folders as volumes
VOLUME ["/inputs", "/outputs"]

# Set the entry point and command to run the script
USER extraction-user
RUN ls -alp /app
ENTRYPOINT python src/pipeline/entity_extraction_pipeline.py \
--article_text_path /inputs/ \
--output_path /outputs/ \
--max_articles ${MAX_ARTICLES} \
--max_sentences ${MAX_SENTENCES}
46 changes: 46 additions & 0 deletions docker/entity-extraction-pipeline/README.md
@@ -0,0 +1,46 @@
# Meta Extractor Entity Extraction Pipeline Docker Image

This docker image contains the models and code required to run entity extraction from research articles on the xDD system. It assumes the following:
1. The raw text is input in the `nlp352` TSV format, with either a single article per file or multiple articles distinguished by GDD ID
    - like this sample data from xDD: [Link to Sample Data](https://github.com/UW-xDD/xdd-docker-recipe/tree/master/sample_data/nlp352) (a minimal loading sketch follows this list)
2. The raw input data is mounted as a volume to the container folder `/inputs/`
3. The expected output location is mounted as a volume to the container folder `/outputs/`
4. A single JSON file per article is exported into the output folder along with a `.log` file for the processing run.
5. An environment variable `LOG_OUTPUT_DIR` is set to the path of the output folder. This is used to write the log file. Default is the directory from which the docker container is run.
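
A minimal sketch of loading one nlp352-style input file, assuming only that it is tab-separated; the column layout is defined by xDD, so consult the linked sample data for the authoritative schema (the file path below is a hypothetical example):

```python
import pandas as pd

# nlp352 files are tab-separated; column names are not standardized here,
# so we read positionally. The GDD ID is assumed to be the first column.
df = pd.read_csv("inputs/sample_article.tsv", sep="\t", header=None)

# When a file bundles multiple articles, split them by GDD ID.
for gdd_id, article in df.groupby(df.columns[0]):
    print(gdd_id, len(article))
```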

## Additional Options Enabled by Environment Variables

The following environment variables can be set to change the behavior of the pipeline (a usage sketch follows the list):
- `USE_NER_MODEL_TYPE`: This variable can be set to `spacy` or `huggingface` to change the NER model used. The default is `huggingface`. This will be used to run batches with each model to evaluate final performance.
- `MAX_SENTENCES`: This variable can be set to a number to limit the number of sentences processed per article. This is useful for testing and debugging. The default is `-1` which means no limit.
- `MAX_ARTICLES`: This variable can be set to a number to limit the number of articles processed. This is useful for testing and debugging. The default is `-1` which means no limit.
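
A sketch of how these variables might be read inside the container; this is illustrative only, not the pipeline's actual code, with defaults mirroring the descriptions above:

```python
import os

# Defaults mirror the README: huggingface model type and no processing limits.
model_type = os.environ.get("USE_NER_MODEL_TYPE", "huggingface")
max_sentences = int(os.environ.get("MAX_SENTENCES", "-1"))  # -1 means no limit
max_articles = int(os.environ.get("MAX_ARTICLES", "-1"))    # -1 means no limit
print(model_type, max_sentences, max_articles)
```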

## Sample Docker Run & Compose Setup

Below is a sample docker run command for running the image:
- the `$(id -u)` is used to run the docker container as the current user so that the output files are not owned by root
- the `LOG_OUTPUT_DIR="../outputs/"` value differs from the docker compose setup because it is relative to the working directory, which for `docker run` starts in the `/app` folder
- for Git Bash on Windows, `/${PWD}` is used to get the current directory; the leading forward slash is important to produce the correct path
```bash
docker run -u $(id -u) -p 5000:5000 -v /${PWD}/data/entity-extraction/raw/original_files/:/inputs/ -v /${PWD}/data/entity-extraction/processed/processed_articles/:/outputs/ --env LOG_OUTPUT_DIR="../outputs/" metaextractor-entity-extraction-pipeline:v0.0.2
```

Below is a sample docker compose configuration for running the image:
```yaml
version: "0.0.1"
services:
entity-extraction-pipeline:
image: metaextractor-entity-extraction-pipeline:v0.0.1
build:
...
ports:
- "5000:5000"
volumes:
- ./data/raw/:/app/inputs/
- ./data/processed/:/app/outputs/
environment:
- USE_NER_MODEL_TYPE=huggingface
- LOG_OUTPUT_DIR=/app/outputs/
- MAX_SENTENCES=20
- MAX_ARTICLES=1
```
14 changes: 14 additions & 0 deletions docker/entity-extraction-pipeline/requirements.txt
@@ -0,0 +1,14 @@
# python version 3.10
pandas==2.0.1
pytest~=7.3
seaborn~=0.12
seqeval==1.2.2
nltk==3.8.1
spacy==3.5.3
docopt-ng~=0.8
transformers~=4.24
numpy~=1.23
python-dotenv~=1.0
tqdm~=4.65
torch~=1.12
spacy-transformers~=1.1
Empty file added models/.gitkeep
Empty file.
Empty file added models/ner/.gitkeep
Empty file.