Commit

Merge branch 'dev' into kw-pipeline-notebook-merge

shaunhutch committed Jun 21, 2023
2 parents 39573c1 + 2aa5079 commit 55bf95d
Showing 64 changed files with 9,564 additions and 72 deletions.
93 changes: 93 additions & 0 deletions .dockerignore
@@ -0,0 +1,93 @@
# Git
.git
.gitignore

# CI
.codeclimate.yml
.travis.yml
.taskcluster.yml

# Docker
docker-compose.yml
.docker

# Byte-compiled / optimized / DLL files
__pycache__/
*/__pycache__/
*/*/__pycache__/
*/*/*/__pycache__/
*.py[cod]
*/*.py[cod]
*/*/*.py[cod]
*/*/*/*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Virtual environment
.env/
.venv/
venv/

# PyCharm
.idea

# Python mode for VIM
.ropeproject
*/.ropeproject
*/*/.ropeproject
*/*/*/.ropeproject

# Vim swap files
*.swp
*/*.swp
*/*/*.swp
*/*/*/*.swp
38 changes: 19 additions & 19 deletions .github/workflows/pull-request-testing.yml
@@ -17,22 +17,22 @@ jobs:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.10
        uses: actions/setup-python@v3
        with:
          python-version: "3.10"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install flake8 pytest
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with pytest
        run: |
          pytest
5 changes: 5 additions & 0 deletions .gitignore
@@ -1,3 +1,8 @@
# ignore files in models folder but keep .gitkeep
models/ner/*
results/ner/*
!.gitkeep

# exclude all txt files in data
data/**/*.txt
# include all json files in data
Binary file added assets/data-review-tool.png
Binary file added assets/ffossils-logo-text.png
Binary file added assets/hugging-face-metaextractor.png
Binary file added assets/project-flow-diagram.png
29 changes: 29 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,29 @@
version: "3.9"
services:
  data-review-tool:
    image: metaextractor-data-review-tool:v0.0.1
    build:
      dockerfile: ./docker/data-review-tool/Dockerfile
      context: .
    ports:
      - "8050:8050"
    volumes:
      - ./data/data-review-tool:/MetaExtractor/data/data-review-tool
  entity-extraction-pipeline:
    image: metaextractor-entity-extraction-pipeline:v0.0.2
    build:
      dockerfile: ./docker/entity-extraction-pipeline/Dockerfile
      context: .
      args:
        HF_NER_MODEL_NAME: "roberta-finetuned-v3"
        SPACY_NER_MODEL_NAME: "spacy-transformer-v3"
    ports:
      - "5000:5000"
    volumes:
      - ./data/entity-extraction/raw/original_files/:/inputs/
      - ./data/entity-extraction/processed/processed_articles/:/outputs/
    environment:
      - USE_NER_MODEL_TYPE=huggingface
      - LOG_OUTPUT_DIR=/outputs/
      - MAX_SENTENCES=20
      - MAX_ARTICLES=1
24 changes: 24 additions & 0 deletions docker/data-review-tool/Dockerfile
@@ -0,0 +1,24 @@
# Use the official Python base image with your desired version
FROM python:3.10

# Copy the requirements.txt file to the working directory
COPY ./docker/data-review-tool/requirements.txt .

# Install the Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

RUN git clone https://github.com/NeotomaDB/MetaExtractor

WORKDIR MetaExtractor/

RUN git switch dev

# Expose the port your Dash app is running on
EXPOSE 8050

# Set the entrypoint command to run your Dash app
#CMD ["python", "src/data_review_tool/app.py"]

ENTRYPOINT python src/data_review_tool/app.py

# VOLUME [ "/MetaExtractor/data/data-review-tool" ]
33 changes: 33 additions & 0 deletions docker/data-review-tool/README.md
@@ -0,0 +1,33 @@
# Finding Fossils - Data Review Tool Docker Image

This Docker image contains `Finding Fossils`, a data review tool built with Dash in Python. It is used to visualize the outputs of the models and verify the extracted entities for inclusion in the Neotoma Database.

## Docker Compose Setup

First build the Docker image, which installs the required dependencies; the tool can then be run using `docker-compose` as follows:
```bash
docker-compose build
docker-compose up data-review-tool
```

This is the basic docker compose configuration for running the image.

```yaml
version: "3.9"
services:
  data-review-tool:
    build:
      ...
    ports:
      - "8050:8050"
    volumes:
      - ./data/data-review-tool:/MetaExtractor/data/data-review-tool
```
### Input
The expected inputs are mounted into the newly created container as volumes and should be placed in the `data/data-review-tool` folder. The artifacts required by the data review tool to verify a batch of processed articles are listed below (a quick inspection sketch follows the list):
- A parquet file containing the outputs from the article relevance prediction component.
- A zipped file containing the outputs from the named entity extraction component.
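
As a quick sanity check before starting the tool, both artifacts can be inspected with pandas and the standard library. This is a minimal sketch; the file names under `data/data-review-tool/` are hypothetical placeholders for the actual outputs of your pipeline run:

```python
import zipfile

import pandas as pd

# Hypothetical file names for illustration; substitute the actual outputs
# produced by your pipeline run.
relevance = pd.read_parquet("data/data-review-tool/article_relevance.parquet")
print(relevance.shape)  # one row per processed article

# List the per-article entity extraction outputs bundled in the zip file.
with zipfile.ZipFile("data/data-review-tool/entity_extraction_output.zip") as zf:
    print(zf.namelist())
```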

### Output
Once the articles have been verified and the container has been destroyed, the same parquet file referenced in `Input` is updated with the extracted entities (predicted by the model) and the verified entities (corrected by the data steward).
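
After a review session, the same file can be re-read to confirm the update. A minimal sketch; the column filter below is purely illustrative, since the actual schema is defined by the pipeline:

```python
import pandas as pd

# Re-read the parquet file that the tool updated in place; the "entit" filter
# is an illustrative guess at column naming, not the project's actual schema.
reviewed = pd.read_parquet("data/data-review-tool/article_relevance.parquet")
print(reviewed.filter(like="entit").columns.tolist())
```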
8 changes: 8 additions & 0 deletions docker/data-review-tool/requirements.txt
@@ -0,0 +1,8 @@
dash==2.9.3
dash_bootstrap_components==1.4.1
dash_iconify==0.1.2
dash_mantine_components==0.12.1
numpy==1.24.3
pandas==1.5.3
plotly==5.14.1
seaborn==0.12.2
47 changes: 47 additions & 0 deletions docker/entity-extraction-pipeline/Dockerfile
@@ -0,0 +1,47 @@
# Use the official Python 3.10 image as the base image
FROM python:3.10

# Set the working directory inside the container
WORKDIR /app/

# Copy the requirements file to the container
COPY docker/entity-extraction-pipeline/requirements.txt .

# Install the required Python packages
RUN pip install --no-cache-dir -r requirements.txt
RUN python -m nltk.downloader stopwords

# Copy the entire repository folder into the container
COPY src ./src

# Build args
ARG HF_NER_MODEL_NAME
ARG SPACY_NER_MODEL_NAME

# Set env variables for when running the container
ENV HF_NER_MODEL_NAME=${HF_NER_MODEL_NAME}
ENV SPACY_NER_MODEL_NAME=${SPACY_NER_MODEL_NAME}
ENV USE_NER_MODEL_TYPE=huggingface
ENV MAX_ARTICLES=-1
ENV MAX_SENTENCES=-1

# Copy in the models defined by the HF_NER_MODEL_NAME and SPACY_NER_MODEL_NAME build args from the models folder
COPY models/ner/${HF_NER_MODEL_NAME} ./models/ner/${HF_NER_MODEL_NAME}
COPY models/ner/${SPACY_NER_MODEL_NAME} ./models/ner/${SPACY_NER_MODEL_NAME}

# non-root user control inspired from here: https://stackoverflow.com/questions/66349101/docker-non-root-user-does-not-have-writing-permissions-when-using-volumes
# Create a non-root user that owns the input/outputs directory by default
RUN useradd -r extraction-user # no specific user ID
RUN mkdir /inputs && chown extraction-user /inputs
RUN mkdir /outputs && chown extraction-user /outputs
# Mount the "inputs" and "outputs" folders as volumes
VOLUME ["/inputs", "/outputs"]

# Set the entry point and command to run the script
USER extraction-user
RUN ls -alp /app
ENTRYPOINT python src/pipeline/entity_extraction_pipeline.py \
--article_text_path /inputs/ \
--output_path /outputs/ \
--max_articles ${MAX_ARTICLES} \
--max_sentences ${MAX_SENTENCES}
46 changes: 46 additions & 0 deletions docker/entity-extraction-pipeline/README.md
@@ -0,0 +1,46 @@
# Meta Extractor Entity Extraction Pipeline Docker Image

This docker image contains the models and code required to run entity extraction from research articles on the xDD system. It assumes the following:
1. The raw text is input in the `nlp352` TSV format, with either a single article per file or multiple articles distinguished by GDD ID
    - like this sample data from xDD: [Link to Sample Data](https://github.com/UW-xDD/xdd-docker-recipe/tree/master/sample_data/nlp352) (a minimal loading sketch follows this list)
2. The raw input data is mounted as a volume to the container folder `/inputs/`
3. The expected output location is mounted as a volume to the container folder `/outputs/`
4. A single JSON file per article is exported into the output folder along with a `.log` file for the processing run.
5. An environment variable `LOG_OUTPUT_DIR` is set to the path of the output folder. This is used to write the log file. Default is the directory from which the docker container is run.
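
A minimal sketch of loading one nlp352-style input file, assuming only that it is tab-separated; the column layout is defined by xDD, so consult the linked sample data for the authoritative schema (the file path below is a hypothetical example):

```python
import pandas as pd

# nlp352 files are tab-separated; column names are not standardized here,
# so we read positionally. The GDD ID is assumed to be the first column.
df = pd.read_csv("inputs/sample_article.tsv", sep="\t", header=None)

# When a file bundles multiple articles, split them by GDD ID.
for gdd_id, article in df.groupby(df.columns[0]):
    print(gdd_id, len(article))
```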

## Additional Options Enabled by Environment Variables

The following environment variables can be set to change the behavior of the pipeline (a usage sketch follows the list):
- `USE_NER_MODEL_TYPE`: This variable can be set to `spacy` or `huggingface` to change the NER model used. The default is `huggingface`. This will be used to run batches with each model to evaluate final performance.
- `MAX_SENTENCES`: This variable can be set to a number to limit the number of sentences processed per article. This is useful for testing and debugging. The default is `-1` which means no limit.
- `MAX_ARTICLES`: This variable can be set to a number to limit the number of articles processed. This is useful for testing and debugging. The default is `-1` which means no limit.
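
A sketch of how these variables might be read inside the container; this is illustrative only, not the pipeline's actual code, with defaults mirroring the descriptions above:

```python
import os

# Defaults mirror the README: huggingface model type and no processing limits.
model_type = os.environ.get("USE_NER_MODEL_TYPE", "huggingface")
max_sentences = int(os.environ.get("MAX_SENTENCES", "-1"))  # -1 means no limit
max_articles = int(os.environ.get("MAX_ARTICLES", "-1"))    # -1 means no limit
print(model_type, max_sentences, max_articles)
```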

## Sample Docker Run & Compose Setup

Below is a sample docker run command for running the image:
- the `$(id -u)` is used to run the docker container as the current user so that the output files are not owned by root
- the `LOG_OUTPUT_DIR="../outputs/"` value differs from the docker compose setup because it is relative to the working directory, which for `docker run` starts in the `/app` folder
- for Git Bash on Windows, `/${PWD}` is used to get the current directory; the leading forward slash is important to produce the correct path
```bash
docker run -u $(id -u) -p 5000:5000 -v /${PWD}/data/entity-extraction/raw/original_files/:/inputs/ -v /${PWD}/data/entity-extraction/processed/processed_articles/:/outputs/ --env LOG_OUTPUT_DIR="../outputs/" metaextractor-entity-extraction-pipeline:v0.0.2
```

Below is a sample docker compose configuration for running the image:
```yaml
version: "0.0.1"
services:
entity-extraction-pipeline:
image: metaextractor-entity-extraction-pipeline:v0.0.1
build:
...
ports:
- "5000:5000"
volumes:
- ./data/raw/:/app/inputs/
- ./data/processed/:/app/outputs/
environment:
- USE_NER_MODEL_TYPE=huggingface
- LOG_OUTPUT_DIR=/app/outputs/
- MAX_SENTENCES=20
- MAX_ARTICLES=1
```
14 changes: 14 additions & 0 deletions docker/entity-extraction-pipeline/requirements.txt
@@ -0,0 +1,14 @@
# python version 3.10
pandas==2.0.1
pytest~=7.3
seaborn~=0.12
seqeval==1.2.2
nltk==3.8.1
spacy==3.5.3
docopt-ng~=0.8
transformers~=4.24
numpy~=1.23
python-dotenv~=1.0
tqdm~=4.65
torch~=1.12
spacy-transformers~=1.1
Empty file added models/.gitkeep
Empty file.
Empty file added models/ner/.gitkeep
Empty file.