Skip to content

Commit

Permalink
add control env variables, update README
Browse files Browse the repository at this point in the history
  • Loading branch information
lunakv committed Mar 5, 2023
1 parent 9e29f20 commit c1b0744
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 11 deletions.
6 changes: 6 additions & 0 deletions .env_EXAMPLE
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ DB_PASS=password
DB_HOST=localhost
DB_DATABASE=academy_ruins

# Tika configuration (required for parsing MTRs)
USE_TIKA=1
# By default, the parser downloads and locally runs a Tika server as needed (this requires Java)
# Uncomment (and adjust) the following line only if you want to connect to an existing Tika server
# TIKA_URL=http://localhost:9998

# Pushover configuration for notifications
USE_PUSHOVER=0
PUSHOVER_APP_TOKEN=
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ It is used to power the [Academy Ruins](https://github.com/lunakv/academyruins)
- Python 3.10 or later
- PostgreSQL 14
- (optional) [Pushover](https://pushover.net/) account
- (recommended) Working Java installation (needed to run a local Tika server for parsing MTRs)

### Installation
1. Install the [Poetry](https://python-poetry.org/docs/#installation) package manager
Expand Down
13 changes: 12 additions & 1 deletion app/parsing/mtr/extract_mtr.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import json
import os
import re
from dataclasses import asdict
from pathlib import Path

from dotenv import load_dotenv
from tika import parser

from app.utils.models import MtrSubsection, MtrAuxiliarySection, MtrNumberedSection
Expand Down Expand Up @@ -139,7 +141,15 @@ def build_structure(chunks: [MtrSubsection]):


def extract(filepath: Path | str) -> [dict]:
content = parser.from_file(str(filepath))["content"]
if os.environ.get("USE_TIKA") != "1":
return None

args = [str(filepath)]
tika_server = os.environ.get("TIKA_URL")
if tika_server:
args.append(tika_server)

content = parser.from_file(*args)["content"]
content = remove_page_nums(trim_content(content))
chunks = split_into_chunks(content)

Expand All @@ -155,6 +165,7 @@ def extract(filepath: Path | str) -> [dict]:


if __name__ == "__main__":
    # Manual entry point: parse ./in.pdf and write the result to ./out.json.
    #
    # Let python-dotenv search for the project's .env file instead of pointing
    # at one developer's home directory; a hard-coded absolute path is simply
    # not found on any other machine and no variables get loaded.
    load_dotenv()
    parsed = extract("./in.pdf")
    if parsed is None:
        # extract() returns None when USE_TIKA is not "1"; stop with a clear
        # message rather than silently writing `null` to the output file.
        raise SystemExit("extract() returned no data (is USE_TIKA=1 set in the environment or .env?)")
    with open("./out.json", "w", encoding="utf-8") as file:
        json.dump(parsed, file)
21 changes: 11 additions & 10 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,25 @@ services:
ports:
- "8000:80"
env_file:
# dotenv isn't run in production
# load_dotenv isn't run in production
- .env
# environment:
# # overrides for values set in .env, if required
# ADMIN_KEY: "super-secret-value"
# DB_USER: server
# DB_PASS: password
# DB_HOST: db
# DB_DATABASE: academy_ruins
environment:
# overrides for values set in .env, if required
USE_TIKA: 1
TIKA_URL: http://tika:9998
volumes:
- ./app:/code/app:rw
- ./generated:/code/app/resources/generated:rw


db:
image: postgres
restart: always
ports:
- 5432:5432
environment:
POSTGRES_PASSWORD: password
POSTGRES_PASSWORD: password

tika:
image: apache/tika
ports:
- 9998:9998

0 comments on commit c1b0744

Please sign in to comment.