Skip to content

Commit

Permalink
chore: Update Dockerfile.unstructured to include system dependencies …
Browse files Browse the repository at this point in the history
…and Python packages (#1169)
  • Loading branch information
shreyaspimpalgaonkar authored Sep 13, 2024
1 parent d19ca35 commit 1bf7a36
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 0 deletions.
54 changes: 54 additions & 0 deletions .github/workflows/build-unst-docker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Manually-triggered workflow that builds the Unstructured service image from
# py/Dockerfile.unstructured for amd64 + arm64 and pushes it to Docker Hub
# under <REGISTRY_BASE>/unst-prod, tagged with the release version and `latest`.
name: Build and Publish Unstructured Docker Image

on:
  workflow_dispatch:

env:
  REGISTRY_BASE: ragtoriches

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      # Needed by the "Determine version" step to parse pyproject.toml.
      - name: Install toml package
        run: pip install toml

      # Exposes two step outputs consumed below: REGISTRY_IMAGE and RELEASE_VERSION.
      # RELEASE_VERSION was previously never written, which made the first image
      # tag expand to "<image>:" (an invalid reference) in the build/verify steps.
      - name: Determine version
        id: version
        run: |
          # NOTE(review): assumes the package metadata lives at py/pyproject.toml
          # (same directory as the Docker build context) — confirm the path.
          RELEASE_VERSION=$(python -c "import toml; d = toml.load('py/pyproject.toml'); print(d.get('project', {}).get('version') or d['tool']['poetry']['version'])")
          if [ -z "$RELEASE_VERSION" ]; then
            echo "::error::Could not determine release version from py/pyproject.toml"
            exit 1
          fi
          echo "REGISTRY_IMAGE=${{ env.REGISTRY_BASE }}/unst-prod" >> "$GITHUB_OUTPUT"
          echo "RELEASE_VERSION=$RELEASE_VERSION" >> "$GITHUB_OUTPUT"

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Docker Auth
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.RAGTORICHES_DOCKER_UNAME }}
          password: ${{ secrets.RAGTORICHES_DOCKER_TOKEN }}

      - name: Build and push image
        uses: docker/build-push-action@v5
        with:
          context: ./py
          file: ./py/Dockerfile.unstructured
          platforms: linux/amd64,linux/arm64
          push: true
          tags: |
            ${{ steps.version.outputs.REGISTRY_IMAGE }}:${{ steps.version.outputs.RELEASE_VERSION }}
            ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest
          provenance: false
          sbom: false

      # Confirms both tags exist in the registry and lists their per-platform manifests.
      - name: Verify manifest
        run: |
          docker buildx imagetools inspect ${{ steps.version.outputs.REGISTRY_IMAGE }}:${{ steps.version.outputs.RELEASE_VERSION }}
          docker buildx imagetools inspect ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest
29 changes: 29 additions & 0 deletions py/Dockerfile.unstructured
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Image for the Unstructured integration service: installs the OCR / document
# conversion toolchain plus unstructured[all-docs], then serves main:app with
# uvicorn on port 7275.
#
# The base is pinned to the Debian bookworm variant so the apt package names and
# the tesseract data path below stay valid when the floating `-slim` tag moves
# to a newer Debian release.
FROM python:3.10-slim-bookworm AS builder

# Install system dependencies (including those needed for Unstructured and OpenCV).
# libgl1 replaces libgl1-mesa-glx, which is only a transitional package and is
# not available in newer Debian releases.
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
        g++ \
        gcc \
        gfortran \
        libffi-dev \
        libgl1 \
        libglib2.0-0 \
        libleptonica-dev \
        libmagic1 \
        libopenblas-dev \
        libreoffice \
        libtesseract-dev \
        musl-dev \
        pandoc \
        pkg-config \
        poppler-utils \
        tesseract-ocr \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# TESSDATA_PREFIX points tesseract at its language data directory;
# the PYTHON* flags disable .pyc files and stdout/stderr buffering.
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

WORKDIR /app

# The all-docs extra already pulls in the base `unstructured` package.
RUN pip install --no-cache-dir "unstructured[all-docs]"

# Run unstructured's model initialization at build time.
# NOTE(review): presumably this bakes model assets into the image so they are
# not fetched on first request — confirm. It runs as root, so adding a non-root
# USER later needs the resulting cache location to be readable by that user.
RUN python -c "from unstructured.partition.model_init import initialize; initialize()"

# Web-serving dependencies for the API in main.py.
RUN pip install --no-cache-dir gunicorn uvicorn fastapi httpx

# main.py must expose the ASGI application object `app` (see CMD).
COPY core/integrations/unstructured/main.py .

EXPOSE 7275

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7275"]

0 comments on commit 1bf7a36

Please sign in to comment.