-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
- Loading branch information
1 parent
18aad34
commit dc9e759
Showing
12 changed files
with
315 additions
and
34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
import logging | ||
import warnings | ||
from pathlib import Path | ||
from typing import Annotated | ||
|
||
import typer | ||
|
||
from docling.datamodel.settings import settings | ||
from docling.models.code_formula_model import CodeFormulaModel | ||
from docling.models.document_picture_classifier import DocumentPictureClassifier | ||
from docling.models.easyocr_model import EasyOcrModel | ||
from docling.models.layout_model import LayoutModel | ||
from docling.models.rapid_ocr_model import RapidOcrModel | ||
from docling.models.table_structure_model import TableStructureModel | ||
|
||
# Silence warning categories raised from third-party modules whose names match
# the given patterns, so they do not clutter the CLI output.
warnings.filterwarnings("ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings("ignore", category=FutureWarning, module="easyocr")

_log = logging.getLogger(__name__)
from rich.console import Console

# One console per output stream: stderr and stdout.
err_console = Console(stderr=True)
console = Console()


# Typer application object on which the commands below are registered.
app = typer.Typer(
    pretty_exceptions_enable=False,
    add_completion=False,
    name="Docling model helper",
)
|
||
|
||
@app.command("download")
def download(
    output_dir: Annotated[
        Path,
        typer.Option(
            ...,
            "-o",
            "--output-dir",
            help="The directory where all the models are downloaded.",
        ),
    ] = settings.cache_dir
    / "models",
    force: Annotated[
        bool, typer.Option(..., help="If true, the download will be forced")
    ] = False,
    quite: Annotated[
        bool,
        typer.Option(
            ...,
            "-q",
            help="No extra output is generated, the CLI print only the directory with the cached models.",
        ),
    ] = False,
    layout: Annotated[
        bool,
        typer.Option(..., help="If true, the layout model weights are downloaded."),
    ] = True,
    tableformer: Annotated[
        bool,
        typer.Option(
            ..., help="If true, the tableformer model weights are downloaded."
        ),
    ] = True,
    code_formula: Annotated[
        bool,
        typer.Option(
            ..., help="If true, the code formula model weights are downloaded."
        ),
    ] = True,
    picture_classifier: Annotated[
        bool,
        typer.Option(
            ..., help="If true, the picture classifier model weights are downloaded."
        ),
    ] = True,
    easyocr: Annotated[
        bool,
        typer.Option(..., help="If true, the easyocr model weights are downloaded."),
    ] = True,
    rapidocr: Annotated[
        bool,
        typer.Option(..., help="If true, the rapidocr model weights are downloaded."),
    ] = True,
):
    """Download the Docling model weights into a local directory for offline use."""
    # The docstring above is kept short on purpose: Typer shows it as the
    # command's help text. Every parameter is documented by the `help=` string
    # of its own `typer.Option`. Each selected model is stored in
    # `output_dir / <Model>._model_repo_folder`.

    # Make sure the folder exists
    output_dir.mkdir(exist_ok=True, parents=True)

    # Quiet mode also disables the progress reporting of the downloaders.
    show_progress = not quite

    def announce(what: str) -> None:
        # Status line printed before each download, skipped in quiet mode.
        if not quite:
            typer.secho(f"Downloading {what}...", fg="blue")

    if layout:
        announce("layout model")
        LayoutModel.download_models_hf(
            local_dir=output_dir / LayoutModel._model_repo_folder,
            force=force,
            progress=show_progress,
        )

    if tableformer:
        announce("tableformer model")
        TableStructureModel.download_models_hf(
            local_dir=output_dir / TableStructureModel._model_repo_folder,
            force=force,
            progress=show_progress,
        )

    if picture_classifier:
        announce("picture classifier model")
        DocumentPictureClassifier.download_models_hf(
            local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
            force=force,
            progress=show_progress,
        )

    if code_formula:
        announce("code formula model")
        CodeFormulaModel.download_models_hf(
            local_dir=output_dir / CodeFormulaModel._model_repo_folder,
            force=force,
            progress=show_progress,
        )

    if easyocr:
        announce("easyocr models")
        EasyOcrModel.download_models(
            local_dir=output_dir / EasyOcrModel._model_repo_folder,
            force=force,
            progress=show_progress,
        )

    # NOTE(review): `rapidocr` is accepted (and defaults to True) but no
    # download step in this function uses it. Confirm whether RapidOcrModel
    # offers a download helper and wire it in, or drop the flag.

    if quite:
        # Quiet mode: the directory is the only output, as promised by the
        # `-q` help text.
        typer.echo(output_dir)
    else:
        typer.secho(f"All models downloaded in the directory {output_dir}.", fg="green")

        console.print(
            "\n",
            "Docling can now be configured for running offline using the local artifacts.\n\n",
            "Using the CLI:",
            # f-string so the hint shows the actual directory instead of the
            # literal text "{output_dir}".
            f"`docling --artifacts-path={output_dir} FILE`",
            "\n",
            "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
        )
|
||
|
||
@app.command(hidden=True)
def other():
    # Placeholder command registered with hidden=True; it has no implementation.
    # NOTE(review): presumably present so the app has more than one command and
    # `download` stays an explicit sub-command - confirm.
    raise NotImplementedError
|
||
|
||
# Command object obtained from the Typer app via typer.main.get_command.
# NOTE(review): presumably exported for tooling that expects a Click command - confirm.
click_app = typer.main.get_command(app)

# Run the Typer app when this module is executed as a script.
if __name__ == "__main__":
    app()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
from pathlib import Path | ||
|
||
from docling.datamodel.base_models import InputFormat | ||
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions | ||
from docling.document_converter import DocumentConverter, PdfFormatOption | ||
|
||
# Directory holding the local model artifacts, e.g. the output directory of the
# `docling-models download` command.
artifacts_path = Path("PATH TO MODELS")  # <-- fill me

# EasyOCR is configured with download_enabled=False and pointed at the
# "EasyOcr" sub-folder of the artifacts directory.
easyocr_dir = artifacts_path / "EasyOcr"
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
pipeline_options.ocr_options = EasyOcrOptions(
    model_storage_directory=str(easyocr_dir),
    download_enabled=False,
)

# Attach the pipeline options to the PDF input format.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
doc_converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

result = doc_converter.convert("FILE TO CONVERT")  # <-- fill me
markdown = result.document.export_to_markdown()
print(markdown)
Oops, something went wrong.