Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ fastapi>=0.78.0
filetype
Flask
frozendict>=2.4.0
gitpython
gdown
httpx>=0.22.0
importlib_metadata ; python_version < '3.8'
Expand Down
19 changes: 13 additions & 6 deletions src/ocrd/resource_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import requests
from gdown.parse_url import parse_url as gparse_url
from gdown.download import get_url_from_gdrive_confirmation
from git import Repo
from yaml import safe_load, safe_dump

# pylint: disable=wrong-import-position
Expand All @@ -32,7 +33,7 @@
from ocrd_validators import OcrdResourceListValidator
from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config
from ocrd_utils.constants import RESOURCES_DIR_SYSTEM, RESOURCE_TYPES, MIME_TO_EXT
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
from ocrd_utils.os import get_processor_resource_types, is_git_url, list_all_resources, pushd_popd, get_ocrd_tool_json
from .constants import RESOURCE_USER_LIST_COMMENT


Expand Down Expand Up @@ -298,11 +299,17 @@ def _download_impl(log: Logger, url: str, filename):
url = get_url_from_gdrive_confirmation(r.text)
except RuntimeError as e:
log.warning(f"Cannot unwrap Google Drive URL: {e}")
with open(filename, 'wb') as f:
with requests.get(url, stream=True) as r:
r.raise_for_status()
for data in r.iter_content(chunk_size=4096):
f.write(data)
if is_git_url(url):
log.info("Cloning a git repository")
repo = Repo.clone_from(url, filename, depth=1)
# keep only the checkout
rmtree(join(filename, '.git'))
else:
with open(filename, 'wb') as f:
with requests.get(url, stream=True) as r:
r.raise_for_status()
for data in r.iter_content(chunk_size=4096):
f.write(data)
except Exception as e:
rmtree(filename, ignore_errors=True)
Path(filename).unlink(missing_ok=True)
Expand Down
4 changes: 3 additions & 1 deletion src/ocrd_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@

filesystem-related utilities

* :py:func:`is_string`,
* :py:func:`is_git_url`,
:py:func:`is_string`,
:py:func:`membername`,
:py:func:`concat_padded`,
:py:func:`nth_url_segment`,
Expand Down Expand Up @@ -189,6 +190,7 @@
guess_media_type,
list_all_resources,
is_file_in_directory,
is_git_url,
list_resource_candidates,
atomic_write,
pushd_popd,
Expand Down
12 changes: 11 additions & 1 deletion src/ocrd_utils/os.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
'abspath',
'directory_size',
'is_file_in_directory',
'is_git_url',
'get_ocrd_tool_json',
'get_moduledir',
'get_processor_resource_types',
Expand All @@ -27,7 +28,7 @@
from pathlib import Path
from os.path import abspath as abspath_, join
from zipfile import ZipFile
from subprocess import run, PIPE
from subprocess import run, PIPE, CalledProcessError
from mimetypes import guess_type as mimetypes_guess
from filetype import guess as filetype_guess
from fnmatch import filter as apply_glob
Expand Down Expand Up @@ -79,6 +80,15 @@ def unzip_file_to_dir(path_to_zip : Union[str, PathLike], output_directory : str
z.extractall(output_directory)


@lru_cache()
def is_git_url(url: str) -> bool:
    """
    Check whether ``url`` points to a git repository that can be listed.

    The check runs ``git ls-remote --exit-code -q -h -- <url>`` and treats a
    zero exit status as "this is a git repository with at least one branch".

    Args:
        url (str): URL or path to probe.

    Returns:
        bool: ``True`` if ``git ls-remote`` succeeds, ``False`` if it fails,
        if ``url`` is empty, or if the ``git`` executable cannot be run.

    Note:
        Results are memoized by ``lru_cache``, so a transient failure
        (e.g. network outage) stays cached as ``False`` for that URL until
        the cache entry is evicted or the process ends.
    """
    # Function-scope imports: only this probe needs them.
    from os import environ
    from subprocess import DEVNULL

    # Without a repository argument git may fall back to the default remote
    # of the current working directory, which is never what the caller means.
    if not url:
        return False
    try:
        run(
            # '--' ends option parsing so that a URL starting with '-' can
            # never be interpreted as a git option (e.g. --upload-pack=...).
            ['git', 'ls-remote', '--exit-code', '-q', '-h', '--', url],
            check=True,
            # Only the exit status matters: do not leak the ref listing or
            # git's "fatal: ..." messages for ordinary download URLs.
            stdin=DEVNULL,
            stdout=DEVNULL,
            stderr=DEVNULL,
            # Fail instead of blocking on an interactive credential prompt.
            env={**environ, 'GIT_TERMINAL_PROMPT': '0'},
        )
    except (CalledProcessError, OSError):
        # CalledProcessError: not a (reachable) git repository.
        # OSError: git is not installed or not executable.
        return False
    return True


@lru_cache()
def get_ocrd_tool_json(executable : str) -> Dict[str, Any]:
"""
Expand Down