Commit

Factor out download_url and wrap in lru_cache

amywieliczka committed Sep 24, 2024
1 parent 5fa3b9f commit 256f627
Showing 1 changed file with 104 additions and 84 deletions.
188 changes: 104 additions & 84 deletions content_harvester/by_record.py
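
The change below follows a standard memoization pattern: make the request object hashable, then let functools.lru_cache key on it. A minimal, self-contained sketch of that pattern (illustrative names only, not code taken from this diff):

    from dataclasses import dataclass
    from functools import lru_cache

    @dataclass(frozen=True)
    class Request:
        url: str

    @lru_cache(maxsize=10)
    def fetch(request: Request) -> bytes:
        # the body runs once per distinct request; later identical calls
        # return the cached value without re-downloading
        print(f"fetching {request.url}")
        return b"fake-bytes"

    fetch(Request("https://example.org/a.pdf"))  # prints, "downloads"
    fetch(Request("https://example.org/a.pdf"))  # cache hit: no print

The diff applies the same idea with a frozen ContentRequest dataclass that hashes on url alone, so media and thumbnail requests for the same source file share one download.
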
@@ -1,7 +1,9 @@
import hashlib
import os
from datetime import datetime
from typing import Optional
from typing import Optional, Any
from functools import lru_cache
from dataclasses import dataclass, asdict

from PIL import Image
from PIL import UnidentifiedImageError
@@ -81,19 +83,93 @@ def harvest_record_content(
media_source = record.get('media_source', {})
media_source_url = media_source.get('url')
if media_source_url:
request = {'url': media_source_url, 'auth': auth}
request = ContentRequest(media_source_url, auth)
record['media'] = create_media_component(collection_id, request, media_source)

thumbnail_src = record.get('thumbnail_source', get_thumb_src(record))
thumbnail_src_url = thumbnail_src.get('url')
if thumbnail_src_url:
request = {'url': thumbnail_src_url, 'auth': auth}
request = ContentRequest(thumbnail_src_url, auth)
record['thumbnail'] = create_thumbnail_component(collection_id, request, thumbnail_src, record['calisphere-id'])

return record


def get_dimensions(filepath: str, calisphere_id: str = 'not provided') -> Optional[tuple[int, int]]:
@dataclass(frozen=True)
class ContentRequest(object):
"""
An immutable, hashable object representing a request for content.
This is used as the cache key in download_url. Since auth can be
Any, we need to define our own __hash__ and __eq__ methods for use
with the lru cache.
"""
url: str
auth: Any

def __hash__(self):
return hash(self.url)

def __eq__(self, other):
return self.url == other.url
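
# Illustrative note, not part of this change: because __hash__ and __eq__
# consider only url, two requests for the same url share one lru_cache entry
# even when their auth values differ or are unhashable, e.g.:
#
#     a = ContentRequest("https://example.org/video.mp4", None)
#     b = ContentRequest("https://example.org/video.mp4", {"token": "xyz"})
#     assert a == b and hash(a) == hash(b)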


@lru_cache(maxsize=10)
def download_url(request: ContentRequest):
"""
Download a file from a given URL and return:
{
'path': str, # local filesystem path to the downloaded content file
'md5': str, # md5 hash of the content file
'size': int, # size of the downloaded file in bytes
'Content-Type': str, # Content-Type header from the response
'ETag': str, # ETag header from the response
'Last-Modified': str, # Last-Modified header from the response
}
This is cached in an in-memory lru_cache to avoid downloading the
same url multiple times - as is the case when the media file is a
video or pdf file and the thumbnail file is a jpg derived from the
same video or pdf file. The cache is limited to 10 entries, but
since we first download the media source file and then immediately
download the thumbnail source file, I think we could limit this to 2
items.
"""
get_request = asdict(request)
get_request.update({
"stream": True,
"timeout": (12.05, (60 * 10) + 0.05) # connect, read
})
response = http_session.get(**get_request)
try:
response.raise_for_status()
except requests.exceptions.HTTPError as e:
print(f"Error downloading {request.url}: {e}")
return None

local_destination = f"/tmp/{get_url_basename(request.url)}"
hasher = hashlib.new('md5')
source_size = 0
with open(local_destination, 'wb') as f:
for block in response.iter_content(1024 * hasher.block_size):
hasher.update(block)
f.write(block)
source_size += len(block)
md5 = hasher.hexdigest()

return {
'path': local_destination,
'md5': md5,
'size': source_size,
'Content-Type': response.headers.get('Content-Type'),
'ETag': response.headers.get('ETag'),
'Last-Modified': response.headers.get('Last-Modified'),
}
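
# Illustrative usage, stating assumed intent rather than code in this diff:
# when a record's media and thumbnail sources point at the same file, the
# second call below is answered from the lru_cache and no second GET is made.
#
#     req = ContentRequest("https://example.org/talk.mp4", None)
#     media = download_url(req)   # performs the HTTP GET, writes /tmp/talk.mp4
#     thumb = download_url(req)   # cache hit: returns the same dict object
#     assert media is thumb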


def get_dimensions(
filepath: str,
calisphere_id: str = 'not provided') -> Optional[tuple[int, int]]:
try:
return Image.open(filepath).size
except UnidentifiedImageError as e:
@@ -114,61 +190,35 @@ def get_dimensions(filepath: str, calisphere_id: str = 'not provided') -> Option
return None


def create_media_component(collection_id, request: dict, media_source: dict[str, str]) -> Optional[dict[str, str]]:
def create_media_component(
collection_id,
request: ContentRequest,
media_source: dict[str, str]) -> Optional[dict[str, str]]:
'''
download source file to local disk
'''
url = request['url']
if request.get('auth') and urlparse(url).scheme != 'https':
url = request.url
if request.auth and urlparse(url).scheme != 'https':
raise Exception(f"Basic auth not over https is a bad idea! {url}")
head_resp = http_session.head(**request)
head_resp = http_session.head(**asdict(request))

media_component = content_component_cache.get('|'.join([
collection_id,
request['url'],
request.url,
'media',
head_resp.headers.get('ETag', ''),
head_resp.headers.get('Last-Modified', '')]))
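# Descriptive note: cache keys couple collection id, source url, component
# type, and the upstream ETag / Last-Modified headers, so a changed source
# file misses this lookup and the media component is rebuilt below.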
if media_component:
media_component['from-cache'] = True
print(f"Retrieved media component from cache for {request['url']}")
print(f"Retrieved media component from cache for {url}")
return media_component

media_dest_filename = media_source.get('filename', get_url_basename(url))

media_source_component = in_memory_cache.get(request['url'])
media_source_component = download_url(request)
if not media_source_component:
request.update({
"stream": True,
"timeout": (12.05, (60 * 10) + 0.05) # connect, read
})
response = http_session.get(**request)
try:
response.raise_for_status()
except requests.exceptions.HTTPError as e:
print(f"Error downloading {url}: {e}")
return None

local_destination = f"/tmp/{get_url_basename(request['url'])}"
hasher = hashlib.new('md5')
source_size = 0
with open(local_destination, 'wb') as f:
for block in response.iter_content(1024 * hasher.block_size):
hasher.update(block)
f.write(block)
source_size += len(block)
md5 = hasher.hexdigest()

media_source_component = {
'path': local_destination,
'md5': md5,
'Content-Type': response.headers.get('Content-Type'),
'ETag': response.headers.get('ETag'),
'Last-Modified': response.headers.get('Last-Modified'),
'size': source_size
}
in_memory_cache[request['url']] = media_source_component

return None

media_tmp_filepath = media_source_component['path']
media_component = dict()
if media_source.get('nuxeo_type') == 'SampleCustomPicture':
@@ -202,69 +252,39 @@ def create_media_component(collection_id, request: dict, media_source: dict[str,
})
content_component_cache['|'.join([
collection_id,
request['url'],
request.url,
'media',
head_resp.headers.get('Etag', ''),
head_resp.headers.get('Last-Modified', '')
])] = media_component
print(f"Created media component for {request['url']}")
print(f"Created media component for {request.url}")
media_component['from-cache'] = False
return media_component


def create_thumbnail_component(collection_id, request: dict, thumb_src: dict[str, str], record_context) -> Optional[dict[str, str]]:
def create_thumbnail_component(collection_id, request: ContentRequest, thumb_src: dict[str, str], record_context) -> Optional[dict[str, str]]:
'''
download source file to local disk
'''
url = request['url']
if request.get('auth') and urlparse(url).scheme != 'https':
url = request.url
if request.auth and urlparse(url).scheme != 'https':
raise Exception(f"Basic auth not over https is a bad idea! {url}")

head_resp = http_session.head(**request)
head_resp = http_session.head(**asdict(request))
thumb_component = content_component_cache.get('|'.join([
collection_id,
request['url'],
request.url,
'thumbnail',
head_resp.headers.get('ETag', ''),
head_resp.headers.get('Last-Modified', '')]))
if thumb_component:
thumb_component['from-cache'] = True
print(f"Retrieved thumbnail component from cache for {request['url']}")
print(f"Retrieved thumbnail component from cache for {request.url}")
return thumb_component

thumb_source_component = in_memory_cache.get(request['url'])
thumb_source_component = download_url(request)
if not thumb_source_component:
request.update({
"stream": True,
"timeout": (12.05, (60 * 10) + 0.05) # connect, read
})
response = http_session.get(**request)
try:
response.raise_for_status()
except requests.exceptions.HTTPError as e:
print(f"Error downloading {url}: {e}")
return None

local_destination = f"/tmp/{get_url_basename(request['url'])}"
hasher = hashlib.new('md5')
source_size = 0
with open(local_destination, 'wb') as f:
for block in response.iter_content(1024 * hasher.block_size):
hasher.update(block)
f.write(block)
source_size += len(block)
md5 = hasher.hexdigest()

thumb_source_component = {
'path': local_destination,
'md5': md5,
'Content-Type': response.headers.get('Content-Type'),
'ETag': response.headers.get('ETag'),
'Last-Modified': response.headers.get('Last-Modified'),
'size': source_size
}
in_memory_cache[request['url']] = thumb_source_component

return None

thumbnail_tmp_filepath = thumb_source_component['path']
thumbnail_md5 = thumb_source_component['md5']
@@ -305,12 +325,12 @@ def create_thumbnail_component(collection_id, request: dict, thumb_src: dict[str
}
content_component_cache['|'.join([
collection_id,
request['url'],
request.url,
'thumbnail',
head_resp.headers.get('Etag', ''),
head_resp.headers.get('Last-Modified', '')
])] = thumb_component
print(f"Created thumbnail component for {request['url']}")
print(f"Created thumbnail component for {request.url}")
thumb_component['from-cache'] = False
return thumb_component

