Skip to content

Commit 8f1ba18

Browse files
authored
Merge pull request #30 from simple-repository/feature/file-enrichment-base-class
Simplify the FileSizeEnrichment repository implementation
2 parents 835b8d3 + da4706c commit 8f1ba18

File tree

1 file changed

+159
-94
lines changed

1 file changed

+159
-94
lines changed
Lines changed: 159 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
"""
2-
FileSizeEnrichmentRepository component for adding file size information to project pages.
2+
File enrichment repository components.
33
4-
This component wraps another repository and automatically enriches file metadata
5-
with size information by making HTTP HEAD requests to files that don't already
6-
have size information.
4+
This module provides base classes for enriching file metadata in project pages,
5+
with a concrete implementation for HTTP HEAD-based enrichment.
76
"""
87

8+
from __future__ import annotations
9+
10+
import abc
911
import asyncio
1012
from dataclasses import replace
1113
import logging
12-
import typing
1314

1415
import httpx
1516
from simple_repository import SimpleRepository, model
@@ -20,37 +21,15 @@
2021
logger = logging.getLogger(__name__)
2122

2223

23-
class FileSizeEnrichmentRepository(RepositoryContainer):
24+
class FileEnrichingRepository(RepositoryContainer):
2425
"""
25-
Repository component that enriches file metadata with size information.
26+
Base class to enrich Files in parallel.
2627
27-
This component automatically adds size information to files that don't already
28-
have it by making HTTP HEAD requests. It maintains parallelism for efficiency
29-
while respecting concurrency limits.
28+
This component handles the mechanics of enriching file metadata in parallel,
29+
without any assumptions about how the enrichment is performed. Subclasses
30+
implement the _enrich_file method to define enrichment logic.
3031
"""
3132

32-
def __init__(
33-
self,
34-
source: SimpleRepository,
35-
http_client: httpx.AsyncClient,
36-
*,
37-
max_concurrent_requests: int = 10,
38-
) -> None:
39-
"""
40-
Initialize the FileSizeEnrichmentRepository.
41-
42-
Parameters
43-
----------
44-
source: The underlying repository to wrap
45-
46-
http_client: HTTP client for making HEAD requests
47-
48-
max_concurrent_requests: Maximum number of concurrent HEAD requests
49-
"""
50-
super().__init__(source)
51-
self.http_client = http_client
52-
self.semaphore = asyncio.Semaphore(max_concurrent_requests)
53-
5433
@override
5534
async def get_project_page(
5635
self,
@@ -59,89 +38,175 @@ async def get_project_page(
5938
request_context: model.RequestContext | None = None,
6039
) -> model.ProjectDetail:
6140
"""
62-
Get project page with file sizes enriched.
41+
Get project page with enriched files.
6342
64-
Files that don't have size information will have their sizes fetched
65-
via HTTP HEAD requests in parallel.
43+
Files will be enriched in parallel according to the _enrich_file implementation.
6644
"""
6745
project_page = await super().get_project_page(
6846
project_name, request_context=request_context
6947
)
7048

71-
# Identify files that need size information
72-
files_needing_size = [
73-
file for file in project_page.files if not file.size and file.url
74-
]
49+
enriched_files = await self._enrich_files(project_page.files)
7550

76-
if not files_needing_size:
77-
# No files need size information, return as-is
78-
return project_page
51+
if enriched_files is not project_page.files:
52+
project_page = replace(project_page, files=enriched_files)
7953

80-
# Fetch sizes for files that need them
81-
size_info = await self._fetch_file_sizes(files_needing_size)
54+
return project_page
8255

83-
# Create new files with updated size information
84-
enriched_files = []
85-
for file in project_page.files:
86-
if file.filename in size_info:
87-
file = replace(file, size=size_info[file.filename])
88-
enriched_files.append(file)
56+
@abc.abstractmethod
57+
async def _enrich_file(self, file: model.File) -> model.File | None:
58+
"""
59+
Enrich a single file with metadata.
8960
90-
return replace(project_page, files=tuple(enriched_files))
61+
Subclasses must implement this method to define enrichment logic.
9162
92-
async def _fetch_file_sizes(
93-
self, files: typing.List[model.File]
94-
) -> typing.Dict[str, int]:
95-
"""
96-
Fetch file sizes for multiple files in parallel.
63+
Parameters
64+
----------
65+
file: The file to enrich
9766
98-
Args:
99-
files: List of files to fetch sizes for
67+
Returns
68+
-------
69+
The enriched file, or None if no enrichment is needed/possible
70+
"""
71+
...
10072

101-
Returns:
102-
Dictionary mapping filename to size in bytes
73+
async def _enrich_files(
74+
self, files: tuple[model.File, ...]
75+
) -> tuple[model.File, ...]:
10376
"""
77+
Enrich multiple files in parallel.
10478
105-
async def fetch_single_file_size(
106-
file: model.File,
107-
) -> typing.Tuple[str, typing.Optional[int]]:
108-
"""Fetch size for a single file with semaphore protection."""
109-
async with self.semaphore:
110-
try:
111-
logger.debug(f"Fetching size for {file.filename} from {file.url}")
112-
113-
# Make HEAD request to get Content-Length
114-
response = await self.http_client.head(
115-
file.url, follow_redirects=True, headers={}
116-
)
117-
response.raise_for_status()
118-
119-
content_length = response.headers.get("Content-Length")
120-
if content_length:
121-
return file.filename, int(content_length)
122-
else:
123-
logger.warning(f"No Content-Length header for {file.filename}")
124-
return file.filename, None
125-
126-
except BaseException as e:
127-
logger.warning(f"Failed to get size for {file.filename}: {e}")
128-
return file.filename, None
79+
Parameters
80+
----------
81+
files: Tuple of files to enrich
12982
83+
Returns
84+
-------
85+
Tuple of enriched files. If no enrichment took place, the original files
86+
tuple instance is returned.
87+
"""
13088
# Create tasks for all files
131-
tasks = [fetch_single_file_size(file) for file in files]
89+
tasks = [self._enrich_file(file) for file in files]
13290

13391
# Wait for all tasks to complete
13492
results = await asyncio.gather(*tasks, return_exceptions=True)
13593

136-
# Process results, filtering out failures
137-
size_info = {}
138-
for result in results:
94+
# Process results, falling back to the original file on exceptions
95+
enriched_files = []
96+
files_were_enriched = False
97+
98+
# Create new files with updated information
99+
for orig_file, result in zip(files, results):
139100
if isinstance(result, BaseException):
140-
logger.warning(f"Exception occurred during size fetching: {result}")
141-
continue
101+
logger.warning(f"Exception occurred during file enrichment: {result}")
102+
enriched_files.append(orig_file)
103+
elif result is None:
104+
enriched_files.append(orig_file)
105+
else:
106+
files_were_enriched = True
107+
enriched_files.append(result)
108+
109+
if not files_were_enriched:
110+
# Return the original files tuple if no changes. This is an optimisation,
111+
# but it also means that we can do `enriched_files is files`.
112+
return files
142113

143-
filename, size = result
144-
if size is not None:
145-
size_info[filename] = size
114+
return tuple(enriched_files)
146115

147-
return size_info
116+
117+
class FileSizeEnrichmentRepository(FileEnrichingRepository):
118+
"""
119+
Repository component that enriches file metadata using HTTP HEAD requests.
120+
121+
This component makes HTTP HEAD requests to fetch metadata from response headers.
122+
It uses a semaphore to limit concurrent requests and provides a template method
123+
for processing response headers that can be easily overridden in subclasses.
124+
"""
125+
126+
def __init__(
127+
self,
128+
source: SimpleRepository,
129+
http_client: httpx.AsyncClient,
130+
*,
131+
max_concurrent_requests: int = 10,
132+
) -> None:
133+
"""
134+
Initialize the FileSizeEnrichmentRepository.
135+
136+
Parameters
137+
----------
138+
source: The underlying repository to wrap
139+
140+
http_client: HTTP client for making HEAD requests
141+
142+
max_concurrent_requests: Maximum number of concurrent HEAD requests
143+
"""
144+
super().__init__(source)
145+
self.http_client = http_client
146+
self.semaphore = asyncio.Semaphore(max_concurrent_requests)
147+
148+
@override
149+
async def _enrich_file(self, file: model.File) -> model.File | None:
150+
"""
151+
Enrich a single file by making an HTTP HEAD request.
152+
153+
This checks if enrichment is needed, makes the HEAD request with semaphore
154+
control, and delegates header processing to _enrich_with_resource_head_response.
155+
156+
Parameters
157+
----------
158+
file: The file to enrich
159+
160+
Returns
161+
-------
162+
The enriched file, or None if no enrichment is needed/possible
163+
"""
164+
# Skip files that already have size information
165+
if file.size is not None:
166+
return None
167+
168+
# Skip files without URLs (can't fetch metadata)
169+
if not file.url:
170+
return None
171+
172+
async with self.semaphore:
173+
try:
174+
logger.debug(
175+
f"Fetching HEAD metadata for {file.filename} from {file.url}"
176+
)
177+
178+
response = await self.http_client.head(
179+
file.url, follow_redirects=True, headers={}
180+
)
181+
response.raise_for_status()
182+
183+
return self._enrich_with_resource_head_response(file, response)
184+
185+
except BaseException as e:
186+
logger.warning(f"Failed to fetch metadata for {file.filename}: {e}")
187+
return None
188+
189+
def _enrich_with_resource_head_response(
190+
self, file: model.File, response: httpx.Response
191+
) -> model.File | None:
192+
"""
193+
Process HTTP HEAD response headers to enrich file metadata.
194+
195+
Override this method in subclasses to extract additional metadata from headers.
196+
By default, this extracts only the file size from Content-Length.
197+
198+
Parameters
199+
----------
200+
file: The original file
201+
response: The HTTP HEAD response
202+
203+
Returns
204+
-------
205+
The enriched file, or None if no enrichment was possible
206+
"""
207+
content_length = response.headers.get("Content-Length")
208+
if content_length:
209+
return replace(file, size=int(content_length))
210+
else:
211+
logger.warning(f"No Content-Length header for {file.filename}")
212+
return None

0 commit comments

Comments
 (0)