11"""
2- FileSizeEnrichmentRepository component for adding file size information to project pages .
2+ File enrichment repository components .
33
4- This component wraps another repository and automatically enriches file metadata
5- with size information by making HTTP HEAD requests to files that don't already
6- have size information.
4+ This module provides base classes for enriching file metadata in project pages,
5+ with a concrete implementation for HTTP HEAD-based enrichment.
76"""
87
8+ from __future__ import annotations
9+
10+ import abc
911import asyncio
1012from dataclasses import replace
1113import logging
12- import typing
1314
1415import httpx
1516from simple_repository import SimpleRepository , model
2021logger = logging .getLogger (__name__ )
2122
2223
23- class FileSizeEnrichmentRepository (RepositoryContainer ):
24+ class FileEnrichingRepository (RepositoryContainer ):
2425 """
25- Repository component that enriches file metadata with size information .
26+ Base class to enrich Files in parallel .
2627
27- This component automatically adds size information to files that don't already
28- have it by making HTTP HEAD requests. It maintains parallelism for efficiency
29- while respecting concurrency limits .
28+ This component handles the mechanics of enriching file metadata in parallel,
29+ without any assumptions about how the enrichment is performed. Subclasses
30+ implement the _enrich_file method to define enrichment logic .
3031 """
3132
32- def __init__ (
33- self ,
34- source : SimpleRepository ,
35- http_client : httpx .AsyncClient ,
36- * ,
37- max_concurrent_requests : int = 10 ,
38- ) -> None :
39- """
40- Initialize the FileSizeEnrichmentRepository.
41-
42- Parameters
43- ----------
44- source: The underlying repository to wrap
45-
46- http_client: HTTP client for making HEAD requests
47-
48- max_concurrent_requests: Maximum number of concurrent HEAD requests
49- """
50- super ().__init__ (source )
51- self .http_client = http_client
52- self .semaphore = asyncio .Semaphore (max_concurrent_requests )
53-
5433 @override
5534 async def get_project_page (
5635 self ,
@@ -59,89 +38,175 @@ async def get_project_page(
5938 request_context : model .RequestContext | None = None ,
6039 ) -> model .ProjectDetail :
6140 """
62- Get project page with file sizes enriched.
41+ Get project page with enriched files .
6342
64- Files that don't have size information will have their sizes fetched
65- via HTTP HEAD requests in parallel.
43+ Files will be enriched in parallel according to the _enrich_file implementation.
6644 """
6745 project_page = await super ().get_project_page (
6846 project_name , request_context = request_context
6947 )
7048
71- # Identify files that need size information
72- files_needing_size = [
73- file for file in project_page .files if not file .size and file .url
74- ]
49+ enriched_files = await self ._enrich_files (project_page .files )
7550
76- if not files_needing_size :
77- # No files need size information, return as-is
78- return project_page
51+ if enriched_files is not project_page .files :
52+ project_page = replace (project_page , files = enriched_files )
7953
80- # Fetch sizes for files that need them
81- size_info = await self ._fetch_file_sizes (files_needing_size )
54+ return project_page
8255
83- # Create new files with updated size information
84- enriched_files = []
85- for file in project_page .files :
86- if file .filename in size_info :
87- file = replace (file , size = size_info [file .filename ])
88- enriched_files .append (file )
@abc.abstractmethod
async def _enrich_file(self, file: model.File) -> model.File | None:
    """
    Produce an enriched version of a single file.

    This is the extension point of the base class: concrete subclasses
    decide what "enrichment" means and how the extra metadata is obtained.

    Parameters
    ----------
    file: The file to enrich

    Returns
    -------
    The enriched file, or None if no enrichment is needed/possible
    """
    # Abstract hook: the base implementation contributes nothing.
    return None
10072
101- Returns:
102- Dictionary mapping filename to size in bytes
async def _enrich_files(
    self, files: tuple[model.File, ...]
) -> tuple[model.File, ...]:
    """
    Enrich multiple files in parallel.

    ``_enrich_file`` is awaited concurrently for every entry of ``files``.
    An entry is kept unchanged when its enrichment raises (the exception is
    logged), returns None, or returns the very same object it was given.

    Parameters
    ----------
    files: Tuple of files to enrich

    Returns
    -------
    Tuple of files in the original order. If no file was changed, the
    original ``files`` tuple instance is returned, so callers can detect
    "nothing changed" with an identity check.
    """
    # Run every enrichment concurrently; failures come back as exception
    # objects in ``results`` instead of aborting the whole batch.
    results = await asyncio.gather(
        *(self._enrich_file(file) for file in files),
        return_exceptions=True,
    )

    enriched_files = []
    files_were_enriched = False

    for orig_file, result in zip(files, results):
        if isinstance(result, BaseException):
            # Best effort: a failed enrichment falls back to the original file.
            logger.warning(f"Exception occurred during file enrichment: {result}")
            enriched_files.append(orig_file)
        elif result is None or result is orig_file:
            # A hook that hands back the same object did not change anything,
            # so it must not defeat the identity short-circuit below.
            enriched_files.append(orig_file)
        else:
            files_were_enriched = True
            enriched_files.append(result)

    if not files_were_enriched:
        # Return the original files tuple if no changes. This is an optimisation,
        # but it also means that we can do `enriched_files is files`.
        return files

    return tuple(enriched_files)
146115
147- return size_info
116+
class FileSizeEnrichmentRepository(FileEnrichingRepository):
    """
    Repository component that enriches file metadata using HTTP HEAD requests.

    For every file that has a URL but no size, a HEAD request is issued and the
    response is handed to ``_enrich_with_resource_head_response``, which by
    default derives the size from the ``Content-Length`` header. A semaphore
    limits how many HEAD requests are in flight at once. Subclasses can
    override ``_enrich_with_resource_head_response`` to extract additional
    metadata from the response headers.
    """

    def __init__(
        self,
        source: SimpleRepository,
        http_client: httpx.AsyncClient,
        *,
        max_concurrent_requests: int = 10,
    ) -> None:
        """
        Initialize the FileSizeEnrichmentRepository.

        Parameters
        ----------
        source: The underlying repository to wrap

        http_client: HTTP client for making HEAD requests

        max_concurrent_requests: Maximum number of concurrent HEAD requests
        """
        super().__init__(source)
        self.http_client = http_client
        self.semaphore = asyncio.Semaphore(max_concurrent_requests)

    @override
    async def _enrich_file(self, file: model.File) -> model.File | None:
        """
        Enrich a single file by making an HTTP HEAD request.

        This checks if enrichment is needed, makes the HEAD request with semaphore
        control, and delegates header processing to _enrich_with_resource_head_response.

        Parameters
        ----------
        file: The file to enrich

        Returns
        -------
        The enriched file, or None if no enrichment is needed/possible
        """
        # Skip files that already have size information
        if file.size is not None:
            return None

        # Skip files without URLs (can't fetch metadata)
        if not file.url:
            return None

        async with self.semaphore:
            try:
                logger.debug(
                    f"Fetching HEAD metadata for {file.filename} from {file.url}"
                )

                response = await self.http_client.head(
                    file.url, follow_redirects=True, headers={}
                )
                response.raise_for_status()

                return self._enrich_with_resource_head_response(file, response)

            except Exception as e:
                # Best effort: any ordinary failure leaves the file un-enriched.
                # Deliberately not BaseException, so that task cancellation
                # (asyncio.CancelledError) and interpreter-exit signals still
                # propagate instead of being logged and swallowed.
                logger.warning(f"Failed to fetch metadata for {file.filename}: {e}")
                return None

    def _enrich_with_resource_head_response(
        self, file: model.File, response: httpx.Response
    ) -> model.File | None:
        """
        Process HTTP HEAD response headers to enrich file metadata.

        Override this method in subclasses to extract additional metadata from headers.
        By default, this extracts only the file size from Content-Length.

        Parameters
        ----------
        file: The original file
        response: The HTTP HEAD response

        Returns
        -------
        The enriched file, or None if no enrichment was possible (the header is
        missing, empty, or not an integer)
        """
        content_length = response.headers.get("Content-Length")
        if not content_length:
            logger.warning(f"No Content-Length header for {file.filename}")
            return None

        try:
            size = int(content_length)
        except ValueError:
            # A malformed header is reported as such rather than surfacing
            # as a generic failure in the caller.
            logger.warning(
                f"Invalid Content-Length header for {file.filename}: "
                f"{content_length!r}"
            )
            return None

        return replace(file, size=size)
0 commit comments