Skip to content

Commit 5329c91

Browse files
committed
Support traditional "simple" HTML registries
1 parent 3c1da1b commit 5329c91

File tree

3 files changed

+165
-38
lines changed

3 files changed

+165
-38
lines changed

pypi_browser/app.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,16 @@ async def dispatch(
7474

7575

7676
# Application settings are read through Starlette's Config helper.
config = starlette.config.Config()

# Any trailing slash is dropped before the '/simple' suffix check below.
pypi_url = config('PYPI_BROWSER_PYPI_URL', default='https://pypi.org').rstrip('/')

# A URL ending in '/simple' selects the HTML "simple" registry client;
# every other URL gets the pypi.org-style JSON client.
repo: pypi.PythonRepository = (
    pypi.SimpleRepository(pypi_url)
    if pypi_url.endswith('/simple')
    else pypi.LegacyJsonRepository(pypi_url)
)

pypi_config = pypi.PyPIConfig(
    repo=repo,
    cache_path=config('PYPI_BROWSER_PACKAGE_CACHE_PATH', default='/tmp'),
)
8188

8289
templates = Jinja2Templates(
@@ -115,16 +122,25 @@ async def package(request: Request) -> Response:
115122
return RedirectResponse(request.url_for('package', package=normalized_package_name))
116123

117124
try:
118-
version_to_files = await pypi.files_for_package(pypi_config, package_name)
125+
version_to_files = await pypi.files_by_version(pypi_config, package_name)
119126
except pypi.PackageDoesNotExist:
120127
return PlainTextResponse(
121128
f'Package {package_name!r} does not exist on PyPI.',
122129
status_code=404,
123130
)
124131
else:
132+
def _version_sort_key(version: str | None) -> packaging.version.Version:
    """Return a sortable ``Version`` for *version*, tolerating bad input.

    ``None`` and strings that raise ``InvalidVersion`` both map to
    ``Version('0.0.0')`` so they can be compared with real versions.
    """
    if version is None:
        return packaging.version.Version('0.0.0')
    try:
        parsed = packaging.version.parse(version)
    except packaging.version.InvalidVersion:
        # Not really correct, but just throw everything we can't parse at the bottom.
        return packaging.version.Version('0.0.0')
    return parsed
140+
125141
version_to_files_sorted = sorted(
126142
version_to_files.items(),
127-
key=lambda item: packaging.version.parse(item[0]),
143+
key=lambda item: _version_sort_key(item[0]),
128144
reverse=True,
129145
)
130146
return templates.TemplateResponse(

pypi_browser/packaging.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,64 @@
1212
from types import TracebackType
1313

1414

15+
# Copied from distlib/wheel.py
16+
# Wheel filename pattern (copied from distlib/wheel.py). Named groups:
#   nm = distribution name, vn = version, bn = optional build tag,
#   py = python tag(s), bi = ABI tag, ar = platform/arch tag(s).
# VERBOSE makes the newlines inside the pattern insignificant.
WHEEL_FILENAME_RE = re.compile(
    r'''
(?P<nm>[^-]+)
-(?P<vn>\d+[^-]*)
(-(?P<bn>\d+[^-]*))?
-(?P<py>\w+\d+(\.\w+\d+)*)
-(?P<bi>\w+)
-(?P<ar>\w+(\.\w+)*)
\.whl$
''',
    re.VERBOSE | re.IGNORECASE,
)
25+
26+
1527
def pep426_normalize(package_name: str) -> str:
    """Return the normalized form of *package_name*.

    Surrounding whitespace is stripped, every run of ``-``, ``_`` and ``.``
    is collapsed into a single ``-``, and the result is lower-cased.
    """
    trimmed = package_name.strip()
    collapsed = re.sub(r'[-_.]+', '-', trimmed)
    return collapsed.lower()
1729

1830

31+
def _remove_extension(name: str) -> str:
32+
if name.endswith(('gz', 'bz2')):
33+
name, _ = name.rsplit('.', 1)
34+
name, _ = name.rsplit('.', 1)
35+
return name
36+
37+
38+
def guess_version_from_filename(filename: str) -> str | None:
    """Best-effort guess of the version embedded in a distribution filename.

    Wheel filenames (``*.whl``) are matched against ``WHEEL_FILENAME_RE`` and
    the ``vn`` group is returned. Any other filename has its extension
    stripped and is then split on ``-`` to separate the name from the version.

    Returns:
        The version string, or ``None`` when the name contains no ``-`` or no
        part that looks like a version.

    Raises:
        ValueError: if a wheel filename does not match the wheel pattern, if
            the name or version component comes out empty, or if stripping
            the extension fails.
    """
    # Inspired by https://github.com/chriskuehl/dumb-pypi/blob/a71c3cfeba6/dumb_pypi/main.py#L56
    if filename.endswith('.whl'):
        # TODO: Switch to packaging.utils.parse_wheel_filename which enforces
        # PEP440 versions for wheels.
        m = WHEEL_FILENAME_RE.match(filename)
        if m is None:
            raise ValueError(f'Invalid package name: {filename}')
        return m.group('vn')

    # These don't have a well-defined format like wheels do, so they are
    # sort of "best effort", with lots of tests to back them up.
    # The most important thing is to correctly parse the name.
    name = _remove_extension(filename)
    version = None

    if '-' in name:
        if name.count('-') == 1:
            name, version = name.split('-')
        else:
            parts = name.split('-')
            # The version starts at the leftmost part (after the first) that
            # contains both a dot and a digit; everything from there on,
            # including later dash-separated parts, belongs to the version.
            for i in range(1, len(parts)):
                part = parts[i]
                if '.' in part and re.search('[0-9]', part):
                    name, version = '-'.join(parts[:i]), '-'.join(parts[i:])
                    break

    # Possible with poorly-named files.
    if not name:
        raise ValueError(f'Invalid package name: {filename}')

    # An empty version (e.g. "foo-.tar.gz") is reported as ValueError like any
    # other malformed name, rather than through an assert that disappears
    # under ``python -O``.
    if version is not None and not version:
        raise ValueError(f'Invalid package name: {filename}')

    return version
71+
72+
1973
class UnsupportedPackageType(Exception):
    """Error for a package type that is not supported (raise sites not shown here)."""
2175

pypi_browser/pypi.py

Lines changed: 92 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1+
import abc
12
import base64
3+
import collections
24
import contextlib
35
import dataclasses
6+
import html.parser
47
import itertools
58
import os.path
69
import typing
@@ -9,38 +12,98 @@
912
import aiofiles.os
1013
import httpx
1114

15+
from pypi_browser import packaging
16+
17+
18+
class PythonRepository(abc.ABC):
    """Abstract interface for a package index that can list a package's files."""

    @abc.abstractmethod
    async def files_for_package(self, package_name: str) -> typing.Dict[str, str]:
        """Return mapping from filename to file URL for files in a package."""
        ...
23+
24+
25+
class HTMLAnchorParser(html.parser.HTMLParser):
26+
anchors: set[str]
27+
28+
def __init__(self) -> None:
29+
super().__init__()
30+
self.anchors = set()
31+
32+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
33+
if tag == 'a':
34+
if href := dict(attrs).get('href'):
35+
self.anchors.add(href)
36+
1237

1338
@dataclasses.dataclass(frozen=True)
class SimpleRepository(PythonRepository):
    """Old-style "simple" PyPI registry serving HTML files."""
    # TODO: Also handle PEP691 JSON simple repositories.

    # Base URL of the simple index; the package name is appended to it.
    pypi_url: str

    async def files_for_package(self, package_name: str) -> typing.Dict[str, str]:
        """Return mapping from filename to file URL for files in a package.

        Fetches ``{pypi_url}/{package_name}`` (following redirects) and treats
        every ``<a href>`` on the page as a file link. Each link is resolved
        against the final response URL and has its fragment removed.

        Raises:
            PackageDoesNotExist: if the index responds with 404.
            httpx.HTTPStatusError: for any other 4xx/5xx response, so an
                error page is never mistaken for an empty file listing.
        """
        async with httpx.AsyncClient() as client:
            resp = await client.get(
                f'{self.pypi_url}/{package_name}',
                follow_redirects=True,
            )
            if resp.status_code == 404:
                raise PackageDoesNotExist(package_name)
            resp.raise_for_status()

        parser = HTMLAnchorParser()
        parser.feed(resp.text)
        base_url = str(resp.url)

        def clean_url(url: str) -> str:
            # Resolve relative links and drop the "#sha256=..." style fragment.
            parsed = urllib.parse.urlparse(urllib.parse.urljoin(base_url, url))
            return parsed._replace(fragment='').geturl()

        def filename_of(url: str) -> str:
            # The last path segment is the filename. It is percent-decoded so
            # that e.g. "%2B" in a local version becomes "+".
            last_segment = urllib.parse.urlparse(url).path.split('/')[-1]
            return urllib.parse.unquote(last_segment)

        return {filename_of(url): clean_url(url) for url in parser.anchors}
2263

23-
async def package_metadata(
24-
config: PyPIConfig,
25-
client: httpx.AsyncClient,
26-
package: str,
27-
) -> typing.Dict[typing.Any, typing.Any]:
28-
resp = await client.get(f'{config.pypi_url}/pypi/{package}/json')
29-
if resp.status_code == 404:
30-
raise PackageDoesNotExist(package)
31-
resp.raise_for_status()
32-
return resp.json()
3364

65+
@dataclasses.dataclass(frozen=True)
class LegacyJsonRepository(PythonRepository):
    """Non-standardized JSON API compatible with pypi.org's /pypi/*/json endpoints."""

    # Base URL of the index; '/pypi/<package>/json' is appended to it.
    pypi_url: str

    async def files_for_package(self, package_name: str) -> typing.Dict[str, str]:
        """Return mapping from filename to file URL for files in a package.

        Every file of every entry under the response's ``releases`` key is
        included; file URLs are resolved against the final response URL.

        Raises:
            PackageDoesNotExist: if the endpoint responds with 404.
            httpx.HTTPStatusError: for any other error status.
        """
        endpoint = f'{self.pypi_url}/pypi/{package_name}/json'
        async with httpx.AsyncClient() as client:
            resp = await client.get(endpoint, follow_redirects=True)
            if resp.status_code == 404:
                raise PackageDoesNotExist(package_name)
            resp.raise_for_status()

            base_url = str(resp.url)
            filename_to_url: typing.Dict[str, str] = {}
            for release_files in resp.json()['releases'].values():
                for file_ in release_files:
                    filename = file_['filename']
                    filename_to_url[filename] = urllib.parse.urljoin(base_url, file_['url'])
            return filename_to_url
3883

39-
return {
40-
version: {file_['filename'] for file_ in files}
41-
for version, files in metadata['releases'].items()
42-
if len(files) > 0
43-
}
84+
85+
@dataclasses.dataclass(frozen=True)
class PyPIConfig:
    """Immutable settings bundle handed to the functions in this module."""

    # Repository client used to look up a package's files.
    repo: PythonRepository
    # Presumably the directory downloaded files are cached under -- confirm
    # against downloaded_file_path.
    cache_path: str
89+
90+
91+
class PackageDoesNotExist(Exception):
    """Raised by the repository clients when the index answers 404 for a package."""
93+
94+
95+
async def files_by_version(config: PyPIConfig, package: str) -> typing.Dict[str | None, typing.Set[str]]:
    """Group the filenames of *package* by the version guessed from each name.

    Filenames come from ``config.repo.files_for_package``. The key is ``None``
    for files whose version could not be guessed; files whose name is
    rejected with ``ValueError`` are left out entirely.
    """
    filenames = await config.repo.files_for_package(package)
    grouped = collections.defaultdict(set)
    for filename in filenames:
        try:
            version = packaging.guess_version_from_filename(filename)
        except ValueError:
            # Possible with some very poorly-formed packages that used to be
            # allowed on PyPI. Just skip them when this happens.
            continue
        grouped[version].add(filename)
    return grouped
44107

45108

46109
class CannotFindFileError(Exception):
@@ -81,21 +144,15 @@ async def downloaded_file_path(config: PyPIConfig, package: str, filename: str)
81144
if await aiofiles.os.path.exists(stored_path):
82145
return stored_path
83146

84-
async with httpx.AsyncClient() as client:
85-
metadata = await package_metadata(config, client, package)
86-
87-
# Parsing versions from non-wheel Python packages isn't perfectly
88-
# reliable, so just search through all releases until we find a
89-
# matching file.
90-
for file_ in itertools.chain.from_iterable(metadata['releases'].values()):
91-
if file_['filename'] == filename:
92-
url = urllib.parse.urljoin(config.pypi_url, file_['url'])
93-
break
94-
else:
95-
raise CannotFindFileError(package, filename)
147+
filename_to_url = await config.repo.files_for_package(package)
148+
try:
149+
url = filename_to_url[filename]
150+
except KeyError:
151+
raise CannotFindFileError(package, filename)
96152

97-
await aiofiles.os.makedirs(os.path.dirname(stored_path), exist_ok=True)
153+
await aiofiles.os.makedirs(os.path.dirname(stored_path), exist_ok=True)
98154

155+
async with httpx.AsyncClient() as client:
99156
async with _atomic_file(stored_path) as f:
100157
async with client.stream('GET', url) as resp:
101158
resp.raise_for_status()

0 commit comments

Comments
 (0)