Skip to content

Commit 87137d8

Browse files
authored
Merge pull request #779 from gerrod3/python-metadata-fix-script
Fix requires_python metadata + add repair metadata command
2 parents af35d18 + ac33f4b commit 87137d8

File tree

10 files changed

+261
-35
lines changed

10 files changed

+261
-35
lines changed

CHANGES/773.bugfix

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fixed `requires_python` field not being properly set on package upload.
2+
3+
Run the new `pulpcore-manager repair-python-metadata` command with repositories containing affected
4+
packages to repair their metadata.

pulp_python/app/management/__init__.py

Whitespace-only changes.

pulp_python/app/management/commands/__init__.py

Whitespace-only changes.
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
import re
2+
import os
3+
from django.core.management import BaseCommand, CommandError
4+
from gettext import gettext as _
5+
6+
from django.conf import settings
7+
8+
from pulpcore.plugin.util import extract_pk
9+
from pulp_python.app.models import PythonPackageContent, PythonRepository
10+
from pulp_python.app.utils import artifact_to_python_content_data
11+
12+
13+
def repair_metadata(content):
14+
"""
15+
Repairs the metadata for the passed in content queryset.
16+
:param content: The PythonPackageContent queryset.
17+
Return: number of content units that were repaired
18+
"""
19+
# TODO: Add on_demand content repair?
20+
os.chdir(settings.WORKING_DIRECTORY)
21+
content = content.select_related("pulp_domain")
22+
immediate_content = content.filter(contentartifact__artifact__isnull=False)
23+
batch = []
24+
set_of_update_fields = set()
25+
total_repaired = 0
26+
for package in immediate_content.prefetch_related('_artifacts').iterator(chunk_size=1000):
27+
new_data = artifact_to_python_content_data(
28+
package.filename, package._artifacts.get(), package.pulp_domain
29+
)
30+
changed = False
31+
for field, value in new_data.items():
32+
if getattr(package, field) != value:
33+
setattr(package, field, value)
34+
set_of_update_fields.add(field)
35+
changed = True
36+
if changed:
37+
batch.append(package)
38+
if len(batch) == 1000:
39+
total_repaired += len(batch)
40+
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
41+
batch = []
42+
set_of_update_fields.clear()
43+
44+
if len(batch) > 0:
45+
total_repaired += len(batch)
46+
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
47+
48+
return total_repaired
49+
50+
51+
def href_prn_list_handler(value):
52+
"""Common list parsing for a string of hrefs/prns."""
53+
r = re.compile(
54+
rf"""
55+
(?:{settings.API_ROOT}(?:[-_a-zA-Z0-9]+/)?api/v3/repositories/python/python/[-a-f0-9]+/)
56+
|(?:prn:python\.pythonrepository:[-a-f0-9]+)
57+
""",
58+
re.VERBOSE
59+
)
60+
values = []
61+
for v in value.split(","):
62+
if v:
63+
if match := r.match(v.strip()):
64+
values.append(match.group(0))
65+
else:
66+
raise CommandError(f"Invalid href/prn: {v}")
67+
return values
68+
69+
70+
class Command(BaseCommand):
71+
"""
72+
Management command to repair metadata of PythonPackageContent.
73+
"""
74+
75+
help = _("Repair the metadata of PythonPackageContent stored in PythonRepositories")
76+
77+
def add_arguments(self, parser):
78+
"""Set up arguments."""
79+
parser.add_argument(
80+
"--repositories",
81+
type=href_prn_list_handler,
82+
required=False,
83+
help=_(
84+
"List of PythonRepository hrefs/prns whose content's metadata will be repaired. "
85+
"Leave blank to include all repositories in all domains. Mutually exclusive "
86+
"with domain."
87+
),
88+
)
89+
parser.add_argument(
90+
"--domain",
91+
default=None,
92+
required=False,
93+
help=_(
94+
"The pulp domain to gather the repositories from if specified. Mutually"
95+
" exclusive with repositories."
96+
),
97+
)
98+
99+
def handle(self, *args, **options):
100+
"""Implement the command."""
101+
domain = options.get("domain")
102+
repository_hrefs = options.get("repositories")
103+
if domain and repository_hrefs:
104+
raise CommandError(_("--domain and --repositories are mutually exclusive"))
105+
106+
repositories = PythonRepository.objects.all()
107+
if repository_hrefs:
108+
repos_ids = [extract_pk(r) for r in repository_hrefs]
109+
repositories = repositories.filter(pk__in=repos_ids)
110+
elif domain:
111+
repositories = repositories.filter(pulp_domain__name=domain)
112+
113+
content_set = set()
114+
for repository in repositories:
115+
content_set.update(repository.latest_version().content.values_list("pk", flat=True))
116+
content = PythonPackageContent.objects.filter(pk__in=content_set)
117+
num_repaired = repair_metadata(content)
118+
print(f"{len(content_set)} packages processed, {num_repaired} package metadata repaired.")

pulp_python/app/models.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,8 @@
1717

1818
from pathlib import PurePath
1919
from .utils import (
20+
artifact_to_python_content_data,
2021
canonicalize_name,
21-
get_project_metadata_from_artifact,
22-
parse_project_metadata,
2322
python_content_to_json,
2423
PYPI_LAST_SERIAL,
2524
PYPI_SERIAL_CONSTANT,
@@ -189,14 +188,7 @@ class PythonPackageContent(Content):
189188
def init_from_artifact_and_relative_path(artifact, relative_path):
190189
"""Used when downloading package from pull-through cache."""
191190
path = PurePath(relative_path)
192-
metadata = get_project_metadata_from_artifact(path.name, artifact)
193-
data = parse_project_metadata(vars(metadata))
194-
data["packagetype"] = metadata.packagetype
195-
data["version"] = metadata.version
196-
data["filename"] = path.name
197-
data["sha256"] = artifact.sha256
198-
data["pulp_domain_id"] = artifact.pulp_domain_id
199-
data["_pulp_domain_id"] = artifact.pulp_domain_id
191+
data = artifact_to_python_content_data(path.name, artifact, domain=get_domain())
200192
return PythonPackageContent(**data)
201193

202194
def __str__(self):

pulp_python/app/serializers.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from pulp_python.app import models as python_models
1111
from pulp_python.app import fields
12-
from pulp_python.app.utils import get_project_metadata_from_artifact, parse_project_metadata
12+
from pulp_python.app.utils import artifact_to_python_content_data
1313

1414

1515
class PythonRepositorySerializer(core_serializers.RepositorySerializer):
@@ -217,7 +217,7 @@ def deferred_validate(self, data):
217217

218218
artifact = data["artifact"]
219219
try:
220-
metadata = get_project_metadata_from_artifact(filename, artifact)
220+
_data = artifact_to_python_content_data(filename, artifact, domain=get_domain())
221221
except ValueError:
222222
raise serializers.ValidationError(_(
223223
"Extension on {} is not a valid python extension "
@@ -231,14 +231,6 @@ def deferred_validate(self, data):
231231
)}
232232
)
233233

234-
_data = parse_project_metadata(vars(metadata))
235-
_data['packagetype'] = metadata.packagetype
236-
_data['version'] = metadata.version
237-
_data['filename'] = filename
238-
_data['sha256'] = artifact.sha256
239-
data["pulp_domain_id"] = artifact.pulp_domain_id
240-
data["_pulp_domain_id"] = artifact.pulp_domain_id
241-
242234
data.update(_data)
243235

244236
return data

pulp_python/app/tasks/upload.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pulpcore.plugin.util import get_domain
88

99
from pulp_python.app.models import PythonPackageContent, PythonRepository
10-
from pulp_python.app.utils import get_project_metadata_from_artifact, parse_project_metadata
10+
from pulp_python.app.utils import artifact_to_python_content_data
1111

1212

1313
def upload(artifact_sha256, filename, repository_pk=None):
@@ -76,15 +76,7 @@ def create_content(artifact_sha256, filename, domain):
7676
queryset of the new created content
7777
"""
7878
artifact = Artifact.objects.get(sha256=artifact_sha256, pulp_domain=domain)
79-
metadata = get_project_metadata_from_artifact(filename, artifact)
80-
81-
data = parse_project_metadata(vars(metadata))
82-
data['packagetype'] = metadata.packagetype
83-
data['version'] = metadata.version
84-
data['filename'] = filename
85-
data['sha256'] = artifact.sha256
86-
data['pulp_domain'] = domain
87-
data['_pulp_domain'] = domain
79+
data = artifact_to_python_content_data(filename, artifact, domain)
8880

8981
@transaction.atomic()
9082
def create():

pulp_python/app/utils.py

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import pkginfo
2+
import re
23
import shutil
34
import tempfile
45
import json
@@ -51,6 +52,20 @@
5152
".zip": "sdist",
5253
}
5354

55+
DIST_REGEXES = {
56+
# regex from https://github.com/pypa/pip/blob/18.0/src/pip/_internal/wheel.py#L569
57+
".whl": re.compile(
58+
r"""^(?P<name>.+?)-(?P<version>.*?)
59+
((-(?P<build>\d[^-]*?))?-(?P<pyver>.+?)-(?P<abi>.+?)-(?P<plat>.+?)
60+
\.whl|\.dist-info)$""",
61+
re.VERBOSE
62+
),
63+
# regex based on https://setuptools.pypa.io/en/latest/deprecated/python_eggs.html#filename-embedded-metadata # noqa: E501
64+
".egg": re.compile(r"^(?P<name>.+?)-(?P<version>.*?)(-(?P<pyver>.+?(-(?P<plat>.+?))?))?\.egg|\.egg-info$"), # noqa: E501
65+
# regex based on https://github.com/python/cpython/blob/v3.7.0/Lib/distutils/command/bdist_wininst.py#L292 # noqa: E501
66+
".exe": re.compile(r"^(?P<name>.+?)-(?P<version>.*?)\.(?P<plat>.+?)(-(?P<pyver>.+?))?\.exe$"),
67+
}
68+
5469
DIST_TYPES = {
5570
"bdist_wheel": pkginfo.Wheel,
5671
"bdist_wininst": pkginfo.Distribution,
@@ -72,6 +87,8 @@ def parse_project_metadata(project):
7287
"""
7388
package = {}
7489
package['name'] = project.get('name') or ""
90+
package['version'] = project.get('version') or ""
91+
package['packagetype'] = project.get('packagetype') or ""
7592
package['metadata_version'] = project.get('metadata_version') or ""
7693
package['summary'] = project.get('summary') or ""
7794
package['description'] = project.get('description') or ""
@@ -86,13 +103,15 @@ def parse_project_metadata(project):
86103
package['project_url'] = project.get('project_url') or ""
87104
package['platform'] = project.get('platform') or ""
88105
package['supported_platform'] = project.get('supported_platform') or ""
106+
package['requires_python'] = project.get('requires_python') or ""
89107
package['requires_dist'] = json.dumps(project.get('requires_dist', []))
90108
package['provides_dist'] = json.dumps(project.get('provides_dist', []))
91109
package['obsoletes_dist'] = json.dumps(project.get('obsoletes_dist', []))
92110
package['requires_external'] = json.dumps(project.get('requires_external', []))
93111
package['classifiers'] = json.dumps(project.get('classifiers', []))
94112
package['project_urls'] = json.dumps(project.get('project_urls', {}))
95113
package['description_content_type'] = project.get('description_content_type') or ""
114+
package['python_version'] = project.get('python_version') or ""
96115

97116
return package
98117

@@ -113,17 +132,15 @@ def parse_metadata(project, version, distribution):
113132
dictionary: of useful python metadata
114133
115134
"""
116-
package = {}
135+
package = parse_project_metadata(project)
117136

118137
package['filename'] = distribution.get('filename') or ""
119138
package['packagetype'] = distribution.get('packagetype') or ""
120139
package['version'] = version
121140
package['url'] = distribution.get('url') or ""
122141
package['sha256'] = distribution.get('digests', {}).get('sha256') or ""
123-
package['python_version'] = distribution.get('python_version') or ""
124-
package['requires_python'] = distribution.get('requires_python') or ""
125-
126-
package.update(parse_project_metadata(project))
142+
package['python_version'] = distribution.get('python_version') or package.get('python_version')
143+
package['requires_python'] = distribution.get('requires_python') or package.get('requires_python') # noqa: E501
127144

128145
return package
129146

@@ -147,9 +164,30 @@ def get_project_metadata_from_artifact(filename, artifact):
147164
temp_file.flush()
148165
metadata = DIST_TYPES[packagetype](temp_file.name)
149166
metadata.packagetype = packagetype
167+
if packagetype == "sdist":
168+
metadata.python_version = "source"
169+
else:
170+
pyver = ""
171+
regex = DIST_REGEXES[extensions[pkg_type_index]]
172+
if bdist_name := regex.match(filename):
173+
pyver = bdist_name.group("pyver") or ""
174+
metadata.python_version = pyver
150175
return metadata
151176

152177

178+
def artifact_to_python_content_data(filename, artifact, domain=None):
179+
"""
180+
Takes the artifact/filename and returns the metadata needed to create a PythonPackageContent.
181+
"""
182+
metadata = get_project_metadata_from_artifact(filename, artifact)
183+
data = parse_project_metadata(vars(metadata))
184+
data['sha256'] = artifact.sha256
185+
data['filename'] = filename
186+
data['pulp_domain'] = domain or artifact.pulp_domain
187+
data['_pulp_domain'] = data['pulp_domain']
188+
return data
189+
190+
153191
def python_content_to_json(base_path, content_query, version=None, domain=None):
154192
"""
155193
Converts a QuerySet of PythonPackageContent into the PyPi JSON format

pulp_python/tests/functional/api/test_crud_content_unit.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,3 +122,15 @@ def test_upload_metadata_23_spec(python_content_factory):
122122
content = python_content_factory(filename, url=package.url)
123123
assert content.metadata_version == "2.3"
124124
break
125+
126+
127+
@pytest.mark.parallel
128+
def test_upload_requires_python(python_content_factory):
129+
filename = "pip-24.3.1-py3-none-any.whl"
130+
with PyPISimple() as client:
131+
page = client.get_project_page("pip")
132+
for package in page.packages:
133+
if package.filename == filename:
134+
content = python_content_factory(filename, url=package.url)
135+
assert content.requires_python == ">=3.8"
136+
break

0 commit comments

Comments
 (0)