Skip to content

Commit

Permalink
Merge pull request #340 from HebaruSan/fix/download-counts-ia-item-fmt
Browse files Browse the repository at this point in the history
Handle weird version strings in download counter for archive.org
  • Loading branch information
HebaruSan authored Sep 15, 2024
2 parents 54940df + 6eb8604 commit da7fe3b
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 19 deletions.
5 changes: 1 addition & 4 deletions netkan/netkan/download_counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,11 +190,8 @@ def empty(self) -> bool:
def full(self) -> bool:
    """Return True once the number of queued ids reaches MODULES_PER_REQUEST."""
    queued = len(self.ids)
    return queued >= self.MODULES_PER_REQUEST

def _get_ia_ident(self, ckan: Ckan) -> str:
    """Return '<identifier>-<version>' for *ckan*.

    Every colon in the version string is replaced with a hyphen before the
    two parts are joined. Judging by the name, the result is meant to be an
    Internet Archive identifier.
    """
    module_name = ckan.identifier
    version_text = ckan.version.string.replace(":", "-")
    return f'{module_name}-{version_text}'

def add(self, ckan: Ckan) -> None:
    """Register *ckan* in ``self.ids`` under its identifier.

    The stored value is whatever ``ckan.mirror_item()`` returns.

    Args:
        ckan: Module metadata providing ``identifier`` and ``mirror_item()``.
    """
    # Store once. A preceding store built from _get_ia_ident() was
    # overwritten immediately by this one, so it had no effect beyond
    # tying this method to the colon-only version formatting.
    self.ids[ckan.identifier] = ckan.mirror_item()

def get_result(self, counts: Optional[Dict[str, int]] = None) -> Dict[str, int]:
if counts is None:
Expand Down
15 changes: 15 additions & 0 deletions netkan/netkan/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ class Ckan:

# Matches one or more digits followed by a colon at the very start of a
# string (the epoch prefix of a version, going by the name).
EPOCH_VERSION_REGEXP = re.compile('^[0-9]+:')

# Matches text to strip when building an archive bucket name: a leading run
# of non-alphanumeric characters, or any single character that is not a
# letter, a digit, '.', '_' or '-'.
BUCKET_EXCLUDE_PATTERN = re.compile(r'^[^a-zA-Z0-9]+|[^a-zA-Z0-9._-]')

REDISTRIBUTABLE_LICENSES = {
"public-domain",
"Apache", "Apache-1.0", "Apache-2.0",
Expand Down Expand Up @@ -443,6 +445,19 @@ def mirror_download(self, with_epoch: bool = True) -> Optional[str]:
return f'https://archive.org/download/{self.identifier}-{self._format_version(with_epoch)}/{filename}'
return None

def mirror_item(self, with_epoch: bool = True) -> str:
    """Return the sanitized '<identifier>-<version>' item name.

    ``with_epoch`` is passed straight through to ``_format_version``; the
    combined string is then cleaned by ``_ia_bucket_sanitize``.
    """
    module_name = self.identifier
    version_text = self._format_version(with_epoch)
    return self._ia_bucket_sanitize(f'{module_name}-{version_text}')

# The Internet Archive asks that bucket names be valid archive identifiers
# and suggests matching this regular expression:
#   ^[a-zA-Z0-9][a-zA-Z0-9_.-]{4,100}$
# Everything except the minimum length is enforced here.
@classmethod
def _ia_bucket_sanitize(cls, s: str) -> str:
    """Delete everything BUCKET_EXCLUDE_PATTERN matches, then keep at most 100 characters."""
    cleaned = cls.BUCKET_EXCLUDE_PATTERN.sub('', s)
    return cleaned[:100]

def _format_version(self, with_epoch: bool) -> Optional[str]:
if self.version:
if with_epoch:
Expand Down
15 changes: 0 additions & 15 deletions netkan/netkan/mirrorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ class CkanMirror(Ckan):
DESCRIPTION_TEMPLATE = Template(
legacy_read_text('netkan', 'mirror_description_template.jinja2'))

# Matches text to strip when building an archive bucket name: a leading run
# of non-alphanumeric characters, or any single character that is not a
# letter, a digit, '.', '_' or '-'.
BUCKET_EXCLUDE_PATTERN = re.compile(r'^[^a-zA-Z0-9]+|[^a-zA-Z0-9._-]')

LICENSE_URLS = {
"Apache" : 'http://www.apache.org/licenses/LICENSE-1.0',
"Apache-1.0" : 'http://www.apache.org/licenses/LICENSE-1.0',
Expand Down Expand Up @@ -133,26 +131,13 @@ def license_urls(self) -> List[str]:
return [self.LICENSE_URLS[lic]
for lic in self.licenses() if lic in self.LICENSE_URLS]

def mirror_item(self, with_epoch: bool = True) -> str:
    """Build the item name '<identifier>-<version>' and sanitize it.

    The version part comes from ``_format_version(with_epoch)`` and the
    joined string is returned after ``_ia_bucket_sanitize`` has cleaned it.
    """
    ident = self.identifier
    formatted = self._format_version(with_epoch)
    unsanitized = f'{ident}-{formatted}'
    return self._ia_bucket_sanitize(unsanitized)

def mirror_source_filename(self, with_epoch: bool = True) -> str:
    """Return the sanitized '<identifier>-<version>.source.zip' file name.

    ``with_epoch`` is forwarded to ``_format_version``; the assembled name
    is cleaned by ``_ia_bucket_sanitize`` before being returned.
    """
    module_name = self.identifier
    version_text = self._format_version(with_epoch)
    archive_name = f'{module_name}-{version_text}.source.zip'
    return self._ia_bucket_sanitize(archive_name)

def mirror_title(self, with_epoch: bool = True) -> str:
    """Return '<name> - <version>', with the version from ``_format_version``."""
    display_name = self.name
    version_text = self._format_version(with_epoch)
    return f'{display_name} - {version_text}'

# Per the Internet Archive, bucket names should be valid archive
# identifiers; the pattern it suggests is:
#   ^[a-zA-Z0-9][a-zA-Z0-9_.-]{4,100}$
# All of that is enforced here apart from the minimum length.
@classmethod
def _ia_bucket_sanitize(cls, s: str) -> str:
    """Remove whatever BUCKET_EXCLUDE_PATTERN matches and truncate to 100 characters."""
    stripped = cls.BUCKET_EXCLUDE_PATTERN.sub('', s)
    return stripped[:100]

@property
def item_metadata(self) -> Dict[str, Any]:
lic_urls = self.license_urls()
Expand Down

0 comments on commit da7fe3b

Please sign in to comment.