oracle · brenns10 · Mar 29, 2024 · Mar 27, 2024 · Mar 27, 2024 · Mar 27, 2024
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -0,0 +1,20 @@
+name: Lint
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  commit-hooks:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install pre-commit and mypy
+        run: pip install pre-commit && make dev
+      - name: Run pre-commit hooks
+        run: pre-commit run --all-files --show-diff-on-failure
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -0,0 +1,51 @@
+name: Publish
+
+on:
+  push:
+    branches:
+      - main
+  schedule:
+    # Schedule: 5:30PM Monday-Friday UTC, which is roughly
+    # 10:30 AM Pacific (give or take some DST) on weekdays.
+    - cron: '30 17 * * MON-FRI'
+
+permissions:
+  contents: write
+
+jobs:
+  publish:
+    runs-on: ubuntu-22.04  # don't use "-latest", we can manually update
+    env:
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      GNUPGHOME: /tmp  # avoid "gpg: Fatal: can't create directory '/home/runner/.gnupg': File exists"
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install gzip bzip2 xz-utils zstd tar rpm cpio dpkg make
+          make venv
+      - name: Checkout gh-pages and setup git
+        run: |
+          git fetch origin gh-pages --depth=1
+          git worktree add ../gh-pages gh-pages
+          git config --global user.name 'Github Actions'
+          git config --global user.email 'noreply@example.com'
+      - name: Fetch updates and build page
+        run: |
+          venv/bin/python -m kconfigs.main config.ini \
+              --state ../gh-pages/state.json \
+              --output-dir ../gh-pages/out
+          venv/bin/python -m kconfigs.analyzer config.ini \
+              --input-dir ../gh-pages/out \
+              --output-file ../gh-pages/docs/summary.json
+          cp index.html tux-sm.png ../gh-pages/docs/
+      - name: Push update
+        run: |
+          cd ../gh-pages
+          git add .
+          git diff --cached --quiet || git commit -m "Automatic update"  # skip the commit when nothing is staged, so a no-change run does not fail
+          git push origin gh-pages
diff --git a/kconfigs/analyzer.py b/kconfigs/analyzer.py
@@ -57,7 +57,9 @@ def main() -> None:
 
     cfg = ConfigParser()
     cfg.read(args.config)
-    distros = [DistroConfig(**dict(cfg[sec])) for sec in cfg.sections()]
+    distros = [
+        DistroConfig(**dict(cfg[sec])) for sec in cfg.sections()  # type: ignore
+    ]
 
     kconfigs = {}
     kconfig_keys: set[str] = set()

diff --git a/kconfigs/main.py b/kconfigs/main.py
@@ -4,6 +4,7 @@
 import asyncio
 import configparser
 import json
+import multiprocessing
 import posixpath
 import shutil
 from pathlib import Path
@@ -16,6 +17,11 @@
 from kconfigs.util import download_manager
 
 
+# Extraction is CPU-bound, and it also consumes quite a bit of disk space.
+# Limit the number of tasks which can download and extract in parallel.
+extract_sem = asyncio.Semaphore(multiprocessing.cpu_count() + 1)
+
+
 class FetcherFactory:
     def __init__(self, state: dict[str, Any], workdir: Path):
         self.registry: dict[tuple[str, str], Fetcher] = {}
@@ -63,23 +69,27 @@ async def run_for_distro(
         workdir.mkdir(parents=True)
         latest_url, maybe_csum = await fetcher.latest_version_url(d.package)
         if latest_url != previous_url:
-            name = posixpath.basename(latest_url)
-            file = workdir / name
-            await download_file(latest_url, file, checksum=maybe_csum)
-
-            extractor = Extractor.get(d.extractor)
-
-            maybe_sig = await fetcher.signature_url(d.package)
-            if maybe_sig:
-                signame = posixpath.basename(maybe_sig)
-                sigfile = workdir / signame
-                await download_file(maybe_sig, sigfile)
-                await extractor.verify_signature(file, sigfile, d)
-
-            print(f"Extract config of {d.unique_name}")
-            await extractor.extract_kconfig(file, out, d)
+            async with extract_sem:
+                name = posixpath.basename(latest_url)
+                file = workdir / name
+                await download_file(latest_url, file, checksum=maybe_csum)
+
+                extractor = Extractor.get(d.extractor)
+
+                maybe_sig = await fetcher.signature_url(d.package)
+                if maybe_sig:
+                    signame = posixpath.basename(maybe_sig)
+                    sigfile = workdir / signame
+                    await download_file(maybe_sig, sigfile)
+                    await extractor.verify_signature(file, sigfile, d)
+
+                print(f"Extract config of {d.unique_name}")
+                await extractor.extract_kconfig(file, out, d)
     else:
         latest_url = previous_url
+    if workdir.exists():
+        # Clear the distro's work directory to conserve space
+        shutil.rmtree(workdir)
     return d, {"latest_url": latest_url}
 
 

diff --git a/kconfigs/util.py b/kconfigs/util.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2024, Oracle and/or its affiliates.
 # Licensed under the terms of the GNU General Public License.
+import asyncio
 import hashlib
 import io
 import posixpath
@@ -26,6 +27,8 @@
 
 
 class DownloadManager:
+    RETRIES = 3
+
     def __init__(self, max_downloads: int = 8):
         self.session = ClientSession(raise_for_status=True)
         self.sem = Semaphore(max_downloads)
@@ -49,19 +52,38 @@ async def download_file(
             # Prevents duplicate work during development
             print(f"Skip download {file}")
             return
-        async with self.sem:
-            try:
-                print(f"Download {url} to {file}")
-                async with self.session.get(url) as resp, aiofiles.open(
-                    file, "wb"
-                ) as out:
-                    async for chunk in resp.content.iter_chunked(4096):
-                        if checksum:
-                            h.update(chunk)
-                        await out.write(chunk)
-            except BaseException:
-                file.unlink(missing_ok=True)
-                raise
+        errors = []
+        for i in range(self.RETRIES):
+            async with self.sem:
+                try:
+                    print(
+                        f"Download {url} to {file} [try {i+1}/{self.RETRIES}]"
+                    )
+                    async with self.session.get(url) as resp, aiofiles.open(
+                        file, "wb"
+                    ) as out:
+                        async for chunk in resp.content.iter_chunked(4096):
+                            if checksum:
+                                h.update(chunk)
+                            await out.write(chunk)
+                    break
+                except ClientResponseError as err:
+                    file.unlink(missing_ok=True)
+                    if err.status == 404:
+                        # retrying won't help, raise
+                        raise
+                    # otherwise, wait a second and retry
+                    errors.append(err)
+                except BaseException:
+                    file.unlink(missing_ok=True)
+                    raise
+            await asyncio.sleep(1)
+        else:
+            # loop finished without a break: every attempt failed
+            raise Exception(
+                f"Failed to download {url} after {self.RETRIES} retries: "
+                f"{errors}"
+            )
         if checksum:
             digest = h.hexdigest()
             if digest != checksum[1]:
@@ -76,15 +98,30 @@ async def download_file(
     async def download_file_mem(
         self, url: str, checksum: tuple[str, str] | None = None
     ) -> bytes:
-        out = io.BytesIO()
         if checksum:
             h = hashlib.new(checksum[0])
-        async with self.sem, self.session.get(url) as resp:
-            print(f"Download {url} to mem")
-            async for chunk in resp.content.iter_chunked(4096):
-                if checksum:
-                    h.update(chunk)
-                out.write(chunk)
+        errors = []
+        for i in range(self.RETRIES):
+            out = io.BytesIO()
+            try:
+                async with self.sem, self.session.get(url) as resp:
+                    print(f"Download {url} to mem [try {i+1}/{self.RETRIES}]")
+                    async for chunk in resp.content.iter_chunked(4096):
+                        if checksum:
+                            h.update(chunk)
+                        out.write(chunk)
+                break
+            except ClientResponseError as err:
+                if err.status == 404:
+                    raise
+                errors.append(err)
+            await asyncio.sleep(1)
+        else:
+            # loop finished without a break: every attempt failed
+            raise Exception(
+                f"Failed to download {url} after {self.RETRIES} retries: "
+                f"{errors}"
+            )
         if checksum:
             digest = h.hexdigest()
             if digest != checksum[1]: