25
25
# logger.setLevel(logging.DEBUG)
26
26
# logger.addHandler(logging.StreamHandler())
27
27
28
+ MAX_WORKERS = 16
29
+ GOOGLE_ASSURED_OSS_PACKAGES = set ()
28
30
29
31
@contextlib .contextmanager
30
32
def locked_db ():
@@ -132,7 +134,7 @@ def get_metadata_by_install(package, resp):
132
134
133
135
def get_maintainers_from_pypi (package : str ):
134
136
for _ in range (5 ):
135
- resp = http .request ("GET" , f"https://pypi.org/project/{ package } " )
137
+ resp = http .request ("GET" , f"https://pypi.org/project/{ package } / " )
136
138
if resp .status == 404 :
137
139
return set ()
138
140
elif resp .status != 200 :
@@ -200,7 +202,7 @@ def get_project_urls(info: dict) -> list[tuple[str, str, str]]:
200
202
201
203
202
204
def update_data_for_package (package : str ) -> None :
203
- global downloads , db_lock
205
+ global downloads , db_lock , GOOGLE_ASSURED_OSS_PACKAGES
204
206
205
207
resp = http .request ("GET" , f"https://pypi.org/pypi/{ package } /json" )
206
208
@@ -290,8 +292,8 @@ def update_data_for_package(package: str) -> None:
290
292
db .execute (
291
293
"""
292
294
INSERT OR IGNORE INTO packages (
293
- name, version, requires_python, has_binary_wheel, uploaded_at, downloads, scorecard_overall
294
- ) VALUES (?, ?, ?, ?, ?, ?, ?);
295
+ name, version, requires_python, has_binary_wheel, uploaded_at, downloads, scorecard_overall, in_google_assured_oss
296
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ? );
295
297
""" ,
296
298
(
297
299
package ,
@@ -301,6 +303,7 @@ def update_data_for_package(package: str) -> None:
301
303
uploaded_at ,
302
304
package_downloads ,
303
305
scorecard_overall ,
306
+ package .lower () in GOOGLE_ASSURED_OSS_PACKAGES
304
307
),
305
308
)
306
309
@@ -420,9 +423,23 @@ def update_data_from_pypi():
420
423
pass
421
424
422
425
426
def get_google_assured_oss_packages(http: "urllib3.PoolManager") -> set[str]:
    """Return the lower-cased Python package names listed by Google Assured OSS.

    Fetches the "supported packages" documentation page with *http*, finds
    the Python heading, and collects the text of every ``<li>`` in the first
    ``<ul>`` that follows it.

    Args:
        http: Object exposing ``request(method, url)`` whose response has a
            bytes ``data`` attribute (a ``urllib3.PoolManager`` in this file).

    Returns:
        A set of package names, stripped and lower-cased. The set is empty
        when the heading or the list cannot be found in the page.
    """
    resp = http.request("GET", "https://cloud.google.com/assured-open-source-software/docs/supported-packages")
    data = resp.data.decode("utf-8")

    # Start after the Python heading, then look for the first list.
    heading = data.find("<h2 id=\"python\"")
    if heading == -1:
        return set()
    start = data.find("<ul>", heading)
    if start == -1:
        return set()
    # Search for the closing tag from the list start, so a stray "</ul>"
    # between the heading and the list cannot produce an empty slice.
    end = data.find("</ul>", start)
    if end == -1:
        # Unterminated list: scan to the end of the page rather than letting
        # -1 silently act as a slice index.
        end = len(data)

    # Strip surrounding whitespace so names compare equal to package.lower().
    return {
        item.strip().lower()
        for item in re.findall(r"<li>([^<]+)</li>", data[start:end])
    }
435
+
436
+
423
437
if __name__ == "__main__" :
424
438
base_dir = os .path .dirname ((os .path .abspath (__file__ )))
425
439
http = urllib3 .PoolManager (
440
+ block = True ,
441
+ strict = True ,
442
+ maxsize = MAX_WORKERS ,
426
443
headers = urllib3 .util .make_headers (
427
444
keep_alive = True ,
428
445
accept_encoding = True ,
@@ -434,6 +451,8 @@ def update_data_from_pypi():
434
451
)
435
452
wheel_re = re .compile (r"-([^\-]+-[^\-]+-[^\-]+)\.whl$" )
436
453
454
+ GOOGLE_ASSURED_OSS_PACKAGES = get_google_assured_oss_packages (http )
455
+
437
456
tmp_dir = tempfile .mkdtemp ()
438
457
os .system (f"virtualenv { tmp_dir } /venv > /dev/null" )
439
458
venv_python = os .path .join (tmp_dir , "venv/bin/python" )
@@ -460,6 +479,7 @@ def update_data_from_pypi():
460
479
recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
461
480
downloads INTEGER,
462
481
scorecard_overall FLOAT,
482
+ in_google_assured_oss BOOLEAN,
463
483
PRIMARY KEY (name)
464
484
);
465
485
"""
@@ -536,7 +556,7 @@ def update_data_from_pypi():
536
556
)
537
557
_DB .commit ()
538
558
db_lock = threading .Lock ()
539
- pool = ThreadPoolExecutor ()
559
+ pool = ThreadPoolExecutor (max_workers = MAX_WORKERS )
540
560
541
561
packages = get_all_package_names ()
542
562
0 commit comments