crawl_metadata.py
import argparse
import os
import time

import pandas as pd
from github import Github
from github.GithubException import RateLimitExceededException

from utils import wrap_query, catch_rate_limit, collect, safe_load_repo, get_access_token

@wrap_query
def query_metadata(row: pd.Series, id_key: str, g: Github):
    """Gets archival status, creation date, wiki existence and page existence for a repository.

    Args:
        row (pd.Series): contains column with repository ID
        id_key (str): name of column containing repository ID
        g (Github): authenticated access to GitHub API

    Returns:
        pd.Series: added columns ['archived', 'created_at', 'has_wiki', 'has_pages']
    """
    data = {k: [] for k in ['archived', 'created_at', 'has_wiki', 'has_pages']}
    repo = safe_load_repo(g, row[id_key], "query_metadata")
    if repo is None:
        return None
    # Query the repository metadata, retrying once if the API rate limit is hit.
    for tries in range(2):
        try:
            data['archived'].append(repo.archived)
            data['created_at'].append(repo.created_at)
            data['has_wiki'].append(repo.has_wiki)
            data['has_pages'].append(repo.has_pages)
        except RateLimitExceededException:
            if tries == 0:
                # Wait for the rate limit to reset, then retry once.
                catch_rate_limit(g)
                continue
            else:
                raise
        break
    for k, v in data.items():
        row[k] = v
    return row

def crawl_repos(df, name, target_folder, verbose):
    """For each repository, retrieve metadata and store as CSV.

    Args:
        df (pd.DataFrame): dataset containing GitHub repository identifiers
        name (str): name of column containing the identifiers
        target_folder (str): path to folder to store CSV data in
        verbose (bool): toggles verbose output
    """
    # Deduplicate the repository identifiers before querying.
    repo_links = df[[name]]
    repo_links = repo_links.drop_duplicates()
    g = Github(get_access_token())
    if verbose:
        print(g.rate_limiting)
        print("Querying metadata...")
    start = time.time()
    # Apply query_metadata to every repository and write the results to metadata.csv.
    collect(g, repo_links, name, query_metadata,
            [],
            os.path.join(target_folder, 'metadata.csv'))
    if verbose:
        end = time.time()
        print(f"Done - {end-start:.2f} seconds.")

def main(path, name, datadir, verbose):
    df = pd.read_csv(path)
    target_folder = datadir
    crawl_repos(df, name, target_folder, verbose)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="crawl",
        description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository."
    )
    parser.add_argument("-f", "--file", required=True, type=str, help="CSV file")
    parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing the GitHub ID")
    parser.add_argument("--datadir", default="../../data/raw/github/", help="directory to write GitHub data to")
    parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
    args = parser.parse_args()
    main(args.file, args.name, args.datadir, args.verbose)
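
# Example invocation (a sketch: the CSV filename and column name below are hypothetical
# placeholders, not taken from the source; the column passed via -n must hold the GitHub
# repository IDs expected by safe_load_repo):
#
#   python crawl_metadata.py -f repos.csv -n repo_id --datadir ../../data/raw/github/ -v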