crawl_contents.py
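"""Crawl GitHub repositories listed in a CSV file.

For each repository, collect content metadata (license, README, CONTRIBUTING.md,
dates when CITATION.cff and CONTRIBUTING.md were first added) and the commit
history of README headings and citation lines, writing the results to CSV files.
"""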
import argparse
import pandas as pd
import os
import time
import resource
from github import Github
from github.GithubException import RateLimitExceededException, UnknownObjectException
from emoji import emoji_count
from pydriller import Repository
from utils import wrap_query, catch_rate_limit, collect, safe_load_repo, get_access_token


@wrap_query
def query_readme_history(row: pd.Series, id_key: str, *args, **kwargs):
"""Goes through commit history of README file.
Args:
row (pd.Series): contains column with repository ID and path to its README file
id_key (str): name of column containing repository ID
Returns:
pd.Series: added columns ['author_date', 'added_headings', 'deleted_headings', 'added_cites']
"""
repo_link = row[id_key]
readme_path = row['readme_path']
if pd.isna(readme_path) or not readme_path.endswith('md'):
return None
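    # pydriller Repository restricted to commits that modified the README file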
repo_readme = Repository('https://github.com/'+repo_link, filepath=readme_path)
history = {k: [] for k in ['author_date', 'added_headings', 'deleted_headings', 'added_cites']}
for commit in repo_readme.traverse_commits():
try:
added_headings = []
deleted_headings = []
added_cites = []
# look for changes to readme file in commit
for f in commit.modified_files:
if f.new_path == readme_path:
for _, line in f.diff_parsed['added']:
if line.startswith('#'):
added_headings.append(line.lstrip('# '))
else:
for indicator in ["DOI:", "doi.", "@article", "@misc"]:
                                if indicator in line:
added_cites.append(line)
for _, line in f.diff_parsed['deleted']:
if line.startswith('#'):
deleted_headings.append(line.lstrip('# '))
break
if len(added_headings) > 0 or len(deleted_headings) > 0 or len(added_cites) > 0:
history['author_date'].append(commit.author_date)
history['added_headings'].append(added_headings)
history['deleted_headings'].append(deleted_headings)
history['added_cites'].append(added_cites)
        except ValueError:  # git can raise this when commits are missing from the history
pass
for k, v in history.items():
row[k] = v
return row


@wrap_query
def query_contents(row: pd.Series, id_key: str, g: Github):
"""Gets metadata about interesting files in a repository.
Args:
row (pd.Series): contains column with repository ID
id_key (str): name of column containing repository ID
g (Github): authenticated access to Github API
Returns:
pd.Series: added columns ['license', 'readme_size', 'readme_path', 'readme_emojis', 'contributing_size', 'citation_added', 'contributing_added']
"""
contents = {k: [] for k in ['license', 'readme_size', 'readme_path', 'readme_emojis', 'contributing_size', 'citation_added', 'contributing_added']}
repo = safe_load_repo(g, row[id_key], "query_contents")
if repo is None:
return None
for tries in range(2): # allow retry
try:
try: # LICENSE
license_file = repo.get_license()
contents['license'].append(license_file.license.key)
except UnknownObjectException:
contents['license'].append(None)
try: # README.md
readme = repo.get_readme()
contents['readme_size'].append(readme.size)
readme_content = readme.decoded_content.decode()
contents['readme_emojis'].append(emoji_count(readme_content))
contents['readme_path'].append(readme.path)
except UnknownObjectException:
contents['readme_size'].append(0)
contents['readme_emojis'].append(0)
contents['readme_path'].append(None)
try: # CONTRIBUTING
contents['contributing_size'].append(repo.get_contents("CONTRIBUTING.md").size)
except UnknownObjectException:
contents['contributing_size'].append(0)
        except RateLimitExceededException:
            if tries == 0:
                # wait out the rate limit, drop any partial results, then retry once
                catch_rate_limit(g)
                contents = {k: [] for k in contents}
                continue
            raise
        break  # no rate limit problem, no need to retry
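    # author date of the first commit touching CITATION.cff, i.e. when citation info was added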
repo_citation = Repository('https://github.com/'+row[id_key], filepath="CITATION.cff")
commits_iterator = repo_citation.traverse_commits()
try:
contents['citation_added'].append(next(commits_iterator).author_date)
except StopIteration:
contents['citation_added'].append(None)
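    # likewise for CONTRIBUTING.md: author date of the first commit touching it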
repo_contrib_guidelines = Repository('https://github.com/'+row[id_key], filepath="CONTRIBUTING.md")
contrib_guidelines_iterator = repo_contrib_guidelines.traverse_commits()
try:
contents['contributing_added'].append(next(contrib_guidelines_iterator).author_date)
except StopIteration:
contents['contributing_added'].append(None)
for k, v in contents.items():
row[k] = v
return row


def crawl_repos(df, name, target_folder, verbose):
"""For each repository, retrieve contents and readme info.
Args:
df (pd.DataFrame): dataset containing GitHub repository identifiers
name (str): name of column containing the identifiers
target_folder (str): path to folder to store CSV data in
verbose (bool): toggles verbose output
"""
repo_links = df[[name]]
repo_links = repo_links.drop_duplicates()
g = Github(get_access_token())
if verbose:
print(g.rate_limiting)
print("Querying contents...")
start = time.time()
collect(g, repo_links, name, query_contents,
[],
os.path.join(target_folder, 'contents.csv'))
if verbose:
end = time.time()
print(f"Done - {end-start:.2f} seconds.")
print("Querying readme history...")
start = time.time()
contents_df = pd.read_csv(os.path.join(target_folder, 'contents.csv'))
collect(g, contents_df[[name, 'readme_path']], name, query_readme_history,
[],
os.path.join(target_folder, 'readme_history.csv'))
if verbose:
end = time.time()
print(f"Done - {end-start:.2f} seconds.")


def main(path, name, datadir, verbose):
df = pd.read_csv(path)
target_folder = datadir
crawl_repos(df, name, target_folder, verbose)


if __name__ == "__main__":
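    # cap this process's address space at ~2 GB, keeping the existing hard limit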
soft, hard = resource.getrlimit(resource.RLIMIT_AS)
resource.setrlimit(resource.RLIMIT_AS, (2000000000, hard))
parser = argparse.ArgumentParser(
prog="crawl",
description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository."
)
parser.add_argument("-f", "--file", required=True, type=str, help="CSV file")
parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID")
parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to")
parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
args = parser.parse_args()
main(args.file, args.name, args.datadir, args.verbose)
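# Example invocation (the CSV path and column name below are hypothetical; pass
# whichever column holds the GitHub repository identifiers via --name):
#   python crawl_contents.py --file repos.csv --name github_repo_id --verbose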