Skip to content

Commit 3ca6e79

Browse files
authored
Improvement: Speed up public organization enumeration (#49)
1 parent a53e61a commit 3ca6e79

File tree

12 files changed

+280
-40
lines changed

12 files changed

+280
-40
lines changed

gato/enumerate/enumerate.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import logging
22

33
from gato.github import Api
4+
from gato.github import GqlQueries
45
from gato.models import Repository, Organization
56
from gato.cli import Output
67
from gato.enumerate.repository import RepositoryEnum
@@ -173,12 +174,22 @@ def enumerate_organization(self, org: str):
173174
f"the {organization.name} organization!"
174175
)
175176

177+
Output.info(f"Querying and caching workflow YAML files!")
178+
wf_queries = GqlQueries.get_workflow_ymls(enum_list)
179+
180+
for wf_query in wf_queries:
181+
result = self.org_e.api.call_post('/graphql', wf_query)
182+
# Sometimes we don't get a 200, fall back in this case.
183+
if result.status_code == 200:
184+
self.repo_e.construct_workflow_cache(result.json()['data']['nodes'])
185+
else:
186+
Output.warn("GraphQL query failed, will revert to REST workflow query for impacted repositories!")
176187
for repo in enum_list:
177-
178188
Output.tabbed(
179189
f"Enumerating: {Output.bright(repo.name)}!"
180190
)
181-
self.repo_e.enumerate_repository(repo)
191+
192+
self.repo_e.enumerate_repository(repo, large_org_enum=len(enum_list) > 100)
182193
self.repo_e.enumerate_repository_secrets(repo)
183194

184195
Recommender.print_repo_secrets(

gato/enumerate/recommender.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def print_repo_runner_info(repository: Repository):
140140
Output.result(
141141
f"The repository contains a workflow: "
142142
f"{Output.bright(repository.sh_workflow_names[0])} that "
143-
"executes on self-hosted runners!"
143+
"might execute on self-hosted runners!"
144144
)
145145

146146
if repository.accessible_runners:
@@ -157,6 +157,11 @@ def print_repo_runner_info(repository: Repository):
157157
f"{Output.bright(repository.accessible_runners[0].machine_name)}"
158158
)
159159

160+
for runner in repository.accessible_runners:
161+
if runner.non_ephemeral:
162+
Output.owned("The repository contains a non-ephemeral self-hosted runner!")
163+
break
164+
160165
if repository.runners:
161166
Output.result(
162167
f"The repository has {len(repository.runners)} repo-level"

gato/enumerate/repository.py

Lines changed: 52 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def __init__(self, api: Api, skip_log: bool, output_yaml):
2121
api (Api): GitHub API wraper object.
2222
"""
2323
self.api = api
24+
self.workflow_cache = {}
2425
self.skip_log = skip_log
2526
self.output_yaml = output_yaml
2627

@@ -40,11 +41,12 @@ def __perform_runlog_enumeration(self, repository: Repository):
4041
)
4142

4243
if wf_runs:
43-
runner = Runner(
44-
wf_runs[0]['runner_name'], wf_runs[0]['machine_name']
45-
)
44+
for wf_run in wf_runs:
45+
runner = Runner(
46+
wf_run['runner_name'], wf_run['machine_name'], non_ephemeral=wf_run['non_ephemeral']
47+
)
4648

47-
repository.add_accessible_runner(runner)
49+
repository.add_accessible_runner(runner)
4850
runner_detected = True
4951

5052
return runner_detected
@@ -60,12 +62,15 @@ def __perform_yml_enumeration(self, repository: Repository):
6062
list: List of workflows that execute on sh runner, empty otherwise.
6163
"""
6264
runner_wfs = []
63-
ymls = self.api.retrieve_workflow_ymls(repository.name)
65+
66+
if repository.name in self.workflow_cache:
67+
ymls = self.workflow_cache[repository.name]
68+
else:
69+
ymls = self.api.retrieve_workflow_ymls(repository.name)
6470

6571
for (wf, yml) in ymls:
6672
try:
6773
parsed_yml = WorkflowParser(yml, repository.name, wf)
68-
6974
self_hosted_jobs = parsed_yml.self_hosted()
7075

7176
if self_hosted_jobs:
@@ -79,12 +84,13 @@ def __perform_yml_enumeration(self, repository: Repository):
7984
# At this point we only know the extension, so handle and
8085
# ignore malformed yml files.
8186
except Exception as parse_error:
82-
print(parse_error)
87+
88+
print(f"{wf}: {str(parse_error)}")
8389
logger.warning("Attmpted to parse invalid yaml!")
8490

8591
return runner_wfs
8692

87-
def enumerate_repository(self, repository: Repository):
93+
def enumerate_repository(self, repository: Repository, large_org_enum=False):
8894
"""Enumerate a repository, and check everything relevant to
8995
self-hosted runner abuse that that the user has permissions to check.
9096
@@ -119,15 +125,25 @@ def enumerate_repository(self, repository: Repository):
119125

120126
repository.set_runners(repo_runners)
121127

122-
if not self.skip_log and self.__perform_runlog_enumeration(repository):
123-
runner_detected = True
124-
125128
workflows = self.__perform_yml_enumeration(repository)
126129

127130
if len(workflows) > 0:
128131
repository.add_self_hosted_workflows(workflows)
129132
runner_detected = True
130133

134+
if not self.skip_log:
135+
# If we are enumerating an organization, only enumerate runlogs if
136+
# the workflow suggests a sh_runner.
137+
if large_org_enum and runner_detected:
138+
self.__perform_runlog_enumeration(repository)
139+
140+
# If we are doing internal enum, get the logs, because coverage is
141+
# more important here and it's ok if it takes time.
142+
elif not repository.is_public() and self.__perform_runlog_enumeration(repository):
143+
runner_detected = True
144+
else:
145+
runner_detected = self.__perform_runlog_enumeration(repository)
146+
131147
if runner_detected:
132148
# Only display permissions (beyond having none) if runner is
133149
# detected.
@@ -158,3 +174,28 @@ def enumerate_repository_secrets(
158174

159175
if org_secrets:
160176
repository.set_accessible_org_secrets(org_secrets)
177+
178+
def construct_workflow_cache(self, yml_results):
179+
"""Creates a cache of workflow yml files retrieved from graphQL. Since
180+
graphql and REST do not have parity, we still need to use rest for most
181+
enumeration calls. This method saves off all yml files, so during org
182+
level enumeration if we perform yml enumeration the cached file is used
183+
instead of making github REST requests.
184+
185+
Args:
186+
yml_results (list): List of results from individual GraphQL queries
187+
(100 nodes at atime).)
188+
"""
189+
for result in yml_results:
190+
owner = result['nameWithOwner']
191+
192+
self.workflow_cache[owner] = list()
193+
194+
if not result['object']:
195+
continue
196+
197+
for yml_node in result['object']['entries']:
198+
yml_name = yml_node['name']
199+
if yml_name.lower().endswith('yml') or yml_name.lower().endswith('yaml'):
200+
contents = yml_node['object']['text']
201+
self.workflow_cache[owner].append((yml_name, contents))

gato/github/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
from .api import Api
2+
from .gql_queries import GqlQueries
23
from .search import Search

gato/github/api.py

Lines changed: 61 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66
import zipfile
77
import re
88
import io
9+
import json
910

1011
from gato.cli import Output
11-
from datetime import datetime, timezone
12+
from datetime import datetime, timezone, timedelta
1213

1314
logger = logging.getLogger(__name__)
1415

@@ -19,8 +20,9 @@ class Api():
1920
rate limiting or network issues.
2021
"""
2122

22-
RUNNER_RE = re.compile(r'Runner name: \'([\w+-]+)\'')
23-
MACHINE_RE = re.compile(r'Machine name: \'([\w+-]+)\'')
23+
RUNNER_RE = re.compile(r'Runner name: \'([\w+-.]+)\'')
24+
MACHINE_RE = re.compile(r'Machine name: \'([\w+-.]+)\'')
25+
RUN_THRESHOLD = 90
2426

2527
def __init__(self, pat: str, version: str = "2022-11-28",
2628
http_proxy: str = None, socks_proxy: str = None,
@@ -110,12 +112,29 @@ def __process_run_log(self, log_content: bytes, run_info: dict):
110112
Returns:
111113
dict: metadata about the run execution.
112114
"""
113-
with zipfile.ZipFile(io.BytesIO(log_content)) as runres:
115+
log_package = None
116+
non_ephemeral = False
114117

118+
with zipfile.ZipFile(io.BytesIO(log_content)) as runres:
115119
for zipinfo in runres.infolist():
120+
# TODO use a lambda for this messy logic
121+
if "Run actionscheckout" in zipinfo.filename:
122+
with runres.open(zipinfo) as run_setup:
123+
content = run_setup.read().decode()
124+
if "Cleaning the repository" in content:
125+
non_ephemeral = True
126+
127+
if log_package:
128+
log_package['non_ephemeral'] = non_ephemeral
129+
116130
if "Set up job" in zipinfo.filename:
117131
with runres.open(zipinfo) as run_setup:
118132
content = run_setup.read().decode()
133+
if "Image Release: https://github.com/actions/runner-images" in content:
134+
# Larger runners will appear to be self-hosted, but
135+
# they will have the image name. Skip if we see this.
136+
continue
137+
119138
if "Runner name" in content or \
120139
"Machine name" in content:
121140

@@ -132,9 +151,10 @@ def __process_run_log(self, log_content: bytes, run_info: dict):
132151
"runner_name": runner_name,
133152
"machine_name": hostname,
134153
"run_id": run_info["id"],
135-
"run_attempt": run_info["run_attempt"]
154+
"run_attempt": run_info["run_attempt"],
155+
"non_ephemeral": non_ephemeral
136156
}
137-
return log_package
157+
return log_package
138158

139159
def __get_full_runlog(self, log_content: bytes, run_name: str):
140160
"""Gets the full text of the runlog from the zip file by matching the
@@ -601,30 +621,59 @@ def retrieve_run_logs(self, repo_name: str, short_circuit: str = True):
601621
Returns:
602622
list: List of run logs for runs that ran on self-hosted runners.
603623
"""
604-
runs = self.call_get(f'/repos/{repo_name}/actions/runs')
624+
start_date = datetime.now() - timedelta(days = 60)
625+
runs = self.call_get(
626+
f'/repos/{repo_name}/actions/runs', params={
627+
"per_page": "30",
628+
"status":"completed",
629+
"exclude_pull_requests": "true",
630+
"created":f">{start_date.isoformat()}"
631+
}
632+
)
605633

606-
run_logs = []
634+
# This is a dictionary so we can de-duplicate runner IDs based on
635+
# the machine_name:runner_name.
636+
run_logs = {}
637+
names = set()
607638

608639
if runs.status_code == 200:
609640
logger.debug(f'Enumerating runs within {repo_name}')
610641
for run in runs.json()['workflow_runs']:
642+
643+
# We are only interested in runs that actually executed.
644+
if run['conclusion'] != 'success' and \
645+
run['conclusion'] != 'failure':
646+
continue
647+
648+
if short_circuit:
649+
# If we are only looking for the presence of SH runners and
650+
# not trying to determine ephmeral vs not from repeats, then
651+
# we just need to look at each branch + wf combination once.
652+
workflow_key = f"{run['head_branch']}:{run['path']}"
653+
if workflow_key in names:
654+
continue
655+
names.add(workflow_key)
656+
611657
run_log = self.call_get(
612658
f'/repos/{repo_name}/actions/runs/{run["id"]}/'
613659
f'attempts/{run["run_attempt"]}/logs')
614-
615660
if run_log.status_code == 200:
616661
run_log = self.__process_run_log(run_log.content, run)
617662
if run_log:
618-
run_logs.append(run_log)
663+
key = f"{run_log['machine_name']}:{run_log['runner_name']}"
664+
run_logs[key] = run_log
665+
619666
if short_circuit:
620-
return run_logs
667+
return run_logs.values()
668+
elif run_log.status_code == 410:
669+
break
621670
else:
622671
logger.debug(
623672
f"Call to retrieve run logs from {repo_name} run "
624673
f"{run['id']} attempt {run['run_attempt']} returned "
625674
f"{run_log.status_code}!")
626675

627-
return run_logs
676+
return run_logs.values()
628677

629678
def parse_workflow_runs(self, repo_name: str):
630679
"""Returns the number of workflow runs associated with the repository.

gato/github/gql_queries.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
from gato.models import Repository
2+
3+
class GqlQueries():
    """Constructs GraphQL queries for use with the GitHub GraphQL API.
    """

    # Query that fetches the .github/workflows/ tree (file names + blob text)
    # for up to 100 repositories at once, addressed by their GraphQL node IDs.
    GET_YMLS = """
    query RepoFiles($node_ids: [ID!]!) {
        nodes(ids: $node_ids) {
            ... on Repository {
                nameWithOwner
                object(expression: "HEAD:.github/workflows/") {
                    ... on Tree {
                        entries {
                            name
                            type
                            mode
                            object {
                                ... on Blob {
                                    byteSize
                                    text
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    """

    @staticmethod
    def get_workflow_ymls(repos: list):
        """Build GraphQL POST payloads that retrieve the workflow yml files
        for each repository, batching 100 repositories per query.

        Args:
            repos (List[Repository]): List of repository objects; each must
                expose ``repo_data['node_id']``.
        Returns:
            (list): List of JSON post parameters for each GraphQL query.
        """
        queries = []

        # Batch in chunks of 100 node IDs per query (GraphQL `nodes` limit).
        # Stepping by the chunk size avoids emitting an empty trailing query
        # when len(repos) is an exact multiple of 100 (or zero), which the
        # previous `(len // 100) + 1` loop bound produced.
        for start in range(0, len(repos), 100):
            chunk = repos[start:start + 100]
            queries.append({
                "query": GqlQueries.GET_YMLS,
                "variables": {
                    "node_ids": [repo.repo_data['node_id'] for repo in chunk]
                }
            })

        return queries

gato/models/runner.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ def __init__(
1111
machine_name=None,
1212
os=None,
1313
status=None,
14-
labels=[]):
14+
labels=[],
15+
non_ephemeral=False):
1516
"""Constructor for runner wrapper object.
1617
1718
Args:
@@ -27,6 +28,7 @@ def __init__(
2728
self.os = os
2829
self.status = status
2930
self.labels = labels
31+
self.non_ephemeral = non_ephemeral
3032

3133
def toJSON(self):
3234
"""Converts the repository to a Gato JSON representation.
@@ -37,7 +39,8 @@ def toJSON(self):
3739
else "Unknown",
3840
"os": self.os if self.os else "Unknown",
3941
"status": self.status if self.status else "Unknown",
40-
"labels": [label for label in self.labels]
42+
"labels": [label for label in self.labels],
43+
"non_ephemeral": self.non_ephemeral
4144
}
4245

4346
return representation

0 commit comments

Comments
 (0)