Skip to content

Commit 3ca6e79

Browse files
authored
Improvement: Speed up public organization enumeration (#49)
1 parent a53e61a commit 3ca6e79

File tree

12 files changed

+280
-40
lines changed

12 files changed

+280
-40
lines changed

gato/enumerate/enumerate.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import logging
22

33
from gato.github import Api
4+
from gato.github import GqlQueries
45
from gato.models import Repository, Organization
56
from gato.cli import Output
67
from gato.enumerate.repository import RepositoryEnum
@@ -173,12 +174,22 @@ def enumerate_organization(self, org: str):
173174
f"the {organization.name} organization!"
174175
)
175176

177+
Output.info(f"Querying and caching workflow YAML files!")
178+
wf_queries = GqlQueries.get_workflow_ymls(enum_list)
179+
180+
for wf_query in wf_queries:
181+
result = self.org_e.api.call_post('/graphql', wf_query)
182+
# Sometimes we don't get a 200, fall back in this case.
183+
if result.status_code == 200:
184+
self.repo_e.construct_workflow_cache(result.json()['data']['nodes'])
185+
else:
186+
Output.warn("GraphQL query failed, will revert to REST workflow query for impacted repositories!")
176187
for repo in enum_list:
177-
178188
Output.tabbed(
179189
f"Enumerating: {Output.bright(repo.name)}!"
180190
)
181-
self.repo_e.enumerate_repository(repo)
191+
192+
self.repo_e.enumerate_repository(repo, large_org_enum=len(enum_list) > 100)
182193
self.repo_e.enumerate_repository_secrets(repo)
183194

184195
Recommender.print_repo_secrets(

gato/enumerate/recommender.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def print_repo_runner_info(repository: Repository):
140140
Output.result(
141141
f"The repository contains a workflow: "
142142
f"{Output.bright(repository.sh_workflow_names[0])} that "
143-
"executes on self-hosted runners!"
143+
"might execute on self-hosted runners!"
144144
)
145145

146146
if repository.accessible_runners:
@@ -157,6 +157,11 @@ def print_repo_runner_info(repository: Repository):
157157
f"{Output.bright(repository.accessible_runners[0].machine_name)}"
158158
)
159159

160+
for runner in repository.accessible_runners:
161+
if runner.non_ephemeral:
162+
Output.owned("The repository contains a non-ephemeral self-hosted runner!")
163+
break
164+
160165
if repository.runners:
161166
Output.result(
162167
f"The repository has {len(repository.runners)} repo-level"

gato/enumerate/repository.py

Lines changed: 52 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def __init__(self, api: Api, skip_log: bool, output_yaml):
2121
api (Api): GitHub API wraper object.
2222
"""
2323
self.api = api
24+
self.workflow_cache = {}
2425
self.skip_log = skip_log
2526
self.output_yaml = output_yaml
2627

@@ -40,11 +41,12 @@ def __perform_runlog_enumeration(self, repository: Repository):
4041
)
4142

4243
if wf_runs:
43-
runner = Runner(
44-
wf_runs[0]['runner_name'], wf_runs[0]['machine_name']
45-
)
44+
for wf_run in wf_runs:
45+
runner = Runner(
46+
wf_run['runner_name'], wf_run['machine_name'], non_ephemeral=wf_run['non_ephemeral']
47+
)
4648

47-
repository.add_accessible_runner(runner)
49+
repository.add_accessible_runner(runner)
4850
runner_detected = True
4951

5052
return runner_detected
@@ -60,12 +62,15 @@ def __perform_yml_enumeration(self, repository: Repository):
6062
list: List of workflows that execute on sh runner, empty otherwise.
6163
"""
6264
runner_wfs = []
63-
ymls = self.api.retrieve_workflow_ymls(repository.name)
65+
66+
if repository.name in self.workflow_cache:
67+
ymls = self.workflow_cache[repository.name]
68+
else:
69+
ymls = self.api.retrieve_workflow_ymls(repository.name)
6470

6571
for (wf, yml) in ymls:
6672
try:
6773
parsed_yml = WorkflowParser(yml, repository.name, wf)
68-
6974
self_hosted_jobs = parsed_yml.self_hosted()
7075

7176
if self_hosted_jobs:
@@ -79,12 +84,13 @@ def __perform_yml_enumeration(self, repository: Repository):
7984
# At this point we only know the extension, so handle and
8085
# ignore malformed yml files.
8186
except Exception as parse_error:
82-
print(parse_error)
87+
88+
print(f"{wf}: {str(parse_error)}")
8389
logger.warning("Attmpted to parse invalid yaml!")
8490

8591
return runner_wfs
8692

87-
def enumerate_repository(self, repository: Repository):
93+
def enumerate_repository(self, repository: Repository, large_org_enum=False):
8894
"""Enumerate a repository, and check everything relevant to
8995
self-hosted runner abuse that that the user has permissions to check.
9096
@@ -119,15 +125,25 @@ def enumerate_repository(self, repository: Repository):
119125

120126
repository.set_runners(repo_runners)
121127

122-
if not self.skip_log and self.__perform_runlog_enumeration(repository):
123-
runner_detected = True
124-
125128
workflows = self.__perform_yml_enumeration(repository)
126129

127130
if len(workflows) > 0:
128131
repository.add_self_hosted_workflows(workflows)
129132
runner_detected = True
130133

134+
if not self.skip_log:
135+
# If we are enumerating an organization, only enumerate runlogs if
136+
# the workflow suggests a sh_runner.
137+
if large_org_enum and runner_detected:
138+
self.__perform_runlog_enumeration(repository)
139+
140+
# If we are doing internal enum, get the logs, because coverage is
141+
# more important here and it's ok if it takes time.
142+
elif not repository.is_public() and self.__perform_runlog_enumeration(repository):
143+
runner_detected = True
144+
else:
145+
runner_detected = self.__perform_runlog_enumeration(repository)
146+
131147
if runner_detected:
132148
# Only display permissions (beyond having none) if runner is
133149
# detected.
@@ -158,3 +174,28 @@ def enumerate_repository_secrets(
158174

159175
if org_secrets:
160176
repository.set_accessible_org_secrets(org_secrets)
177+
178+
def construct_workflow_cache(self, yml_results):
179+
"""Creates a cache of workflow yml files retrieved from graphQL. Since
180+
graphql and REST do not have parity, we still need to use rest for most
181+
enumeration calls. This method saves off all yml files, so during org
182+
level enumeration if we perform yml enumeration the cached file is used
183+
instead of making github REST requests.
184+
185+
Args:
186+
yml_results (list): List of results from individual GraphQL queries
187+
(100 nodes at atime).)
188+
"""
189+
for result in yml_results:
190+
owner = result['nameWithOwner']
191+
192+
self.workflow_cache[owner] = list()
193+
194+
if not result['object']:
195+
continue
196+
197+
for yml_node in result['object']['entries']:
198+
yml_name = yml_node['name']
199+
if yml_name.lower().endswith('yml') or yml_name.lower().endswith('yaml'):
200+
contents = yml_node['object']['text']
201+
self.workflow_cache[owner].append((yml_name, contents))

gato/github/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
from .api import Api
2+
from .gql_queries import GqlQueries
23
from .search import Search

gato/github/api.py

Lines changed: 61 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66
import zipfile
77
import re
88
import io
9+
import json
910

1011
from gato.cli import Output
11-
from datetime import datetime, timezone
12+
from datetime import datetime, timezone, timedelta
1213

1314
logger = logging.getLogger(__name__)
1415

@@ -19,8 +20,9 @@ class Api():
1920
rate limiting or network issues.
2021
"""
2122

22-
RUNNER_RE = re.compile(r'Runner name: \'([\w+-]+)\'')
23-
MACHINE_RE = re.compile(r'Machine name: \'([\w+-]+)\'')
23+
RUNNER_RE = re.compile(r'Runner name: \'([\w+-.]+)\'')
24+
MACHINE_RE = re.compile(r'Machine name: \'([\w+-.]+)\'')
25+
RUN_THRESHOLD = 90
2426

2527
def __init__(self, pat: str, version: str = "2022-11-28",
2628
http_proxy: str = None, socks_proxy: str = None,
@@ -110,12 +112,29 @@ def __process_run_log(self, log_content: bytes, run_info: dict):
110112
Returns:
111113
dict: metadata about the run execution.
112114
"""
113-
with zipfile.ZipFile(io.BytesIO(log_content)) as runres:
115+
log_package = None
116+
non_ephemeral = False
114117

118+
with zipfile.ZipFile(io.BytesIO(log_content)) as runres:
115119
for zipinfo in runres.infolist():
120+
# TODO use a lambda for this messy logic
121+
if "Run actionscheckout" in zipinfo.filename:
122+
with runres.open(zipinfo) as run_setup:
123+
content = run_setup.read().decode()
124+
if "Cleaning the repository" in content:
125+
non_ephemeral = True
126+
127+
if log_package:
128+
log_package['non_ephemeral'] = non_ephemeral
129+
116130
if "Set up job" in zipinfo.filename:
117131
with runres.open(zipinfo) as run_setup:
118132
content = run_setup.read().decode()
133+
if "Image Release: https://github.com/actions/runner-images" in content:
134+
# Larger runners will appear to be self-hosted, but
135+
# they will have the image name. Skip if we see this.
136+
continue
137+
119138
if "Runner name" in content or \
120139
"Machine name" in content:
121140

@@ -132,9 +151,10 @@ def __process_run_log(self, log_content: bytes, run_info: dict):
132151
"runner_name": runner_name,
133152
"machine_name": hostname,
134153
"run_id": run_info["id"],
135-
"run_attempt": run_info["run_attempt"]
154+
"run_attempt": run_info["run_attempt"],
155+
"non_ephemeral": non_ephemeral
136156
}
137-
return log_package
157+
return log_package
138158

139159
def __get_full_runlog(self, log_content: bytes, run_name: str):
140160
"""Gets the full text of the runlog from the zip file by matching the
@@ -601,30 +621,59 @@ def retrieve_run_logs(self, repo_name: str, short_circuit: str = True):
601621
Returns:
602622
list: List of run logs for runs that ran on self-hosted runners.
603623
"""
604-
runs = self.call_get(f'/repos/{repo_name}/actions/runs')
624+
start_date = datetime.now() - timedelta(days = 60)
625+
runs = self.call_get(
626+
f'/repos/{repo_name}/actions/runs', params={
627+
"per_page": "30",
628+
"status":"completed",
629+
"exclude_pull_requests": "true",
630+
"created":f">{start_date.isoformat()}"
631+
}
632+
)
605633

606-
run_logs = []
634+
# This is a dictionary so we can de-duplicate runner IDs based on
635+
# the machine_name:runner_name.
636+
run_logs = {}
637+
names = set()
607638

608639
if runs.status_code == 200:
609640
logger.debug(f'Enumerating runs within {repo_name}')
610641
for run in runs.json()['workflow_runs']:
642+
643+
# We are only interested in runs that actually executed.
644+
if run['conclusion'] != 'success' and \
645+
run['conclusion'] != 'failure':
646+
continue
647+
648+
if short_circuit:
649+
# If we are only looking for the presence of SH runners and
650+
# not trying to determine ephmeral vs not from repeats, then
651+
# we just need to look at each branch + wf combination once.
652+
workflow_key = f"{run['head_branch']}:{run['path']}"
653+
if workflow_key in names:
654+
continue
655+
names.add(workflow_key)
656+
611657
run_log = self.call_get(
612658
f'/repos/{repo_name}/actions/runs/{run["id"]}/'
613659
f'attempts/{run["run_attempt"]}/logs')
614-
615660
if run_log.status_code == 200:
616661
run_log = self.__process_run_log(run_log.content, run)
617662
if run_log:
618-
run_logs.append(run_log)
663+
key = f"{run_log['machine_name']}:{run_log['runner_name']}"
664+
run_logs[key] = run_log
665+
619666
if short_circuit:
620-
return run_logs
667+
return run_logs.values()
668+
elif run_log.status_code == 410:
669+
break
621670
else:
622671
logger.debug(
623672
f"Call to retrieve run logs from {repo_name} run "
624673
f"{run['id']} attempt {run['run_attempt']} returned "
625674
f"{run_log.status_code}!")
626675

627-
return run_logs
676+
return run_logs.values()
628677

629678
def parse_workflow_runs(self, repo_name: str):
630679
"""Returns the number of workflow runs associated with the repository.

gato/github/gql_queries.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
from gato.models import Repository
2+
3+
class GqlQueries():
    """Constructs GraphQL queries for use with the GitHub GraphQL API.
    """

    # Query that fetches the .github/workflows/ tree (file names + blob text)
    # for up to 100 repositories at once, addressed by their GraphQL node IDs.
    GET_YMLS = """
    query RepoFiles($node_ids: [ID!]!) {
        nodes(ids: $node_ids) {
            ... on Repository {
                nameWithOwner
                object(expression: "HEAD:.github/workflows/") {
                    ... on Tree {
                        entries {
                            name
                            type
                            mode
                            object {
                                ... on Blob {
                                    byteSize
                                    text
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    """

    @staticmethod
    def get_workflow_ymls(repos: list):
        """Build GraphQL POST payloads that retrieve the workflow yml files
        for each repository, batching 100 repositories per query.

        Args:
            repos (List[Repository]): List of repository objects; each must
                expose ``repo_data['node_id']``.
        Returns:
            (list): List of JSON post parameters for each GraphQL query.
        """
        queries = []

        # Batch in chunks of 100 node IDs per query (GraphQL `nodes` limit).
        # Stepping by the chunk size avoids emitting an empty trailing query
        # when len(repos) is an exact multiple of 100 (or zero), which the
        # previous `(len // 100) + 1` loop bound produced.
        for start in range(0, len(repos), 100):
            chunk = repos[start:start + 100]
            queries.append({
                "query": GqlQueries.GET_YMLS,
                "variables": {
                    "node_ids": [repo.repo_data['node_id'] for repo in chunk]
                }
            })

        return queries

gato/models/runner.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ def __init__(
1111
machine_name=None,
1212
os=None,
1313
status=None,
14-
labels=[]):
14+
labels=[],
15+
non_ephemeral=False):
1516
"""Constructor for runner wrapper object.
1617
1718
Args:
@@ -27,6 +28,7 @@ def __init__(
2728
self.os = os
2829
self.status = status
2930
self.labels = labels
31+
self.non_ephemeral = non_ephemeral
3032

3133
def toJSON(self):
3234
"""Converts the repository to a Gato JSON representation.
@@ -37,7 +39,8 @@ def toJSON(self):
3739
else "Unknown",
3840
"os": self.os if self.os else "Unknown",
3941
"status": self.status if self.status else "Unknown",
40-
"labels": [label for label in self.labels]
42+
"labels": [label for label in self.labels],
43+
"non_ephemeral": self.non_ephemeral
4144
}
4245

4346
return representation

0 commit comments

Comments
 (0)