Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

--spider-top-n-music-ids to fetch other videos with most common music IDs from just completed crawl. #27

Draft
wants to merge 19 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
499d6fb
func to get top N most used (by video count) music_ids, and a unit test
macpd Jun 23, 2024
178466e
rename top_n_music_id -> most_used_music_ids, docstring, and rename a…
macpd Jun 23, 2024
6a4dc80
handle music Ids for include or exclude in generate_query. unit tests
macpd Jun 23, 2024
4beb870
add search by include/exclude music ids to cli.
macpd Jun 23, 2024
ee2c6dc
first pass implementation of spidering most common music ids from las…
macpd Jun 23, 2024
896a528
Merge branch 'main' into feature/spider_by_music_id
macpd Jun 23, 2024
27ef671
Merge branch 'main' into feature/spider_by_music_id
macpd Jun 23, 2024
d35cdf7
keep music_ids as strings in query and query processing
macpd Jun 24, 2024
8a95bae
WIP for passing flag to spider top music IDs
macpd Jun 24, 2024
a9a248e
formatting
macpd Jun 24, 2024
706b3ef
ruff formatting
macpd Jun 24, 2024
47e0075
more formatting fixes
macpd Jun 24, 2024
3bbf368
add limit, max_requests, to API config, and have TiktokApiClient stop…
macpd Jun 24, 2024
91a0420
add tests for max_requests in api_client logic
macpd Jun 24, 2024
617a200
fix pytest.mark.parametrize arg type
macpd Jun 24, 2024
a69d018
log when max_requests has been reached, and change max requests excee…
macpd Jun 24, 2024
def5b81
Merge branch 'main' into feature/spider_by_music_id
macpd Jun 25, 2024
36374bf
Merge branch 'main' into feature/spider_by_music_id
macpd Jul 1, 2024
90ce142
move top music ID spidering code out of run_long_query and into own f…
macpd Jul 1, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 33 additions & 3 deletions src/tiktok_research_api_helper/api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
INVALID_SEARCH_ID_ERROR_RETRY_WAIT = 5
INVALID_SEARCH_ID_ERROR_MAX_NUM_RETRIES = 5

DAILY_API_REQUEST_QUOTA = 1000


class ApiRateLimitError(Exception):
pass
Expand Down Expand Up @@ -90,6 +92,14 @@ class ApiClientConfig:
default=ApiRateLimitWaitStrategy.WAIT_FOUR_HOURS,
validator=attrs.validators.instance_of(ApiRateLimitWaitStrategy), # type: ignore - Attrs overload
)
# Limit on number of API requests. None is no limit. Otherwise client will stop, regardless of
# whether API indicates has_more, if it has made this many requests.
max_requests: int | None = attrs.field(
default=None,
validator=attrs.validators.optional(
[attrs.validators.instance_of(int), attrs.validators.gt(0)]
),
)


@attrs.define
Expand Down Expand Up @@ -542,7 +552,7 @@ def num_api_requests_sent(self):

@property
def expected_remaining_api_request_quota(self):
return 1000 - self.num_api_requests_sent
return DAILY_API_REQUEST_QUOTA - self.num_api_requests_sent

def api_results_iter(self) -> TikTokApiClientFetchResult:
"""Fetches all results from API (ie requests until API indicates query results have been
Expand All @@ -557,7 +567,7 @@ def api_results_iter(self) -> TikTokApiClientFetchResult:
logging.debug("Crawl: %s", crawl)

logging.info("Beginning API results fetch.")
while crawl.has_more:
while self._should_continue(crawl):
request = TiktokRequest.from_config(
config=self._config,
cursor=crawl.cursor,
Expand Down Expand Up @@ -594,11 +604,31 @@ def api_results_iter(self) -> TikTokApiClientFetchResult:
break

logging.info(
"Crawl completed. Num api requests: %s. Expected remaining API request quota: %s",
"Crawl completed (or reached configured max_requests: %s). Num api requests: %s. Expected "
"remaining API request quota: %s",
self._config.max_requests,
self.num_api_requests_sent,
self.expected_remaining_api_request_quota,
)

def _max_requests_reached(self) -> bool:
    """Returns True if a max_requests limit is configured and the number of API requests sent
    has met or exceeded it.

    Returns False when max_requests is None (ie no limit configured).
    """
    # Explicit None check so this actually returns a bool as annotated; the previous
    # `x and y` form returned None (not False) when max_requests was unset.
    if self._config.max_requests is None:
        return False
    return self.num_api_requests_sent >= self._config.max_requests

def _should_continue(self, crawl: Crawl) -> bool:
    """Decides whether the fetch loop should issue another API request.

    Continues only while the API reports more results AND the configured max_requests
    limit (if any) has not been reached.
    """
    has_more = crawl.has_more
    max_reached = self._max_requests_reached()
    should_continue = has_more and not max_reached
    logging.debug(
        "crawl.has_more: %s, max_requests_reached: %s, should_continue: %s",
        has_more,
        max_reached,
        should_continue,
    )
    # Make it visible in logs when we stop early despite the API having more results.
    if has_more and max_reached:
        logging.info(
            "Max requests reached. Will discontinue this crawl even though API response "
            "indicates more results."
        )
    return should_continue

def fetch_all(
self, *, store_results_after_each_response: bool = False
) -> TikTokApiClientFetchResult:
Expand Down
145 changes: 122 additions & 23 deletions src/tiktok_research_api_helper/cli_data_acquisition.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,15 @@
from pathlib import Path
from typing import Annotated, Any

import attrs
import pause
import pendulum
import typer
from sqlalchemy.orm import Session

from tiktok_research_api_helper import region_codes, utils
from tiktok_research_api_helper.api_client import (
DAILY_API_REQUEST_QUOTA,
ApiClientConfig,
ApiRateLimitWaitStrategy,
TikTokApiClient,
Expand All @@ -28,11 +31,13 @@
ExcludeAllKeywordListType,
ExcludeAnyHashtagListType,
ExcludeAnyKeywordListType,
ExcludeMusicIdListType,
ExcludeUsernamesListType,
IncludeAllHashtagListType,
IncludeAllKeywordListType,
IncludeAnyHashtagListType,
IncludeAnyKeywordListType,
IncludeMusicIdListType,
JsonQueryFileType,
OnlyUsernamesListType,
RawResponsesOutputDir,
Expand All @@ -44,6 +49,7 @@
from tiktok_research_api_helper.models import (
get_engine_and_create_tables,
get_sqlite_engine_and_create_tables,
most_used_music_ids,
)
from tiktok_research_api_helper.query import (
Cond,
Expand All @@ -62,25 +68,77 @@
CrawlDateWindow = namedtuple("CrawlDateWindow", ["start_date", "end_date"])


def run_long_query(config: ApiClientConfig) -> dict[str, int]:
    """Runs a "long" query, defined as one that may need multiple requests to get all the data.

    Unless you have a good reason to believe otherwise, queries should default to be considered
    "long".

    Returns:
        dict with keys:
            "num_api_requests_sent": number of API requests sent (ie likely amount of API
                quota consumed).
            "crawl_id": ID of the Crawl record created for this query's results.
    """
    api_client = TikTokApiClient.from_config(config)
    fetch_results = api_client.fetch_and_store_all()
    # TODO(macpd): fix this return value structure. maybe a namedtuple
    return {
        "num_api_requests_sent": api_client.num_api_requests_sent,
        "crawl_id": fetch_results.crawl.id,
    }


def driver_single_day(config: ApiClientConfig, spider_top_n_music_ids: int | None = None) -> dict[str, int]:
    """Simpler driver for a single day of query.

    Args:
        config: API client config; start_date and end_date must be equal.
        spider_top_n_music_ids: if not None, after the crawl completes, spider videos with the
            N most used music_ids from this crawl (0 means all music_ids).

    Returns:
        dict with keys "num_api_requests_sent" (likely amount of API quota consumed) and
        "crawl_id" (ID of the crawl created).
    """
    assert (
        config.start_date == config.end_date
    ), "Start and final date must be the same for single day driver"

    # run_long_query takes only the config; previously spider_top_n_music_ids was passed
    # through as a second positional arg, which raised TypeError.
    ret = run_long_query(config)

    if spider_top_n_music_ids is not None:
        run_spider_top_n_music_ids_query(
            config=config,
            spider_top_n_music_ids=spider_top_n_music_ids,
            crawl_ids=[ret["crawl_id"]],
            expected_remaining_api_request_quota=(
                DAILY_API_REQUEST_QUOTA - ret["num_api_requests_sent"]
            ),
        )
    return ret


def run_spider_top_n_music_ids_query(
config: ApiClientConfig,
spider_top_n_music_ids: int,
crawl_ids: Sequence[int],
expected_remaining_api_request_quota: int,
) -> int:
if expected_remaining_api_request_quota <= 0:
# TODO(macpd): add some way to bypass this.
logging.info("Refusing to spider top music IDs because no API quota remains")
return

with Session(config.engine) as session:
top_music_ids = most_used_music_ids(
session,
limit=None if spider_top_n_music_ids == 0 else spider_top_n_music_ids,
crawl_ids=crawl_ids,
)
new_query = generate_query(
include_music_ids=",".join([str(x["music_id"]) for x in top_music_ids])
)

new_crawl_tags = None
if config.crawl_tags:
new_crawl_tags = [f"{tag}-music-id-spidering" for tag in config.crawl_tags]

def main_driver(config: ApiClientConfig):
new_config = attrs.evolve(
config,
max_requests=expected_remaining_api_request_quota,
query=new_query,
crawl_tags=new_crawl_tags,
)

api_client = TikTokApiClient.from_config(new_config)
api_client.fetch_and_store_all()
return api_client.num_api_requests_sent


def main_driver(config: ApiClientConfig, spider_top_n_music_ids: int | None = None):
num_api_requests_sent = 0
crawl_ids = []
days_per_iter = utils.int_to_days(_DAYS_PER_ITER)

start_date = copy(config.start_date)
Expand All @@ -90,24 +148,43 @@ def main_driver(config: ApiClientConfig):
local_end_date = start_date + days_per_iter
local_end_date = min(local_end_date, config.end_date)

new_config = ApiClientConfig(
query=config.query,
start_date=start_date,
end_date=local_end_date,
engine=config.engine,
stop_after_one_request=config.stop_after_one_request,
crawl_tags=config.crawl_tags,
raw_responses_output_dir=config.raw_responses_output_dir,
api_credentials_file=config.api_credentials_file,
api_rate_limit_wait_strategy=config.api_rate_limit_wait_strategy,
)
run_long_query(new_config)
new_config = attrs.evolve(config, start_date=start_date, end_date=local_end_date)

# ApiClientConfig(
# query=config.query,
# start_date=start_date,
# end_date=local_end_date,
# engine=config.engine,
# stop_after_one_request=config.stop_after_one_request,
# crawl_tags=config.crawl_tags,
# raw_responses_output_dir=config.raw_responses_output_dir,
# api_credentials_file=config.api_credentials_file,
# api_rate_limit_wait_strategy=config.api_rate_limit_wait_strategy,
# )
ret = run_long_query(new_config)
num_api_requests_sent += ret["num_api_requests_sent"]
crawl_ids.append(ret["crawl_id"])

start_date += days_per_iter

if config.stop_after_one_request:
logging.log(logging.WARN, "Stopping after one request")
break
return

expected_remaining_api_request_quota = 0
if spider_top_n_music_ids:
if num_api_requests_sent == DAILY_API_REQUEST_QUOTA:
# TODO(macpd): handle no remaing quota, perhaps flag to do anyway?
logging.warning("Refusing to spider top music IDs because no API quota remains")
return

expected_remaining_api_request_quota = DAILY_API_REQUEST_QUOTA - (num_api_requests_sent % DAILY_API_REQUEST_QUOTA)
run_spider_top_n_music_ids_query(
config=config,
crawl_ids=crawl_ids,
spider_top_n_music_ids=spider_top_n_music_ids,
expected_remaining_api_request_quota=expected_remaining_api_request_quota,
)


@APP.command()
Expand Down Expand Up @@ -200,6 +277,8 @@ def print_query(
exclude_all_keywords: ExcludeAllKeywordListType | None = None,
only_from_usernames: OnlyUsernamesListType | None = None,
exclude_from_usernames: ExcludeUsernamesListType | None = None,
include_music_ids: IncludeMusicIdListType | None = None,
exclude_music_ids: ExcludeMusicIdListType | None = None,
) -> None:
"""Prints to stdout the query generated from flags. Useful for creating a base from which to
build more complex custom JSON queries."""
Expand All @@ -215,14 +294,16 @@ def print_query(
exclude_all_keywords,
only_from_usernames,
exclude_from_usernames,
include_music_ids,
exclude_music_ids,
]
):
raise typer.BadParameter(
"must specify at least one of [--include-any-hashtags, --exclude-any-hashtags, "
"--include-all-hashtags, --exclude-all-hashtags, --include-any-keywords, "
"--include-all-keywords, --exclude-any-keywords, --exclude-all-keywords, "
"--include-any-usernames, --include-all-usernames, --exclude-any-usernames, "
"--exclude-all-usernames]"
"--exclude-all-usernames, --include-music-ids, --exclude-musid-ids]"
)
validate_mutually_exclusive_flags(
{
Expand Down Expand Up @@ -407,6 +488,20 @@ def run(
exclude_all_keywords: ExcludeAllKeywordListType | None = None,
only_from_usernames: OnlyUsernamesListType | None = None,
exclude_from_usernames: ExcludeUsernamesListType | None = None,
include_music_ids: IncludeMusicIdListType | None = None,
exclude_music_ids: ExcludeMusicIdListType | None = None,
# TODO(macpd): flag to spider music id, with 0 being all, or postive N being the limit. maybe
# only use remaining API quota.
spider_top_n_music_ids: Annotated[
int,
typer.Option(
help="After fetching all query results from API, compute most common music_id from "
"results and search for videos with the same music_id. Arg should be a positive "
"integer which is the max number of most common music_ids to search, while 0 will "
"search for all music IDs from the latest crawl."
),
]
| None = None,
debug: EnableDebugLoggingFlag = False,
# Skips logging init/setup. Hidden because this is intended for other commands that setup
# logging and then call this as a function.
Expand Down Expand Up @@ -445,6 +540,8 @@ def run(
exclude_all_keywords,
only_from_usernames,
exclude_from_usernames,
include_music_ids,
exclude_music_ids,
region,
]
):
Expand Down Expand Up @@ -501,6 +598,8 @@ def run(
exclude_all_keywords=exclude_all_keywords,
only_from_usernames=only_from_usernames,
exclude_from_usernames=exclude_from_usernames,
include_music_ids=include_music_ids,
exclude_music_ids=exclude_music_ids,
)

logging.log(logging.INFO, f"Query: {query}")
Expand Down Expand Up @@ -528,7 +627,7 @@ def run(
logging.INFO,
"Start and final date are the same - running single day driver",
)
driver_single_day(config)
driver_single_day(config, spider_top_n_music_ids)
else:
logging.log(logging.INFO, "Running main driver")
main_driver(config)
main_driver(config, spider_top_n_music_ids)
20 changes: 20 additions & 0 deletions src/tiktok_research_api_helper/custom_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,26 @@
),
]

# Typer CLI option type: comma separated music_ids videos must have to be included in results.
IncludeMusicIdListType = Annotated[
    str,
    typer.Option(
        help=(
            "A comma separated list of music_ids. Will query API for videos that have these "
            "music_ids."
        )
    ),
]

# Typer CLI option type: comma separated music_ids that exclude a video from results.
ExcludeMusicIdListType = Annotated[
    str,
    typer.Option(
        help=(
            "A comma separated list of music_ids. Will query API for videos that DO NOT "
            "have these music_ids."
        )
    ),
]

CrawlTagType = Annotated[
str,
typer.Option(
Expand Down
28 changes: 28 additions & 0 deletions src/tiktok_research_api_helper/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
Table,
UniqueConstraint,
create_engine,
desc,
func,
select,
)
Expand Down Expand Up @@ -202,6 +203,33 @@ def effect_ids(self):
return {effect.effect_id for effect in self.effects}


def most_used_music_ids(
    session: Session, limit: int | None = None, crawl_ids: Sequence[int] | None = None
):
    """Returns most used music_ids with the count of videos that have each music_id.

    Args:
        session: database session to execute the query with.
        limit: if not None, maximum number of rows returned.
        crawl_ids: if provided, only operates on videos associated to those crawl IDs.

    Returns:
        Sequence of row mappings with keys "music_id" and "num_videos", ordered by
        num_videos descending (ties broken by music_id ascending).
    """
    select_stmt = select(Video.music_id, func.count(Video.id).label("num_videos"))
    if crawl_ids:
        select_stmt = select_stmt.join(Video.crawls).where(Crawl.id.in_(crawl_ids))
    return (
        session.execute(
            # Must use the SQL-level NULL check: a python `Video.music_id is not None`
            # identity test on the column attribute is always True at query-build time and
            # would silently fail to filter out videos with NULL music_id.
            select_stmt.where(Video.music_id.is_not(None))
            .group_by(Video.music_id)
            .order_by(desc("num_videos"), Video.music_id)
            .limit(limit)
        )
        .mappings()
        .all()
    )


# TODO(macpd): make generic method for this and use for all many-to-many objects inserted with video
def _get_hashtag_name_to_hashtag_object_map(
session: Session, video_data: Sequence[Mapping[str, Any]]
Expand Down
Loading
Loading