Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

--spider-top-n-music-ids to fetch other videos with most common music IDs from just completed crawl. #27

Draft
wants to merge 19 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
499d6fb
func to get top N most used (by video count) music_ids, and a unit test
macpd Jun 23, 2024
178466e
rename top_n_music_id -> most_used_music_ids, docstring, and rename a…
macpd Jun 23, 2024
6a4dc80
handle music Ids for include or exclude in generate_query. unit tests
macpd Jun 23, 2024
4beb870
add search by include/exclude music ids to cli.
macpd Jun 23, 2024
ee2c6dc
first pass implementation of spidering most common music ids from las…
macpd Jun 23, 2024
896a528
Merge branch 'main' into feature/spider_by_music_id
macpd Jun 23, 2024
27ef671
Merge branch 'main' into feature/spider_by_music_id
macpd Jun 23, 2024
d35cdf7
keep music_ids as strings in query and query processing
macpd Jun 24, 2024
8a95bae
WIP for passing flag to spider top music IDs
macpd Jun 24, 2024
a9a248e
formatting
macpd Jun 24, 2024
706b3ef
ruff formatting
macpd Jun 24, 2024
47e0075
more formatting fixes
macpd Jun 24, 2024
3bbf368
add limit, max_requests, to API config, and have TiktokApiClient stop…
macpd Jun 24, 2024
91a0420
add tests for max_requests in api_client logic
macpd Jun 24, 2024
617a200
fix pytest.mark.parametrize arg type
macpd Jun 24, 2024
a69d018
log when max_requests has been reached, and change max requests excee…
macpd Jun 24, 2024
def5b81
Merge branch 'main' into feature/spider_by_music_id
macpd Jun 25, 2024
36374bf
Merge branch 'main' into feature/spider_by_music_id
macpd Jul 1, 2024
90ce142
move top music ID spidering code out of run_long_query and into own f…
macpd Jul 1, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 33 additions & 3 deletions src/tiktok_research_api_helper/api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
INVALID_SEARCH_ID_ERROR_RETRY_WAIT = 5
INVALID_SEARCH_ID_ERROR_MAX_NUM_RETRIES = 5

DAILY_API_REQUEST_QUOTA = 1000


class ApiRateLimitError(Exception):
pass
Expand Down Expand Up @@ -90,6 +92,14 @@ class ApiClientConfig:
default=ApiRateLimitWaitStrategy.WAIT_FOUR_HOURS,
validator=attrs.validators.instance_of(ApiRateLimitWaitStrategy), # type: ignore - Attrs overload
)
# Limit on number of API requests. None is no limit. Otherwise client will stop, regardless of
# whether API indicates has_more, if it has made this many requests.
max_requests: int | None = attrs.field(
default=None,
validator=attrs.validators.optional(
[attrs.validators.instance_of(int), attrs.validators.gt(0)]
),
)


@attrs.define
Expand Down Expand Up @@ -542,7 +552,7 @@ def num_api_requests_sent(self):

@property
def expected_remaining_api_request_quota(self):
return 1000 - self.num_api_requests_sent
return DAILY_API_REQUEST_QUOTA - self.num_api_requests_sent

def api_results_iter(self) -> TikTokApiClientFetchResult:
"""Fetches all results from API (ie requests until API indicates query results have been
Expand All @@ -557,7 +567,7 @@ def api_results_iter(self) -> TikTokApiClientFetchResult:
logging.debug("Crawl: %s", crawl)

logging.info("Beginning API results fetch.")
while crawl.has_more:
while self._should_continue(crawl):
request = TiktokRequest.from_config(
config=self._config,
cursor=crawl.cursor,
Expand Down Expand Up @@ -594,11 +604,31 @@ def api_results_iter(self) -> TikTokApiClientFetchResult:
break

logging.info(
"Crawl completed. Num api requests: %s. Expected remaining API request quota: %s",
"Crawl completed (or reached configured max_requests: %s). Num api requests: %s. Expected "
"remaining API request quota: %s",
self._config.max_requests,
self.num_api_requests_sent,
self.expected_remaining_api_request_quota,
)

def _max_requests_reached(self) -> bool:
    """Returns True if a max_requests limit is configured and the number of API requests sent
    has met or exceeded it.

    Returns False when max_requests is None (ie no limit configured).
    """
    # Explicit None check so this actually returns a bool as annotated; the previous
    # `x and y` form returned None (not False) when max_requests was unset.
    if self._config.max_requests is None:
        return False
    return self.num_api_requests_sent >= self._config.max_requests

def _should_continue(self, crawl: Crawl) -> bool:
    """Decides whether the fetch loop should issue another API request.

    Continues only while the API reports more results AND the configured max_requests
    limit (if any) has not been reached.
    """
    has_more = crawl.has_more
    max_reached = self._max_requests_reached()
    should_continue = has_more and not max_reached
    logging.debug(
        "crawl.has_more: %s, max_requests_reached: %s, should_continue: %s",
        has_more,
        max_reached,
        should_continue,
    )
    # Make it visible in logs when we stop early despite the API having more results.
    if has_more and max_reached:
        logging.info(
            "Max requests reached. Will discontinue this crawl even though API response "
            "indicates more results."
        )
    return should_continue

def fetch_all(
self, *, store_results_after_each_response: bool = False
) -> TikTokApiClientFetchResult:
Expand Down
145 changes: 122 additions & 23 deletions src/tiktok_research_api_helper/cli_data_acquisition.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,15 @@
from pathlib import Path
from typing import Annotated, Any

import attrs
import pause
import pendulum
import typer
from sqlalchemy.orm import Session

from tiktok_research_api_helper import region_codes, utils
from tiktok_research_api_helper.api_client import (
DAILY_API_REQUEST_QUOTA,
ApiClientConfig,
ApiRateLimitWaitStrategy,
TikTokApiClient,
Expand All @@ -28,11 +31,13 @@
ExcludeAllKeywordListType,
ExcludeAnyHashtagListType,
ExcludeAnyKeywordListType,
ExcludeMusicIdListType,
ExcludeUsernamesListType,
IncludeAllHashtagListType,
IncludeAllKeywordListType,
IncludeAnyHashtagListType,
IncludeAnyKeywordListType,
IncludeMusicIdListType,
JsonQueryFileType,
OnlyUsernamesListType,
RawResponsesOutputDir,
Expand All @@ -44,6 +49,7 @@
from tiktok_research_api_helper.models import (
get_engine_and_create_tables,
get_sqlite_engine_and_create_tables,
most_used_music_ids,
)
from tiktok_research_api_helper.query import (
Cond,
Expand All @@ -62,25 +68,77 @@
CrawlDateWindow = namedtuple("CrawlDateWindow", ["start_date", "end_date"])


def run_long_query(config: ApiClientConfig) -> dict[str, int]:
    """Runs a "long" query, defined as one that may need multiple requests to get all the data.

    Unless you have a good reason to believe otherwise, queries should default to be considered
    "long".

    Returns:
        dict with keys:
            "num_api_requests_sent": number of API requests sent (ie likely amount of API
                quota consumed).
            "crawl_id": ID of the Crawl record created for this query's results.
    """
    api_client = TikTokApiClient.from_config(config)
    fetch_results = api_client.fetch_and_store_all()
    # TODO(macpd): fix this return value structure. maybe a namedtuple
    return {
        "num_api_requests_sent": api_client.num_api_requests_sent,
        "crawl_id": fetch_results.crawl.id,
    }


def driver_single_day(config: ApiClientConfig, spider_top_n_music_ids: int | None = None) -> dict[str, int]:
    """Simpler driver for a single day of query.

    Args:
        config: API client config; start_date and end_date must be equal.
        spider_top_n_music_ids: if not None, after the crawl completes, spider videos with the
            N most used music_ids from this crawl (0 means all music_ids).

    Returns:
        dict with keys "num_api_requests_sent" (likely amount of API quota consumed) and
        "crawl_id" (ID of the crawl created).
    """
    assert (
        config.start_date == config.end_date
    ), "Start and final date must be the same for single day driver"

    # run_long_query takes only the config; previously spider_top_n_music_ids was passed
    # through as a second positional arg, which raised TypeError.
    ret = run_long_query(config)

    if spider_top_n_music_ids is not None:
        run_spider_top_n_music_ids_query(
            config=config,
            spider_top_n_music_ids=spider_top_n_music_ids,
            crawl_ids=[ret["crawl_id"]],
            expected_remaining_api_request_quota=(
                DAILY_API_REQUEST_QUOTA - ret["num_api_requests_sent"]
            ),
        )
    return ret


def run_spider_top_n_music_ids_query(
config: ApiClientConfig,
spider_top_n_music_ids: int,
crawl_ids: Sequence[int],
expected_remaining_api_request_quota: int,
) -> int:
if expected_remaining_api_request_quota <= 0:
# TODO(macpd): add some way to bypass this.
logging.info("Refusing to spider top music IDs because no API quota remains")
return

with Session(config.engine) as session:
top_music_ids = most_used_music_ids(
session,
limit=None if spider_top_n_music_ids == 0 else spider_top_n_music_ids,
crawl_ids=crawl_ids,
)
new_query = generate_query(
include_music_ids=",".join([str(x["music_id"]) for x in top_music_ids])
)

new_crawl_tags = None
if config.crawl_tags:
new_crawl_tags = [f"{tag}-music-id-spidering" for tag in config.crawl_tags]

def main_driver(config: ApiClientConfig):
new_config = attrs.evolve(
config,
max_requests=expected_remaining_api_request_quota,
query=new_query,
crawl_tags=new_crawl_tags,
)

api_client = TikTokApiClient.from_config(new_config)
api_client.fetch_and_store_all()
return api_client.num_api_requests_sent


def main_driver(config: ApiClientConfig, spider_top_n_music_ids: int | None = None):
num_api_requests_sent = 0
crawl_ids = []
days_per_iter = utils.int_to_days(_DAYS_PER_ITER)

start_date = copy(config.start_date)
Expand All @@ -90,24 +148,43 @@ def main_driver(config: ApiClientConfig):
local_end_date = start_date + days_per_iter
local_end_date = min(local_end_date, config.end_date)

new_config = ApiClientConfig(
query=config.query,
start_date=start_date,
end_date=local_end_date,
engine=config.engine,
stop_after_one_request=config.stop_after_one_request,
crawl_tags=config.crawl_tags,
raw_responses_output_dir=config.raw_responses_output_dir,
api_credentials_file=config.api_credentials_file,
api_rate_limit_wait_strategy=config.api_rate_limit_wait_strategy,
)
run_long_query(new_config)
new_config = attrs.evolve(config, start_date=start_date, end_date=local_end_date)

# ApiClientConfig(
# query=config.query,
# start_date=start_date,
# end_date=local_end_date,
# engine=config.engine,
# stop_after_one_request=config.stop_after_one_request,
# crawl_tags=config.crawl_tags,
# raw_responses_output_dir=config.raw_responses_output_dir,
# api_credentials_file=config.api_credentials_file,
# api_rate_limit_wait_strategy=config.api_rate_limit_wait_strategy,
# )
ret = run_long_query(new_config)
num_api_requests_sent += ret["num_api_requests_sent"]
crawl_ids.append(ret["crawl_id"])

start_date += days_per_iter

if config.stop_after_one_request:
logging.log(logging.WARN, "Stopping after one request")
break
return

expected_remaining_api_request_quota = 0
if spider_top_n_music_ids:
if num_api_requests_sent == DAILY_API_REQUEST_QUOTA:
# TODO(macpd): handle no remaing quota, perhaps flag to do anyway?
logging.warning("Refusing to spider top music IDs because no API quota remains")
return

expected_remaining_api_request_quota = DAILY_API_REQUEST_QUOTA - (num_api_requests_sent % DAILY_API_REQUEST_QUOTA)
run_spider_top_n_music_ids_query(
config=config,
crawl_ids=crawl_ids,
spider_top_n_music_ids=spider_top_n_music_ids,
expected_remaining_api_request_quota=expected_remaining_api_request_quota,
)


@APP.command()
Expand Down Expand Up @@ -200,6 +277,8 @@ def print_query(
exclude_all_keywords: ExcludeAllKeywordListType | None = None,
only_from_usernames: OnlyUsernamesListType | None = None,
exclude_from_usernames: ExcludeUsernamesListType | None = None,
include_music_ids: IncludeMusicIdListType | None = None,
exclude_music_ids: ExcludeMusicIdListType | None = None,
) -> None:
"""Prints to stdout the query generated from flags. Useful for creating a base from which to
build more complex custom JSON queries."""
Expand All @@ -215,14 +294,16 @@ def print_query(
exclude_all_keywords,
only_from_usernames,
exclude_from_usernames,
include_music_ids,
exclude_music_ids,
]
):
raise typer.BadParameter(
"must specify at least one of [--include-any-hashtags, --exclude-any-hashtags, "
"--include-all-hashtags, --exclude-all-hashtags, --include-any-keywords, "
"--include-all-keywords, --exclude-any-keywords, --exclude-all-keywords, "
"--include-any-usernames, --include-all-usernames, --exclude-any-usernames, "
"--exclude-all-usernames]"
"--exclude-all-usernames, --include-music-ids, --exclude-musid-ids]"
)
validate_mutually_exclusive_flags(
{
Expand Down Expand Up @@ -407,6 +488,20 @@ def run(
exclude_all_keywords: ExcludeAllKeywordListType | None = None,
only_from_usernames: OnlyUsernamesListType | None = None,
exclude_from_usernames: ExcludeUsernamesListType | None = None,
include_music_ids: IncludeMusicIdListType | None = None,
exclude_music_ids: ExcludeMusicIdListType | None = None,
# TODO(macpd): flag to spider music id, with 0 being all, or postive N being the limit. maybe
# only use remaining API quota.
spider_top_n_music_ids: Annotated[
int,
typer.Option(
help="After fetching all query results from API, compute most common music_id from "
"results and search for videos with the same music_id. Arg should be a positive "
"integer which is the max number of most common music_ids to search, while 0 will "
"search for all music IDs from the latest crawl."
),
]
| None = None,
debug: EnableDebugLoggingFlag = False,
# Skips logging init/setup. Hidden because this is intended for other commands that setup
# logging and then call this as a function.
Expand Down Expand Up @@ -445,6 +540,8 @@ def run(
exclude_all_keywords,
only_from_usernames,
exclude_from_usernames,
include_music_ids,
exclude_music_ids,
region,
]
):
Expand Down Expand Up @@ -501,6 +598,8 @@ def run(
exclude_all_keywords=exclude_all_keywords,
only_from_usernames=only_from_usernames,
exclude_from_usernames=exclude_from_usernames,
include_music_ids=include_music_ids,
exclude_music_ids=exclude_music_ids,
)

logging.log(logging.INFO, f"Query: {query}")
Expand Down Expand Up @@ -528,7 +627,7 @@ def run(
logging.INFO,
"Start and final date are the same - running single day driver",
)
driver_single_day(config)
driver_single_day(config, spider_top_n_music_ids)
else:
logging.log(logging.INFO, "Running main driver")
main_driver(config)
main_driver(config, spider_top_n_music_ids)
20 changes: 20 additions & 0 deletions src/tiktok_research_api_helper/custom_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,26 @@
),
]

# Typer CLI option type: comma separated music_ids videos must have to be included in results.
IncludeMusicIdListType = Annotated[
    str,
    typer.Option(
        help=(
            "A comma separated list of music_ids. Will query API for videos that have these "
            "music_ids."
        )
    ),
]

# Typer CLI option type: comma separated music_ids that exclude a video from results.
ExcludeMusicIdListType = Annotated[
    str,
    typer.Option(
        help=(
            "A comma separated list of music_ids. Will query API for videos that DO NOT "
            "have these music_ids."
        )
    ),
]

CrawlTagType = Annotated[
str,
typer.Option(
Expand Down
28 changes: 28 additions & 0 deletions src/tiktok_research_api_helper/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
Table,
UniqueConstraint,
create_engine,
desc,
func,
select,
)
Expand Down Expand Up @@ -202,6 +203,33 @@ def effect_ids(self):
return {effect.effect_id for effect in self.effects}


def most_used_music_ids(
    session: Session, limit: int | None = None, crawl_ids: Sequence[int] | None = None
):
    """Returns most used music_ids with the count of videos that have each music_id.

    Args:
        session: database session to execute the query with.
        limit: if not None, maximum number of rows returned.
        crawl_ids: if provided, only operates on videos associated to those crawl IDs.

    Returns:
        Sequence of row mappings with keys "music_id" and "num_videos", ordered by
        num_videos descending (ties broken by music_id ascending).
    """
    select_stmt = select(Video.music_id, func.count(Video.id).label("num_videos"))
    if crawl_ids:
        select_stmt = select_stmt.join(Video.crawls).where(Crawl.id.in_(crawl_ids))
    return (
        session.execute(
            # Must use the SQL-level NULL check: a python `Video.music_id is not None`
            # identity test on the column attribute is always True at query-build time and
            # would silently fail to filter out videos with NULL music_id.
            select_stmt.where(Video.music_id.is_not(None))
            .group_by(Video.music_id)
            .order_by(desc("num_videos"), Video.music_id)
            .limit(limit)
        )
        .mappings()
        .all()
    )


# TODO(macpd): make generic method for this and use for all many-to-many objects inserted with video
def _get_hashtag_name_to_hashtag_object_map(
session: Session, video_data: Sequence[Mapping[str, Any]]
Expand Down
Loading
Loading