diff --git a/advanced_tutorials/tiktok_recsys/LICENSE b/advanced_tutorials/tiktok_recsys/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/advanced_tutorials/tiktok_recsys/README.md b/advanced_tutorials/tiktok_recsys/README.md
new file mode 100644
index 00000000..7d74477e
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/README.md
@@ -0,0 +1,50 @@
+# Real-time feature computation using Apache Flink
+
+## Introduction
+In this guide you will learn how to create a real-time feature engineering pipeline, write real-time features,
+and build a TikTok-style recommender system using the Hopsworks Feature Store.
+
+## Clone the tutorials repository
+```bash
+git clone https://github.com/logicalclocks/hopsworks-tutorials
+cd ~/hopsworks-tutorials/advanced_tutorials/tiktok_recsys
+```
+
+## Install the required Python libraries
+For the tutorial to work, install the required Python libraries:
+```bash
+cd ./python
+pip install -r requirements.txt
+```
+
+Once you have the above, define the following environment variables:
+
+## Define environment variables
+```bash
+export HOPSWORKS_HOST=REPLACE_WITH_YOUR_HOPSWORKS_CLUSTER_HOST
+export HOPSWORKS_PROJECT_NAME=REPLACE_WITH_YOUR_HOPSWORKS_PROJECT_NAME
+export HOPSWORKS_API_KEY=REPLACE_WITH_YOUR_HOPSWORKS_API_KEY
+export MAX_ID_RANGE=100
+export RECORDS_PER_SECOND=10
+export PARALLELISM=1
+```
+
+## Create the feature groups
+Full documentation on how to create feature groups with the HSFS API can be found [here](https://docs.hopsworks.ai/latest/user_guides/fs/feature_group/create/).
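+
+The three setup scripts below create the `interactions`, `user_window_agg_1h` and `video_window_agg_1h` stream
+feature groups that the Flink job writes into. For orientation, each script boils down to something like the
+following: a minimal sketch, assuming the environment variables above are set; the exact column lists live in
+the setup scripts themselves and mirror the Avro schemas under `java/src/main/avro`:
+
+```python
+import hopsworks
+from hsfs.feature import Feature
+
+# Logs in using HOPSWORKS_HOST / HOPSWORKS_PROJECT_NAME / HOPSWORKS_API_KEY
+project = hopsworks.login()
+fs = project.get_feature_store()
+
+# Declare the schema explicitly: there is no DataFrame to infer it from
+features = [
+    Feature(name="user_id", type="bigint"),
+    Feature(name="category_id", type="bigint"),
+    Feature(name="window_end_time", type="timestamp"),
+    Feature(name="like_count", type="bigint"),
+    # ... remaining engagement counters ...
+]
+
+fg = fs.create_feature_group(
+    name="user_window_agg_1h",
+    version=1,
+    primary_key=["user_id"],
+    event_time="window_end_time",
+    online_enabled=True,
+    stream=True,  # required so the Flink job can insertStream() into it
+)
+fg.save(features)
+```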
+
+```bash
+python ./setup/tiktok_interactions_feature_groups.py
+python ./setup/tiktok_user_window_agg_feature_group.py
+python ./setup/tiktok_video_window_agg_feature_group.py
+```
+
+## Build the Flink pipeline
+```bash
+cd ~/hopsworks-tutorials/advanced_tutorials/tiktok_recsys/java
+mvn clean package
+```
+
+### Submit the Flink job
+```bash
+python3 ./jobs_flink_client.py --host $HOPSWORKS_HOST --api_key $HOPSWORKS_API_KEY --project $HOPSWORKS_PROJECT_NAME --job tikTokInteractions --jar ./target/flink-tiktok-0.1.0.jar --main "ai.hopsworks.tutorials.flink.tiktok.TikTokFlink" --job_arguments "-maxIdRange $MAX_ID_RANGE -recordsPerSecond $RECORDS_PER_SECOND -parallelism $PARALLELISM"
+```
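+
+### Read the computed features (optional)
+Once the job is running, aggregates land in the online feature store within seconds. A minimal sketch of
+reading them back, assuming `user_id` is the primary key; the feature view name here is hypothetical, pick
+your own:
+
+```python
+import hopsworks
+
+project = hopsworks.login()
+fs = project.get_feature_store()
+
+user_agg = fs.get_feature_group("user_window_agg_1h", version=1)
+fv = fs.get_or_create_feature_view(
+    name="user_window_agg_view",  # hypothetical name
+    version=1,
+    query=user_agg.select_all(),
+)
+
+# Low-latency lookup of the latest window for one user
+print(fv.get_feature_vector({"user_id": 42}))
+```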
diff --git a/advanced_tutorials/tiktok_recsys/java/dependency-reduced-pom.xml b/advanced_tutorials/tiktok_recsys/java/dependency-reduced-pom.xml
new file mode 100644
index 00000000..56d092c5
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/java/dependency-reduced-pom.xml
@@ -0,0 +1,123 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>ai.hopsworks</groupId>
+  <artifactId>flink-tiktok</artifactId>
+  <version>0.1.0</version>
+  <build>
+    <plugins>
+      <plugin>
+        <artifactId>maven-jar-plugin</artifactId>
+        <configuration>
+          <archive>
+            <manifest>
+              <mainClass>ai.hopsworks.tutorials.flink.tiktok.TikTokFlink</mainClass>
+            </manifest>
+          </archive>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-shade-plugin</artifactId>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <artifactSet>
+                <excludes>
+                  <exclude>org.apache.flink:force-shading</exclude>
+                  <exclude>com.google.code.findbugs:jsr305</exclude>
+                  <exclude>org.slf4j:*</exclude>
+                  <exclude>log4j:*</exclude>
+                </excludes>
+              </artifactSet>
+              <filters>
+                <filter>
+                  <artifact>*:*</artifact>
+                  <excludes>
+                    <exclude>META-INF/*.SF</exclude>
+                    <exclude>META-INF/*.DSA</exclude>
+                    <exclude>META-INF/*.RSA</exclude>
+                  </excludes>
+                </filter>
+              </filters>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>3.1</version>
+        <configuration>
+          <source>1.8</source>
+          <target>1.8</target>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.avro</groupId>
+        <artifactId>avro-maven-plugin</artifactId>
+        <version>${avro.version}</version>
+        <executions>
+          <execution>
+            <phase>generate-sources</phase>
+            <goals>
+              <goal>schema</goal>
+              <goal>protocol</goal>
+              <goal>idl-protocol</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+  <repositories>
+    <repository>
+      <id>Hops</id>
+      <name>Hops Repository</name>
+      <url>https://archiva.hops.works/repository/Hops/</url>
+    </repository>
+  </repositories>
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.flink</groupId>
+      <artifactId>flink-core</artifactId>
+      <version>1.17.0</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.flink</groupId>
+      <artifactId>flink-streaming-java</artifactId>
+      <version>1.17.0</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.flink</groupId>
+      <artifactId>flink-connector-kafka</artifactId>
+      <version>1.17.0</version>
+      <scope>provided</scope>
+      <exclusions>
+        <exclusion>
+          <artifactId>flink-shaded-hadoop2</artifactId>
+          <groupId>org.apache.flink</groupId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+  </dependencies>
+  <properties>
+    <avro.version>1.8.2</avro.version>
+    <flink.version>1.17.0</flink.version>
+    <hsfs.version>3.7.1</hsfs.version>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <maven.compiler.source>8</maven.compiler.source>
+    <maven.compiler.target>8</maven.compiler.target>
+    <junit.version>4.13.2</junit.version>
+  </properties>
+</project>
diff --git a/advanced_tutorials/tiktok_recsys/java/jobs_flink_client.py b/advanced_tutorials/tiktok_recsys/java/jobs_flink_client.py
new file mode 100644
index 00000000..70a35fc4
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/java/jobs_flink_client.py
@@ -0,0 +1,88 @@
+import hopsworks
+import argparse
+import time
+
+
+def connect(args):
+    project = hopsworks.login(
+        host=args.host, port=args.port, project=args.project, api_key_value=args.api_key
+    )
+    return project.get_flink_cluster_api()
+
+
+def setup_cluster(flink_cluster_api, args):
+    flink_job_config = {'type': 'flinkJobConfiguration', 'amQueue': 'default', 'amMemory': args.job_manager_mbs,
+                        'amVCores': 1, 'jobmanager.heap.size': args.job_manager_mbs,
+                        'taskmanager.numberOfTaskSlots': args.slots,
+                        'taskmanager.heap.size': args.task_manager_mbs, 'jobType': 'FLINK', "appName": args.job}
+
+    try:
+        # Reuse an existing cluster with this name; stop any job still running on it
+        producer_cluster = flink_cluster_api.get_cluster(args.job)
+        flink_cluster_jobs = producer_cluster.get_jobs()
+        for job_id in flink_cluster_jobs:
+            job_state = producer_cluster.job_state(job_id)
+            if job_state == "RUNNING":
+                producer_cluster.stop_job(job_id=job_id)
+        return producer_cluster
+    except Exception:
+        # No cluster with this name exists yet, so create one
+        return flink_cluster_api.setup_cluster(name=args.job, config=flink_job_config)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Hopsworks cluster configuration
+    parser.add_argument("--host", help="Hopsworks cluster host")
+    parser.add_argument(
+        "--port", help="Port on which Hopsworks is listening", default=443
+    )
+    parser.add_argument("--api_key", help="API key to authenticate with Hopsworks")
+    parser.add_argument("--project", help="Name of the Hopsworks project to connect to")
+
+    # Flink cluster configuration
+    parser.add_argument(
+        "--job", default="flinkcluster", help="Flink job name in Hopsworks"
+    )
+    parser.add_argument(
+        "--job_manager_mbs",
+        default=4048,
+        help="Memory of the Flink job manager in MB",
+    )
+    parser.add_argument(
+        "--task_manager_mbs",
+        default=4048,
+        help="Memory of the Flink task managers in MB",
+    )
+    parser.add_argument("--slots", default=1, help="Number of slots per TaskManager")
+
+    # User application configuration
+    parser.add_argument("--jar", help="The Flink job jar file")
+    parser.add_argument(
+        "--main",
+        help="The entry point to the application, the class with the main function",
+    )
+    parser.add_argument("--job_arguments", help="Flink job runtime arguments")
+
+    args = parser.parse_args()
+
+    # Set up the connection to Hopsworks
+    jobs_api = connect(args)
+
+    # Set up the Flink cluster
+    flink_cluster = setup_cluster(jobs_api, args)
+
+    if flink_cluster._count_ongoing_executions() > 0:
+        flink_cluster_execution = jobs_api.get_cluster(args.job)
+    else:
+        flink_cluster_execution = flink_cluster.start()
+
+    flink_cluster_execution.upload_jar(args.jar)
+
+    # Submit the user jar
+    jar_metadatas = flink_cluster_execution.get_jars()
+    jar_metadata = jar_metadatas[0]
+    jar_id = jar_metadata["id"]
+    job_id = flink_cluster_execution.submit_job(jar_id, args.main, job_arguments=args.job_arguments)
+
+    # Poll the submitted job so the client keeps a handle on it
+    while True:
+        flink_cluster_job = flink_cluster_execution.get_job(job_id)
+        print("Flink job is: {}".format(flink_cluster_job["plan"]["type"]))
+        time.sleep(20)
diff --git a/advanced_tutorials/tiktok_recsys/java/pom.xml b/advanced_tutorials/tiktok_recsys/java/pom.xml
new file mode 100644
index 00000000..fec87272
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/java/pom.xml
@@ -0,0 +1,185 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>ai.hopsworks</groupId>
+    <artifactId>flink-tiktok</artifactId>
+    <version>0.1.0</version>
+
+    <properties>
+        <maven.compiler.source>8</maven.compiler.source>
+        <maven.compiler.target>8</maven.compiler.target>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <hsfs.version>3.7.1</hsfs.version>
+        <junit.version>4.13.2</junit.version>
+        <avro.version>1.8.2</avro.version>
+        <flink.version>1.17.0</flink.version>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-core</artifactId>
+            <version>${flink.version}</version>
+            <scope>provided</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-streaming-java</artifactId>
+            <version>${flink.version}</version>
+            <scope>provided</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-avro</artifactId>
+            <version>${flink.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.apache.flink</groupId>
+                    <artifactId>flink-shaded-hadoop2</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-connector-kafka</artifactId>
+            <version>${flink.version}</version>
+            <scope>provided</scope>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.apache.flink</groupId>
+                    <artifactId>flink-shaded-hadoop2</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-connector-datagen</artifactId>
+            <version>${flink.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.apache.flink</groupId>
+                    <artifactId>flink-shaded-hadoop2</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.avro</groupId>
+            <artifactId>avro</artifactId>
+            <version>${avro.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.logicalclocks</groupId>
+            <artifactId>hsfs-flink</artifactId>
+            <version>${hsfs.version}</version>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <configuration>
+                    <archive>
+                        <manifest>
+                            <mainClass>ai.hopsworks.tutorials.flink.tiktok.TikTokFlink</mainClass>
+                        </manifest>
+                    </archive>
+                </configuration>
+            </plugin>
+
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-shade-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <artifactSet>
+                                <excludes>
+                                    <exclude>org.apache.flink:force-shading</exclude>
+                                    <exclude>com.google.code.findbugs:jsr305</exclude>
+                                    <exclude>org.slf4j:*</exclude>
+                                    <exclude>log4j:*</exclude>
+                                </excludes>
+                            </artifactSet>
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*.SF</exclude>
+                                        <exclude>META-INF/*.DSA</exclude>
+                                        <exclude>META-INF/*.RSA</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.1</version>
+                <configuration>
+                    <source>1.8</source>
+                    <target>1.8</target>
+                </configuration>
+            </plugin>
+
+            <plugin>
+                <groupId>org.apache.avro</groupId>
+                <artifactId>avro-maven-plugin</artifactId>
+                <version>${avro.version}</version>
+                <executions>
+                    <execution>
+                        <phase>generate-sources</phase>
+                        <goals>
+                            <goal>schema</goal>
+                            <goal>protocol</goal>
+                            <goal>idl-protocol</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+
+    <repositories>
+        <repository>
+            <id>Hops</id>
+            <name>Hops Repository</name>
+            <url>https://archiva.hops.works/repository/Hops/</url>
+            <releases>
+                <enabled>true</enabled>
+            </releases>
+            <snapshots>
+                <enabled>true</enabled>
+            </snapshots>
+        </repository>
+    </repositories>
+</project>
\ No newline at end of file
diff --git a/advanced_tutorials/tiktok_recsys/java/src/main/avro/tiktok_interactions.avsc b/advanced_tutorials/tiktok_recsys/java/src/main/avro/tiktok_interactions.avsc
new file mode 100644
index 00000000..c9305074
--- /dev/null
+++ 
b/advanced_tutorials/tiktok_recsys/java/src/main/avro/tiktok_interactions.avsc @@ -0,0 +1,63 @@ +{ + "type" : "record", + "name" : "SourceInteractions", + "namespace" : "ai.hopsworks.tutorials.flink.tiktok.features", + "fields": [ + { + "name": "id", + "type": [ + "null", + "long" + ] + }, + { + "name": "user_id", + "type": [ + "null", + "long" + ] + }, + { + "name": "video_id", + "type": [ + "null", + "long" + ] + }, + { + "name": "category_id", + "type": [ + "null", + "long" + ] + }, + { + "name": "interaction_type", + "type": [ + "null", + "string" + ] + }, + { + "name": "watch_time", + "type": [ + "null", + "long" + ] + }, + { + "name": "interaction_date", + "type": [ + "null", + "long" + ] + }, + { + "name": "interaction_month", + "type": [ + "null", + "string" + ] + } + ] +} \ No newline at end of file diff --git a/advanced_tutorials/tiktok_recsys/java/src/main/avro/tiktok_user_agg.avsc b/advanced_tutorials/tiktok_recsys/java/src/main/avro/tiktok_user_agg.avsc new file mode 100644 index 00000000..2e16574e --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/java/src/main/avro/tiktok_user_agg.avsc @@ -0,0 +1,87 @@ +{ + "type": "record", + "name": "UserWindowAggregationSchema", + "namespace" : "ai.hopsworks.tutorials.flink.tiktok.features", + "fields": [ + { + "name": "user_id", + "type": [ + "null", + "long" + ] + }, + { + "name": "category_id", + "type": [ + "null", + "long" + ] + }, + { + "name": "window_end_time", + "type": [ + "null", + { + "type": "long", + "logicalType": "timestamp-micros" + } + ] + }, + { + "name": "interaction_month", + "type": [ + "null", + "string" + ] + }, + { + "name": "like_count", + "type": [ + "null", + "long" + ] + }, + { + "name": "dislike_count", + "type": [ + "null", + "long" + ] + }, + { + "name": "view_count", + "type": [ + "null", + "long" + ] + }, + { + "name": "comment_count", + "type": [ + "null", + "long" + ] + }, + { + "name": "share_count", + "type": [ + "null", + "long" + ] + }, + { + "name": "skip_count", + "type": [ + "null", + "long" + ] + }, + { + "name": "total_watch_time", + "type": [ + "null", + "long" + ] + } + ] +} \ No newline at end of file diff --git a/advanced_tutorials/tiktok_recsys/java/src/main/avro/tiktok_video_agg.avsc b/advanced_tutorials/tiktok_recsys/java/src/main/avro/tiktok_video_agg.avsc new file mode 100644 index 00000000..65910c8f --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/java/src/main/avro/tiktok_video_agg.avsc @@ -0,0 +1,87 @@ +{ + "type": "record", + "name": "VideoWindowAggregationSchema", + "namespace" : "ai.hopsworks.tutorials.flink.tiktok.features", + "fields": [ + { + "name": "video_id", + "type": [ + "null", + "long" + ] + }, + { + "name": "category_id", + "type": [ + "null", + "long" + ] + }, + { + "name": "window_end_time", + "type": [ + "null", + { + "type": "long", + "logicalType": "timestamp-micros" + } + ] + }, + { + "name": "interaction_month", + "type": [ + "null", + "string" + ] + }, + { + "name": "like_count", + "type": [ + "null", + "long" + ] + }, + { + "name": "dislike_count", + "type": [ + "null", + "long" + ] + }, + { + "name": "view_count", + "type": [ + "null", + "long" + ] + }, + { + "name": "comment_count", + "type": [ + "null", + "long" + ] + }, + { + "name": "share_count", + "type": [ + "null", + "long" + ] + }, + { + "name": "skip_count", + "type": [ + "null", + "long" + ] + }, + { + "name": "total_watch_time", + "type": [ + "null", + "long" + ] + } + ] +} \ No newline at end of file diff --git 
a/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/TikTokFlink.java b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/TikTokFlink.java
new file mode 100644
index 00000000..b08da8fb
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/TikTokFlink.java
@@ -0,0 +1,54 @@
+package ai.hopsworks.tutorials.flink.tiktok;
+
+import ai.hopsworks.tutorials.flink.tiktok.pipelines.TikTokStreamFeatureAggregations;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+
+public class TikTokFlink {
+  public static void main(String[] args) throws Exception {
+
+    Options options = new Options();
+
+    options.addOption(Option.builder("maxIdRange")
+        .argName("maxIdRange")
+        .required(false)
+        .hasArg()
+        .build());
+
+    options.addOption(Option.builder("recordsPerSecond")
+        .argName("recordsPerSecond")
+        .required(false)
+        .hasArg()
+        .build());
+
+    options.addOption(Option.builder("parallelism")
+        .argName("parallelism")
+        .required(false)
+        .hasArg()
+        .build());
+
+    CommandLineParser parser = new DefaultParser();
+    CommandLine commandLine = parser.parse(options, args);
+
+    // Defaults used when the corresponding CLI flag is absent
+    Long maxId = 100000000L;
+    if (commandLine.hasOption("maxIdRange")) {
+      maxId = Long.parseLong(commandLine.getOptionValue("maxIdRange"));
+    }
+
+    Long recordsPerSecond = 1000000L;
+    if (commandLine.hasOption("recordsPerSecond")) {
+      recordsPerSecond = Long.parseLong(commandLine.getOptionValue("recordsPerSecond"));
+    }
+
+    Integer parallelism = 200;
+    if (commandLine.hasOption("parallelism")) {
+      parallelism = Integer.parseInt(commandLine.getOptionValue("parallelism"));
+    }
+
+    new TikTokStreamFeatureAggregations().stream(maxId, recordsPerSecond, parallelism);
+  }
+}
diff --git a/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/features/Interactions.java b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/features/Interactions.java
new file mode 100644
index 00000000..58ec78fc
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/features/Interactions.java
@@ -0,0 +1,45 @@
+package ai.hopsworks.tutorials.flink.tiktok.features;
+
+import ai.hopsworks.tutorials.flink.tiktok.utils.TikTokInteractions;
+import org.apache.flink.api.common.functions.RichMapFunction;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.runtime.metrics.DescriptiveStatisticsHistogram;
+
+import java.time.Instant;
+
+public class Interactions extends RichMapFunction<TikTokInteractions, SourceInteractions> {
+
+  private static final int EVENT_TIME_LAG_WINDOW_SIZE = 10_000;
+
+  private transient DescriptiveStatisticsHistogram eventTimeLag;
+
+  @Override
+  public SourceInteractions map(TikTokInteractions source) throws Exception {
+    SourceInteractions interactionsFeatureGroupSchema = new SourceInteractions();
+    interactionsFeatureGroupSchema.setId(source.getInteractionId());
+    interactionsFeatureGroupSchema.setUserId(source.getUserId());
+    interactionsFeatureGroupSchema.setVideoId(source.getVideoId());
+    interactionsFeatureGroupSchema.setCategoryId(source.getCategoryId());
+    interactionsFeatureGroupSchema.setInteractionType(source.getInteractionType());
+    // interactionDate is epoch milliseconds; the feature group expects microseconds
+    interactionsFeatureGroupSchema.setInteractionDate(source.getInteractionDate() * 1000);
+    interactionsFeatureGroupSchema.setInteractionMonth(source.getInteractionMonth());
+    interactionsFeatureGroupSchema.setWatchTime(source.getWatchTime());
+
+    // update eventTimeLag
+    eventTimeLag.update(Instant.now().toEpochMilli() - source.getProcessStart());
+
+    return interactionsFeatureGroupSchema;
+  }
+
+  @Override
+  public void open(Configuration parameters) throws Exception {
+    super.open(parameters);
+
+    eventTimeLag =
+        getRuntimeContext()
+            .getMetricGroup()
+            .histogram(
+                "interactionsTimeLag",
+                new DescriptiveStatisticsHistogram(EVENT_TIME_LAG_WINDOW_SIZE));
+  }
+}
diff --git a/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/features/UserEngagementAggregation.java b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/features/UserEngagementAggregation.java
new file mode 100644
index 00000000..0e49fd5b
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/features/UserEngagementAggregation.java
@@ -0,0 +1,95 @@
+package ai.hopsworks.tutorials.flink.tiktok.features;
+
+import ai.hopsworks.tutorials.flink.tiktok.utils.TikTokInteractions;
+import org.apache.flink.api.common.functions.AggregateFunction;
+
+public class UserEngagementAggregation
+    implements AggregateFunction<TikTokInteractions, UserWindowAggregationSchema, UserWindowAggregationSchema> {
+
+  public UserEngagementAggregation() {
+  }
+
+  @Override
+  public UserWindowAggregationSchema createAccumulator() {
+    return new UserWindowAggregationSchema();
+  }
+
+  @Override
+  public UserWindowAggregationSchema add(TikTokInteractions record, UserWindowAggregationSchema accumulator) {
+
+    accumulator.setUserId(record.getUserId());
+    accumulator.setInteractionMonth(record.getInteractionMonth());
+    accumulator.setCategoryId(record.getCategoryId());
+
+    // to measure latency, will be overwritten later
+    accumulator.setWindowEndTime(record.getProcessStart());
+
+    switch (record.getInteractionType()) {
+      case "like":
+        accumulator.setLikeCount(engagementDefaultValue(accumulator.getLikeCount()) + 1);
+        break;
+      case "dislike":
+        accumulator.setDislikeCount(engagementDefaultValue(accumulator.getDislikeCount()) + 1);
+        break;
+      case "view":
+        accumulator.setViewCount(engagementDefaultValue(accumulator.getViewCount()) + 1);
+        break;
+      case "comment":
+        accumulator.setCommentCount(engagementDefaultValue(accumulator.getCommentCount()) + 1);
+        break;
+      case "share":
+        accumulator.setShareCount(engagementDefaultValue(accumulator.getShareCount()) + 1);
+        break;
+      case "skip":
+        accumulator.setSkipCount(engagementDefaultValue(accumulator.getSkipCount()) + 1);
+        break;
+    }
+    accumulator.setTotalWatchTime(engagementDefaultValue(accumulator.getTotalWatchTime())
+        + engagementDefaultValue(record.getWatchTime()));
+
+    return accumulator;
+  }
+
+  @Override
+  public UserWindowAggregationSchema getResult(UserWindowAggregationSchema accumulator) {
+    UserWindowAggregationSchema userWindowAggregationSchema = new UserWindowAggregationSchema();
+    userWindowAggregationSchema.setUserId(accumulator.getUserId());
+    userWindowAggregationSchema.setCategoryId(accumulator.getCategoryId());
+    userWindowAggregationSchema.setInteractionMonth(accumulator.getInteractionMonth());
+    // carry the process-start marker through so the process window can compute latency
+    userWindowAggregationSchema.setWindowEndTime(accumulator.getWindowEndTime());
+
+    userWindowAggregationSchema.setLikeCount(engagementDefaultValue(accumulator.getLikeCount()));
+    userWindowAggregationSchema.setDislikeCount(engagementDefaultValue(accumulator.getDislikeCount()));
+    userWindowAggregationSchema.setViewCount(engagementDefaultValue(accumulator.getViewCount()));
+    userWindowAggregationSchema.setCommentCount(engagementDefaultValue(accumulator.getCommentCount()));
+    userWindowAggregationSchema.setShareCount(engagementDefaultValue(accumulator.getShareCount()));
+    userWindowAggregationSchema.setSkipCount(engagementDefaultValue(accumulator.getSkipCount()));
+    userWindowAggregationSchema.setTotalWatchTime(engagementDefaultValue(accumulator.getTotalWatchTime()));
+    return userWindowAggregationSchema;
+  }
+
+  @Override
+  public UserWindowAggregationSchema merge(UserWindowAggregationSchema accumulator,
+                                           UserWindowAggregationSchema accumulator1) {
+    accumulator.setLikeCount(engagementDefaultValue(accumulator.getLikeCount())
+        + engagementDefaultValue(accumulator1.getLikeCount()));
+    accumulator.setDislikeCount(engagementDefaultValue(accumulator.getDislikeCount())
+        + engagementDefaultValue(accumulator1.getDislikeCount()));
+    accumulator.setViewCount(engagementDefaultValue(accumulator.getViewCount())
+        + engagementDefaultValue(accumulator1.getViewCount()));
+    accumulator.setCommentCount(engagementDefaultValue(accumulator.getCommentCount())
+        + engagementDefaultValue(accumulator1.getCommentCount()));
+    accumulator.setShareCount(engagementDefaultValue(accumulator.getShareCount())
+        + engagementDefaultValue(accumulator1.getShareCount()));
+    accumulator.setSkipCount(engagementDefaultValue(accumulator.getSkipCount())
+        + engagementDefaultValue(accumulator1.getSkipCount()));
+    accumulator.setTotalWatchTime(engagementDefaultValue(accumulator.getTotalWatchTime())
+        + engagementDefaultValue(accumulator1.getTotalWatchTime()));
+    return accumulator;
+  }
+
+  // Treat a never-touched (null) counter as zero
+  private Long engagementDefaultValue(Long engagementValue) {
+    return engagementValue == null ? 0 : engagementValue;
+  }
+}
diff --git a/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/features/UserEngagementProcessWindow.java b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/features/UserEngagementProcessWindow.java
new file mode 100644
index 00000000..3adfac33
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/features/UserEngagementProcessWindow.java
@@ -0,0 +1,47 @@
+package ai.hopsworks.tutorials.flink.tiktok.features;
+
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.runtime.metrics.DescriptiveStatisticsHistogram;
+import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
+import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
+import org.apache.flink.util.Collector;
+
+import java.time.Instant;
+
+public class UserEngagementProcessWindow extends
+    ProcessWindowFunction<UserWindowAggregationSchema, UserWindowAggregationSchema, Long, TimeWindow> {
+
+  private static final int EVENT_TIME_LAG_WINDOW_SIZE = 10_000;
+
+  private transient DescriptiveStatisticsHistogram eventTimeLag;
+
+  @Override
+  public void process(Long userId, Context context,
+                      Iterable<UserWindowAggregationSchema> iterable,
+                      Collector<UserWindowAggregationSchema> collector) {
+
+    UserWindowAggregationSchema record = iterable.iterator().next();
+
+    // the aggregation carried the process-start timestamp in the windowEndTime slot
+    Long processStart = record.getWindowEndTime();
+
+    // replace it with the real window end (epoch millis to timestamp-micros)
+    record.setWindowEndTime(context.window().getEnd() * 1000);
+
+    collector.collect(record);
+
+    // measure latency
+    //eventTimeLag.update(Instant.now().toEpochMilli() - processStart);
+  }
+
+  @Override
+  public void open(Configuration parameters) throws Exception {
+    super.open(parameters);
+
+    eventTimeLag =
+        getRuntimeContext()
+            .getMetricGroup()
+            .histogram(
+                "userEngagementEventTimeLag",
+                new DescriptiveStatisticsHistogram(EVENT_TIME_LAG_WINDOW_SIZE));
+  }
+}
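Aside: the two classes above implement Flink's `AggregateFunction` contract. `add` folds one interaction into the per-key accumulator, `merge` combines partial accumulators, and `getResult` materializes the emitted row. The same fold, shown in Python for brevity (field names simplified; this demo is ours and is not part of the job):

```python
from collections import defaultdict

def add(acc, interaction):
    """Fold one interaction into the per-key accumulator (mirrors add())."""
    acc[interaction["type"] + "_count"] += 1
    acc["total_watch_time"] += interaction["watch_time"]
    return acc

def merge(a, b):
    """Combine two partial accumulators (mirrors merge())."""
    return {k: a.get(k, 0) + b.get(k, 0) for k in set(a) | set(b)}

acc = defaultdict(int)
for e in [{"type": "like", "watch_time": 30}, {"type": "skip", "watch_time": 5}]:
    acc = add(acc, e)
print(dict(acc))  # {'like_count': 1, 'total_watch_time': 35, 'skip_count': 1}
```

Note how each counter reads its own previous value; a counter that was never touched defaults to zero, which is exactly what `engagementDefaultValue` does for the nullable Avro fields.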
diff --git a/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/features/VideoEngagementAggregation.java b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/features/VideoEngagementAggregation.java
new file mode 100644
index 00000000..23b1088a
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/features/VideoEngagementAggregation.java
@@ -0,0 +1,88 @@
+package ai.hopsworks.tutorials.flink.tiktok.features;
+
+import ai.hopsworks.tutorials.flink.tiktok.utils.TikTokInteractions;
+import org.apache.flink.api.common.functions.AggregateFunction;
+
+public class VideoEngagementAggregation
+    implements AggregateFunction<TikTokInteractions, VideoWindowAggregationSchema, VideoWindowAggregationSchema> {
+
+  public VideoEngagementAggregation() {
+  }
+
+  @Override
+  public VideoWindowAggregationSchema createAccumulator() {
+    return new VideoWindowAggregationSchema();
+  }
+
+  @Override
+  public VideoWindowAggregationSchema add(TikTokInteractions record, VideoWindowAggregationSchema accumulator) {
+    accumulator.setVideoId(record.getVideoId());
+    accumulator.setInteractionMonth(record.getInteractionMonth());
+    accumulator.setCategoryId(record.getCategoryId());
+
+    // carry the process-start timestamp; overwritten with the real window end later
+    accumulator.setWindowEndTime(record.getProcessStart());
+
+    switch (String.valueOf(record.getInteractionType())) {
+      case "like":
+        accumulator.setLikeCount(engagementDefaultValue(accumulator.getLikeCount()) + 1);
+        break;
+      case "dislike":
+        accumulator.setDislikeCount(engagementDefaultValue(accumulator.getDislikeCount()) + 1);
+        break;
+      case "view":
+        accumulator.setViewCount(engagementDefaultValue(accumulator.getViewCount()) + 1);
+        break;
+      case "comment":
+        accumulator.setCommentCount(engagementDefaultValue(accumulator.getCommentCount()) + 1);
+        break;
+      case "share":
+        accumulator.setShareCount(engagementDefaultValue(accumulator.getShareCount()) + 1);
+        break;
+      case "skip":
+        accumulator.setSkipCount(engagementDefaultValue(accumulator.getSkipCount()) + 1);
+        break;
+    }
+    accumulator.setTotalWatchTime(engagementDefaultValue(accumulator.getTotalWatchTime())
+        + engagementDefaultValue(record.getWatchTime()));
+
+    return accumulator;
+  }
+
+  @Override
+  public VideoWindowAggregationSchema getResult(VideoWindowAggregationSchema accumulator) {
+    VideoWindowAggregationSchema videoWindowAggregationSchema = new VideoWindowAggregationSchema();
+    videoWindowAggregationSchema.setVideoId(accumulator.getVideoId());
+    videoWindowAggregationSchema.setCategoryId(accumulator.getCategoryId());
+    videoWindowAggregationSchema.setInteractionMonth(accumulator.getInteractionMonth());
+    videoWindowAggregationSchema.setWindowEndTime(accumulator.getWindowEndTime());
+
+    videoWindowAggregationSchema.setLikeCount(engagementDefaultValue(accumulator.getLikeCount()));
+    videoWindowAggregationSchema.setDislikeCount(engagementDefaultValue(accumulator.getDislikeCount()));
+    videoWindowAggregationSchema.setViewCount(engagementDefaultValue(accumulator.getViewCount()));
+    videoWindowAggregationSchema.setCommentCount(engagementDefaultValue(accumulator.getCommentCount()));
+    videoWindowAggregationSchema.setShareCount(engagementDefaultValue(accumulator.getShareCount()));
+    videoWindowAggregationSchema.setSkipCount(engagementDefaultValue(accumulator.getSkipCount()));
+    videoWindowAggregationSchema.setTotalWatchTime(engagementDefaultValue(accumulator.getTotalWatchTime()));
+
+    return videoWindowAggregationSchema;
+  }
+
+  @Override
+  public VideoWindowAggregationSchema merge(VideoWindowAggregationSchema accumulator,
+                                            VideoWindowAggregationSchema accumulator1) {
+    accumulator.setLikeCount(engagementDefaultValue(accumulator.getLikeCount())
+        + engagementDefaultValue(accumulator1.getLikeCount()));
+    accumulator.setDislikeCount(engagementDefaultValue(accumulator.getDislikeCount())
+        + engagementDefaultValue(accumulator1.getDislikeCount()));
+    accumulator.setViewCount(engagementDefaultValue(accumulator.getViewCount())
+        + engagementDefaultValue(accumulator1.getViewCount()));
+    accumulator.setCommentCount(engagementDefaultValue(accumulator.getCommentCount())
+        + engagementDefaultValue(accumulator1.getCommentCount()));
+    accumulator.setShareCount(engagementDefaultValue(accumulator.getShareCount())
+        + engagementDefaultValue(accumulator1.getShareCount()));
+    accumulator.setSkipCount(engagementDefaultValue(accumulator.getSkipCount())
+        + engagementDefaultValue(accumulator1.getSkipCount()));
+    accumulator.setTotalWatchTime(engagementDefaultValue(accumulator.getTotalWatchTime())
+        + engagementDefaultValue(accumulator1.getTotalWatchTime()));
+    return accumulator;
+  }
+
+  // Treat a never-touched (null) counter as zero
+  private Long engagementDefaultValue(Long engagementValue) {
+    return engagementValue == null ? 0 : engagementValue;
+  }
+}
diff --git a/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/features/VideoEngagementProcessWindow.java b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/features/VideoEngagementProcessWindow.java
new file mode 100644
index 00000000..9f87daf5
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/features/VideoEngagementProcessWindow.java
@@ -0,0 +1,50 @@
+package ai.hopsworks.tutorials.flink.tiktok.features;
+
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.runtime.metrics.DescriptiveStatisticsHistogram;
+import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
+import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
+import org.apache.flink.util.Collector;
+
+import java.time.Instant;
+
+public class VideoEngagementProcessWindow
+    extends ProcessWindowFunction<VideoWindowAggregationSchema, VideoWindowAggregationSchema, Long, TimeWindow> {
+
+  private static final int EVENT_TIME_LAG_WINDOW_SIZE = 10_000;
+
+  private transient DescriptiveStatisticsHistogram eventTimeLag;
+
+  public VideoEngagementProcessWindow() {
+  }
+
+  @Override
+  public void process(Long videoId, Context context,
+                      Iterable<VideoWindowAggregationSchema> iterable,
+                      Collector<VideoWindowAggregationSchema> collector) throws Exception {
+    VideoWindowAggregationSchema record = iterable.iterator().next();
+
+    // the aggregation carried the process-start timestamp in the windowEndTime slot
+    Long processStart = record.getWindowEndTime();
+
+    // replace it with the real window end (epoch millis to timestamp-micros)
+    record.setWindowEndTime(context.window().getEnd() * 1000);
+
+    //eventTimeLag.update(Instant.now().toEpochMilli() - processStart);
+    collector.collect(record);
+  }
+
+  @Override
+  public void open(Configuration parameters) throws Exception {
+    super.open(parameters);
+
+    eventTimeLag =
+        getRuntimeContext()
+            .getMetricGroup()
+            .histogram(
+                "videoEngagementEventTimeLag",
+                new DescriptiveStatisticsHistogram(EVENT_TIME_LAG_WINDOW_SIZE));
+  }
+}
diff --git a/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/pipelines/InteractionsEventsGenerator.java b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/pipelines/InteractionsEventsGenerator.java
new file mode 100644
index 00000000..5f743a7e
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/pipelines/InteractionsEventsGenerator.java
@@ -0,0 +1,118 @@
+package ai.hopsworks.tutorials.flink.tiktok.pipelines;
+
+import ai.hopsworks.tutorials.flink.tiktok.features.SourceInteractions;
+import ai.hopsworks.tutorials.flink.tiktok.simulators.InteractionsGenerator;
+import ai.hopsworks.tutorials.flink.tiktok.utils.InteractionsEventKafkaSync;
+import ai.hopsworks.tutorials.flink.tiktok.utils.TikTokInteractions;
+import ai.hopsworks.tutorials.flink.tiktok.utils.Utils;
+
+import org.apache.flink.api.common.eventtime.WatermarkStrategy;
+import org.apache.flink.api.common.functions.MapFunction;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy;
+import org.apache.flink.connector.base.DeliveryGuarantee;
+import org.apache.flink.connector.datagen.source.DataGeneratorSource;
+import org.apache.flink.connector.kafka.sink.KafkaSink;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+
+import java.time.Instant;
+import java.time.temporal.ChronoUnit;
+import java.util.Properties;
+
+public class InteractionsEventsGenerator {
+  Utils utils = new Utils();
+
+  public void run(String topicName, Long recordsPerSecond, Integer parallelism) throws Exception {
+
+    // Start event time one week before the current instant
+    Instant now = Instant.now();
+    Instant startTime = now.minus(7, ChronoUnit.DAYS);
+
+    // set up the streaming execution environment
+    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+    env.setParallelism(parallelism);
+
+    DataGeneratorSource<TikTokInteractions> generatorSource =
+        new DataGeneratorSource<>(
+            new InteractionsGenerator(recordsPerSecond, startTime),
+            Long.MAX_VALUE,
+            RateLimiterStrategy.perSecond(recordsPerSecond),
+            TypeInformation.of(TikTokInteractions.class));
+
+    DataStream<SourceInteractions> simEvents =
+        env.fromSource(generatorSource,
+                WatermarkStrategy.noWatermarks(),
+                "Generator Source")
+            //.setParallelism(parallelism)
+            .rescale()
+            .rebalance()
+            .keyBy(TikTokInteractions::getUserId)
+            .map(new MapFunction<TikTokInteractions, SourceInteractions>() {
+              @Override
+              public SourceInteractions map(TikTokInteractions tikTokInteractions) throws Exception {
+                SourceInteractions sourceInteractions = new SourceInteractions();
+                sourceInteractions.setId(tikTokInteractions.getInteractionId());
+                sourceInteractions.setUserId(tikTokInteractions.getUserId());
+                sourceInteractions.setVideoId(tikTokInteractions.getVideoId());
+                sourceInteractions.setCategoryId(tikTokInteractions.getCategoryId());
+                sourceInteractions.setInteractionType(tikTokInteractions.getInteractionType());
+                sourceInteractions.setInteractionDate(tikTokInteractions.getInteractionDate());
+                sourceInteractions.setInteractionMonth(tikTokInteractions.getInteractionMonth());
+                sourceInteractions.setWatchTime(tikTokInteractions.getWatchTime());
+                return sourceInteractions;
+              }
+            });
+
+    Properties kafkaConfig = utils.getKafkaProperties(topicName);
+
+    KafkaSink<SourceInteractions> sink = KafkaSink.<SourceInteractions>builder()
+        .setKafkaProducerConfig(kafkaConfig)
+        .setBootstrapServers(kafkaConfig.getProperty("bootstrap.servers"))
+        .setRecordSerializer(new InteractionsEventKafkaSync(topicName))
+        .setDeliveryGuarantee(DeliveryGuarantee.AT_LEAST_ONCE)
+        .build();
+
+    simEvents.sinkTo(sink);
+
+    env.execute();
+  }
+
+  public static void main(String[] args) throws Exception {
+
+    Options options = new Options();
+
+    options.addOption(Option.builder("topicName")
+        .argName("topicName")
+        .required(true)
+        .hasArg()
+        .build());
+
+    options.addOption(Option.builder("recordsPerSecond")
+        .argName("recordsPerSecond")
+        .required(true)
.hasArg() + .build()); + + options.addOption(Option.builder("parallelism") + .argName("parallelism") + .required(true) + .hasArg() + .build()); + + CommandLineParser parser = new DefaultParser(); + CommandLine commandLine = parser.parse(options, args); + + String topicName = commandLine.getOptionValue("topicName"); + Long recordsPerSecond = Long.parseLong(commandLine.getOptionValue("recordsPerSecond")); + Integer parallelism = Integer.parseInt(commandLine.getOptionValue("parallelism")); + + InteractionsEventsGenerator interactionsEventsProducer = new InteractionsEventsGenerator(); + interactionsEventsProducer.run(topicName, recordsPerSecond, parallelism); + } +} diff --git a/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/pipelines/TikTokStreamFeatureAggregations.java b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/pipelines/TikTokStreamFeatureAggregations.java new file mode 100644 index 00000000..fe146cc6 --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/pipelines/TikTokStreamFeatureAggregations.java @@ -0,0 +1,130 @@ +package ai.hopsworks.tutorials.flink.tiktok.pipelines; + +import ai.hopsworks.tutorials.flink.tiktok.features.SourceInteractions; +import ai.hopsworks.tutorials.flink.tiktok.features.UserWindowAggregationSchema; +import ai.hopsworks.tutorials.flink.tiktok.features.UserEngagementAggregation; +import ai.hopsworks.tutorials.flink.tiktok.features.UserEngagementProcessWindow; +import ai.hopsworks.tutorials.flink.tiktok.features.VideoWindowAggregationSchema; +import ai.hopsworks.tutorials.flink.tiktok.features.VideoEngagementAggregation; +import ai.hopsworks.tutorials.flink.tiktok.features.VideoEngagementProcessWindow; +import ai.hopsworks.tutorials.flink.tiktok.simulators.InteractionsGenerator; +import ai.hopsworks.tutorials.flink.tiktok.utils.TikTokInteractions; + +import com.logicalclocks.hsfs.flink.FeatureStore; +import com.logicalclocks.hsfs.flink.HopsworksConnection; +import com.logicalclocks.hsfs.flink.StreamFeatureGroup; + +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.functions.MapFunction; +import org.apache.flink.api.common.restartstrategy.RestartStrategies; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy; +import org.apache.flink.connector.datagen.source.DataGeneratorSource; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows; +import org.apache.flink.streaming.api.windowing.time.Time; + +import java.time.Duration; +import java.time.Instant; +import java.time.temporal.ChronoUnit; + +public class TikTokStreamFeatureAggregations { + + public static final int CHECKPOINTING_INTERVAL_MS = 5000; + private static final String JOB_NAME = "TikTok Streaming Pipeline"; + + private FeatureStore featureStore; + + public TikTokStreamFeatureAggregations() throws Exception { + //get feature store handle + HopsworksConnection hopsworksConnection = HopsworksConnection.builder().build(); + + featureStore = hopsworksConnection.getFeatureStore(); + } + + public void stream(Long maxId, Long recordsPerSecond, Integer parallelism) throws Exception { + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 
+    env.setParallelism(parallelism);
+
+    // Configure the environment before execute(), which blocks until the job terminates
+    //env.enableCheckpointing(CHECKPOINTING_INTERVAL_MS);
+    env.setRestartStrategy(RestartStrategies.noRestart());
+
+    // Set up the sliding-window aggregations; only the 10-minute window sliding by 5 minutes is enabled here
+    //interactionSlidingWindow(env, 60, 30, maxId, recordsPerSecond, parallelism);
+    interactionSlidingWindow(env, 10, 5, maxId, recordsPerSecond, parallelism);
+
+    env.execute(JOB_NAME);
+  }
+
+  private void interactionSlidingWindow(StreamExecutionEnvironment env,
+                                        int windowSizeMinutes,
+                                        int slideSizeMinutes,
+                                        Long maxId,
+                                        Long recordsPerSecond,
+                                        int parallelism) throws Exception {
+
+    // Start event time one week before the current instant
+    Instant now = Instant.now();
+    Instant startTime = now.minus(7, ChronoUnit.DAYS);
+
+    // get the stream feature groups to write into
+    StreamFeatureGroup interactionsFeatureGroup = featureStore.getStreamFeatureGroup("interactions", 1);
+    StreamFeatureGroup userWindowAgg = featureStore.getStreamFeatureGroup("user_window_agg_1h", 1);
+    StreamFeatureGroup videoWindowAgg = featureStore.getStreamFeatureGroup("video_window_agg_1h", 1);
+
+    WatermarkStrategy<TikTokInteractions> customWatermark = WatermarkStrategy
+        .<TikTokInteractions>forBoundedOutOfOrderness(Duration.ofSeconds(30))
+        .withTimestampAssigner((event, timestamp) -> event.getInteractionDate());
+
+    DataGeneratorSource<TikTokInteractions> generatorSource =
+        new DataGeneratorSource<>(
+            new InteractionsGenerator(maxId, startTime),
+            Long.MAX_VALUE,
+            RateLimiterStrategy.perSecond(recordsPerSecond),
+            TypeInformation.of(TikTokInteractions.class));
+
+    DataStream<TikTokInteractions> simEvents =
+        env.fromSource(generatorSource,
+                WatermarkStrategy.noWatermarks(),
+                "Generator Source")
+            .setParallelism(parallelism)
+            .rescale()
+            .rebalance();
+
+    // define feature aggregate streams
+    DataStream<SourceInteractions> sourceInteractions =
+        simEvents
+            .keyBy(TikTokInteractions::getUserId)
+            .map((MapFunction<TikTokInteractions, SourceInteractions>) tikTokInteractions -> {
+              SourceInteractions sourceInteractions1 = new SourceInteractions();
+              sourceInteractions1.setId(tikTokInteractions.getInteractionId());
+              sourceInteractions1.setUserId(tikTokInteractions.getUserId());
+              sourceInteractions1.setVideoId(tikTokInteractions.getVideoId());
+              sourceInteractions1.setCategoryId(tikTokInteractions.getCategoryId());
+              sourceInteractions1.setInteractionType(tikTokInteractions.getInteractionType());
+              // epoch millis to microseconds, as expected by the feature group
+              sourceInteractions1.setInteractionDate(tikTokInteractions.getInteractionDate() * 1000);
+              sourceInteractions1.setInteractionMonth(tikTokInteractions.getInteractionMonth());
+              sourceInteractions1.setWatchTime(tikTokInteractions.getWatchTime());
+              return sourceInteractions1;
+            });
+
+    DataStream<UserWindowAggregationSchema> userAggregationStream =
+        simEvents.assignTimestampsAndWatermarks(customWatermark)
+            .keyBy(TikTokInteractions::getUserId)
+            .window(SlidingEventTimeWindows.of(Time.minutes(windowSizeMinutes), Time.minutes(slideSizeMinutes)))
+            .aggregate(new UserEngagementAggregation(), new UserEngagementProcessWindow());
+
+    DataStream<VideoWindowAggregationSchema> videoAggregationStream =
+        simEvents.assignTimestampsAndWatermarks(customWatermark)
+            .keyBy(TikTokInteractions::getVideoId)
+            .window(SlidingEventTimeWindows.of(Time.minutes(windowSizeMinutes), Time.minutes(slideSizeMinutes)))
+            .aggregate(new VideoEngagementAggregation(), new VideoEngagementProcessWindow());
+
+    // insert the streams into their feature groups
+    interactionsFeatureGroup.insertStream(sourceInteractions);
+    userWindowAgg.insertStream(userAggregationStream);
+    videoWindowAgg.insertStream(videoAggregationStream);
+  }
+}
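Aside: `SlidingEventTimeWindows.of(Time.minutes(10), Time.minutes(5))` above assigns every interaction to size/slide = 2 overlapping windows, so each key emits a refreshed aggregate every five minutes covering the trailing ten. A small sketch of the start-timestamp arithmetic involved, in Python for brevity (it mirrors Flink's window-start computation for a zero offset; the helper is ours):

```python
def window_starts(t, size, slide):
    """Start times of all sliding windows covering event time t (all in ms)."""
    last_start = t - (t % slide)          # floor t to the slide grid
    starts = []
    start = last_start
    while start > t - size:
        starts.append(start)              # window covers [start, start + size)
        start -= slide
    return starts

# An event at 12:03 falls into the 10-minute windows starting 12:00 and 11:55
t = (12 * 60 + 3) * 60 * 1000
print(window_starts(t, size=10 * 60 * 1000, slide=5 * 60 * 1000))
# [43200000, 42900000]
```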
diff --git a/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/simulators/InteractionsGenerator.java b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/simulators/InteractionsGenerator.java
new file mode 100644
index 00000000..743421b7
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/simulators/InteractionsGenerator.java
@@ -0,0 +1,106 @@
+package ai.hopsworks.tutorials.flink.tiktok.simulators;
+
+import ai.hopsworks.tutorials.flink.tiktok.utils.TikTokInteractions;
+import org.apache.flink.api.connector.source.SourceReaderContext;
+import org.apache.flink.connector.datagen.source.GeneratorFunction;
+
+import java.text.SimpleDateFormat;
+import java.time.Instant;
+import java.time.temporal.ChronoUnit;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+
+public class InteractionsGenerator implements GeneratorFunction<Long, TikTokInteractions> {
+
+  private final long maxInteractionId;
+
+  private long interactionId = 0;
+
+  private final Random randomNumber = new Random();
+
+  private final List<String> interactionTypes = Arrays.asList("like", "view", "dislike", "comment", "share", "skip");
+  private final List<Long> videoCategories = Arrays.asList(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L);
+
+  SimpleDateFormat monthFormat = new SimpleDateFormat("yyyy-MM");
+
+  Instant startTime;
+
+  public InteractionsGenerator(long maxInteractionId, Instant startTime) {
+    this.maxInteractionId = maxInteractionId;
+    this.startTime = startTime;
+  }
+
+  @Override
+  public void open(SourceReaderContext readerContext) throws Exception {
+    GeneratorFunction.super.open(readerContext);
+  }
+
+  @Override
+  public void close() throws Exception {
+    GeneratorFunction.super.close();
+  }
+
+  @Override
+  public TikTokInteractions map(Long aLong) throws Exception {
+    return interactionEventGenerator(userIdGenerator(), videoIdGenerator(),
+        videoCategoryTypeGenerator(), interactionTypeGenerator(),
+        watchTimeGenerator());
+  }
+
+  // Monotonically increasing id that wraps around at maxInteractionId
+  private void interactionIdGenerator() {
+    if (this.interactionId == this.maxInteractionId) {
+      this.interactionId = 0;
+    } else {
+      this.interactionId++;
+    }
+  }
+
+  private Long userIdGenerator() {
+    long leftLimit = 0L;
+    long rightLimit = 100L;
+    return leftLimit + (long) (Math.random() * (rightLimit - leftLimit));
+  }
+
+  private Long videoIdGenerator() {
+    long leftLimit = 0L;
+    long rightLimit = 100L;
+    return leftLimit + (long) (Math.random() * (rightLimit - leftLimit));
+  }
+
+  private String interactionTypeGenerator() {
+    return interactionTypes.get(randomNumber.nextInt(interactionTypes.size()));
+  }
+
+  private Long videoCategoryTypeGenerator() {
+    // sample uniformly over all video categories
+    return videoCategories.get(randomNumber.nextInt(videoCategories.size()));
+  }
+
+  private Long watchTimeGenerator() {
+    long leftLimit = 10L;
+    long rightLimit = 250L;
+    return leftLimit + (long) (Math.random() * (rightLimit - leftLimit));
+  }
+
+  // Event time advances one second per generated event, starting from startTime
+  private void timestampGenerator(TikTokInteractions tikTokInteractions) {
+    this.startTime = this.startTime.plus(1, ChronoUnit.SECONDS);
+    tikTokInteractions.setInteractionDate(startTime.toEpochMilli());
+    tikTokInteractions.setInteractionMonth(this.monthFormat.format(startTime.toEpochMilli()));
+  }
+
+  private TikTokInteractions interactionEventGenerator(Long userId, Long videoId, Long videoCategory,
+                                                       String interactionType, Long watchTime) {
+
+    interactionIdGenerator();
+
+    TikTokInteractions tikTokInteractions = new TikTokInteractions();
+    tikTokInteractions.setInteractionId(interactionId);
+    tikTokInteractions.setUserId(userId);
+    tikTokInteractions.setVideoId(videoId);
+    tikTokInteractions.setCategoryId(videoCategory);
+    tikTokInteractions.setInteractionType(interactionType);
+    tikTokInteractions.setWatchTime(watchTime);
+    timestampGenerator(tikTokInteractions);
+    return tikTokInteractions;
+  }
+}
diff --git a/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/utils/InteractionsEventKafkaSource.java b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/utils/InteractionsEventKafkaSource.java
new file mode 100644
index 00000000..94c7b46f
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/utils/InteractionsEventKafkaSource.java
@@ -0,0 +1,78 @@
+package ai.hopsworks.tutorials.flink.tiktok.utils;
+
+import ai.hopsworks.tutorials.flink.tiktok.features.SourceInteractions;
+import lombok.SneakyThrows;
+import org.apache.avro.io.BinaryDecoder;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.io.DecoderFactory;
+import org.apache.avro.specific.SpecificDatumReader;
+import org.apache.flink.api.common.serialization.DeserializationSchema;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema;
+import org.apache.flink.streaming.connectors.kafka.KafkaDeserializationSchema;
+import org.apache.flink.util.Collector;
+import org.apache.kafka.clients.consumer.ConsumerRecord;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.time.Instant;
+
+public class InteractionsEventKafkaSource implements KafkaDeserializationSchema<TikTokInteractions>,
+    KafkaRecordDeserializationSchema<TikTokInteractions> {
+
+  @Override
+  public void open(DeserializationSchema.InitializationContext context) throws Exception {
+    KafkaRecordDeserializationSchema.super.open(context);
+  }
+
+  @Override
+  public boolean isEndOfStream(TikTokInteractions sourceInteractions) {
+    return false;
+  }
+
+  @Override
+  public TikTokInteractions deserialize(ConsumerRecord<byte[], byte[]> consumerRecord) throws Exception {
+    byte[] message = consumerRecord.value();
+
+    // Avro-decode the Kafka value into the generated SourceInteractions record
+    SourceInteractions sourceInteractions = new SourceInteractions();
+    ByteArrayInputStream in = new ByteArrayInputStream(message);
+    DatumReader<SourceInteractions> userDatumReader = new SpecificDatumReader<>(sourceInteractions.getSchema());
+    BinaryDecoder decoder = DecoderFactory.get().directBinaryDecoder(in, null);
+    sourceInteractions = userDatumReader.read(null, decoder);
+
+    return getTikTokInteractions(sourceInteractions);
+  }
+
+  private static TikTokInteractions getTikTokInteractions(SourceInteractions sourceInteractions) {
+    TikTokInteractions interactions = new TikTokInteractions();
+    interactions.setInteractionId(sourceInteractions.getId());
+    interactions.setUserId(sourceInteractions.getUserId());
+    interactions.setVideoId(sourceInteractions.getVideoId());
+    interactions.setCategoryId(sourceInteractions.getCategoryId());
+    interactions.setInteractionType(String.valueOf(sourceInteractions.getInteractionType()));
+    interactions.setInteractionDate(sourceInteractions.getInteractionDate());
+    interactions.setInteractionMonth(String.valueOf(sourceInteractions.getInteractionMonth()));
+    interactions.setWatchTime(sourceInteractions.getWatchTime());
+    return interactions;
+  }
+
+  @SneakyThrows
+  @Override
+  public void deserialize(ConsumerRecord<byte[], byte[]> consumerRecord, Collector<TikTokInteractions> collector)
+      throws IOException {
+    long deserializeStart = Instant.now().toEpochMilli();
+    TikTokInteractions sourceInteractions = deserialize(consumerRecord);
+    sourceInteractions.setProcessStart(deserializeStart);
+    collector.collect(sourceInteractions);
+  }
+
+  @Override
+  public TypeInformation<TikTokInteractions> getProducedType() {
+    return TypeInformation.of(TikTokInteractions.class);
+  }
+}
diff --git a/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/utils/InteractionsEventKafkaSync.java b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/utils/InteractionsEventKafkaSync.java
new file mode 100644
index 00000000..0aabe058
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/utils/InteractionsEventKafkaSync.java
@@ -0,0 +1,67 @@
+package ai.hopsworks.tutorials.flink.tiktok.utils;
+
+import ai.hopsworks.tutorials.flink.tiktok.features.SourceInteractions;
+import lombok.SneakyThrows;
+import org.apache.avro.io.BinaryEncoder;
+import org.apache.avro.io.DatumWriter;
+import org.apache.avro.io.EncoderFactory;
+import org.apache.avro.specific.SpecificDatumWriter;
+import org.apache.flink.api.common.serialization.SerializationSchema;
+import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
+import org.apache.flink.runtime.metrics.DescriptiveStatisticsHistogram;
+import org.apache.kafka.clients.producer.ProducerRecord;
+
+import javax.annotation.Nullable;
+import java.io.ByteArrayOutputStream;
+import java.nio.charset.StandardCharsets;
+import java.time.Instant;
+
+public class InteractionsEventKafkaSync implements KafkaRecordSerializationSchema<SourceInteractions> {
+
+  private static final int EVENT_TIME_LAG_WINDOW_SIZE = 10_000;
+
+  private transient DescriptiveStatisticsHistogram eventTimeLag;
+
+  private final String topic;
+
+  public InteractionsEventKafkaSync(String topic) {
+    this.topic = topic;
+  }
+
+  @SneakyThrows
+  public byte[] serializeValue(SourceInteractions interactionEvent) {
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
+    DatumWriter<SourceInteractions> dataFileWriter = new SpecificDatumWriter<>(SourceInteractions.class);
+    dataFileWriter.write(interactionEvent, encoder);
+    encoder.flush();
+    return out.toByteArray();
+  }
+
+  public byte[] serializeKey(SourceInteractions interactionEvent) {
+    return String.valueOf(interactionEvent.getUserId()).getBytes(StandardCharsets.UTF_8);
+  }
+
+  @Override
+  public void open(SerializationSchema.InitializationContext context, KafkaSinkContext sinkContext) throws Exception {
+    KafkaRecordSerializationSchema.super.open(context, sinkContext);
+    eventTimeLag =
+        context
+            .getMetricGroup()
+            .histogram(
+                "interactionsEventKafkaSyncLag",
+                new DescriptiveStatisticsHistogram(EVENT_TIME_LAG_WINDOW_SIZE));
+  }
+
+  @Nullable
+  @Override
+  public ProducerRecord<byte[], byte[]> serialize(SourceInteractions sourceInteractions,
+      KafkaSinkContext kafkaSinkContext, Long timestamp) {
+    byte[] key = this.serializeKey(sourceInteractions);
+    byte[] value = this.serializeValue(sourceInteractions);
+    eventTimeLag.update(Instant.now().toEpochMilli() - sourceInteractions.getInteractionDate());
+
+    return new ProducerRecord<>(topic, null, timestamp, key, value);
+  }
+}
\ No newline at end of file
diff --git 
a/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/utils/TikTokInteractions.java b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/utils/TikTokInteractions.java new file mode 100644 index 00000000..940a38f9 --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/utils/TikTokInteractions.java @@ -0,0 +1,86 @@ +package ai.hopsworks.tutorials.flink.tiktok.utils; + +public class TikTokInteractions { + private Long interactionId; + private Long userId; + private Long videoId; + private Long categoryId; + private String interactionType; + private Long watchTime; + private Long interactionDate; + private String interactionMonth; + private Long processStart; + + + public void setInteractionId(Long interactionId) { + this.interactionId = interactionId; + } + + public Long getInteractionId() { + return interactionId; + } + + public void setUserId(Long userId) { + this.userId = userId; + } + + public Long getUserId() { + return userId; + } + + public void setVideoId(Long videoId) { + this.videoId = videoId; + } + + public Long getVideoId() { + return videoId; + } + + public void setCategoryId(Long categoryId) { + this.categoryId = categoryId; + } + + public Long getCategoryId() { + return categoryId; + } + + public void setInteractionType(String interactionType) { + this.interactionType = interactionType; + } + + public String getInteractionType() { + return interactionType; + } + + public void setWatchTime(Long watchTime) { + this.watchTime = watchTime; + } + + public Long getWatchTime() { + return watchTime; + } + + public void setInteractionDate(Long interactionDate) { + this.interactionDate = interactionDate; + } + + public Long getInteractionDate() { + return interactionDate; + } + + public void setInteractionMonth(String interactionMonth) { + this.interactionMonth = interactionMonth; + } + + public String getInteractionMonth() { + return interactionMonth; + } + + public void setProcessStart(Long processStart) { + this.processStart = processStart; + } + + public Long getProcessStart() { + return processStart; + } +} diff --git a/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/utils/Utils.java b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/utils/Utils.java new file mode 100644 index 00000000..029ec4ee --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/java/src/main/java/ai/hopsworks/tutorials/flink/tiktok/utils/Utils.java @@ -0,0 +1,35 @@ +package ai.hopsworks.tutorials.flink.tiktok.utils; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.flink.HopsworksConnection; +import com.logicalclocks.hsfs.metadata.HopsworksClient; +import com.logicalclocks.hsfs.metadata.HopsworksHttpClient; + +import java.io.IOException; +import java.util.Properties; + +public class Utils { + + public Properties getKafkaProperties() throws FeatureStoreException, IOException { + HopsworksConnection connection = HopsworksConnection.builder().build(); + HopsworksHttpClient client = HopsworksClient.getInstance().getHopsworksHttpClient(); + Properties properties = new Properties(); + properties.put("bootstrap.servers", "broker.kafka.service.consul:9091"); + properties.put("security.protocol", "SSL"); + properties.put("ssl.truststore.location", client.getTrustStorePath()); + properties.put("ssl.truststore.password", client.getCertKey()); + properties.put("ssl.keystore.location", 
client.getKeyStorePath());
+    properties.put("ssl.keystore.password", client.getCertKey());
+    properties.put("ssl.key.password", client.getCertKey());
+    properties.put("ssl.endpoint.identification.algorithm", "");
+    properties.put("enable.idempotence", false);
+    return properties;
+  }
+
+  public Properties getKafkaProperties(String topic) throws FeatureStoreException, IOException {
+    Properties properties = getKafkaProperties();
+    properties.put("topic", topic);
+    return properties;
+  }
+}
diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/1_a_user_fg_backfil.ipynb b/advanced_tutorials/tiktok_recsys/python/Jupyter/1_a_user_fg_backfil.ipynb
new file mode 100644
index 00000000..a803da3c
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/1_a_user_fg_backfil.ipynb
@@ -0,0 +1,198 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "74d2c263",
+   "metadata": {},
+   "source": [
+    "## 📝 Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d06a1e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import great_expectations as ge\n",
+    "from great_expectations.core import ExpectationSuite, ExpectationConfiguration"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5fb0d84c",
+   "metadata": {},
+   "source": [
+    "## 👥 Fetch Users Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fce94dc8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_users_df = pd.read_parquet('https://repo.hops.works/dev/davit/tiktok_recsys/users.parquet')\n",
+    "data_users_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "022e16a5",
+   "metadata": {},
+   "source": [
+    "## 👮🏻‍♂️ Great Expectations "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c1b6548a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a Great Expectations DataFrame from the pandas DataFrame\n",
+    "ge_users_df = ge.from_pandas(data_users_df)\n",
+    "\n",
+    "# Initialize the expectation suite\n",
+    "expectation_suite_users = ge_users_df.get_expectation_suite()\n",
+    "expectation_suite_users.expectation_suite_name = \"user_data_suite\"\n",
+    "\n",
+    "# Expectation: Age should be between 12 and 100\n",
+    "expectation_suite_users.add_expectation(\n",
+    "    ExpectationConfiguration(\n",
+    "        expectation_type=\"expect_column_values_to_be_between\",\n",
+    "        kwargs={\"column\": \"age\", \"min_value\": 12, \"max_value\": 100}\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "# Expectations: Columns should not have null values\n",
+    "for column in ge_users_df.columns:\n",
+    "    expectation_suite_users.add_expectation(\n",
+    "        ExpectationConfiguration(\n",
+    "            expectation_type=\"expect_column_values_to_not_be_null\",\n",
+    "            kwargs={\"column\": column}\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "# Expectation: Gender should only contain specific values\n",
+    "expectation_suite_users.add_expectation(\n",
+    "    ExpectationConfiguration(\n",
+    "        expectation_type=\"expect_column_distinct_values_to_be_in_set\",\n",
+    "        kwargs={\"column\": \"gender\", \"value_set\": [\"Male\", \"Female\", \"Other\"]}\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "06368b8b",
+   "metadata": {},
+   "source": [
+    "## 🔮 Connect to Hopsworks Feature Store "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "602d3d75",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import hopsworks\n",
+    "\n",
+    "project = hopsworks.login()\n",
+    "\n",
+    "fs = project.get_feature_store()"
+   ]
+  },
+  {
"cell_type": "markdown", + "id": "5a19a172", + "metadata": {}, + "source": [ + "## 🪄 Feature Group Creation \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee7dee55", + "metadata": {}, + "outputs": [], + "source": [ + "users_fg = fs.get_or_create_feature_group(\n", + " name=\"users\",\n", + " version=1,\n", + " description=\"Users data.\",\n", + " primary_key=[\"user_id\"],\n", + " partition_key=[\"registration_month\"],\n", + " event_time=\"registration_date\",\n", + " online_enabled=True,\n", + " expectation_suite=expectation_suite_users,\n", + " statistics_config = {\n", + " \"enabled\": True,\n", + " \"histograms\": True,\n", + " \"correlations\": True,\n", + " } \n", + ")\n", + "\n", + "users_fg.insert(data_users_df)\n", + "print('Done ✅')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4548f55d-8941-4655-992c-7b672c0942b2", + "metadata": {}, + "outputs": [], + "source": [ + "feature_descriptions = [\n", + " {\"name\": \"user_id\", \"description\": \"Unique identifier for each user.\"},\n", + " {\"name\": \"gender\", \"description\": \"Gender of the user.\"},\n", + " {\"name\": \"age\", \"description\": \"Age of the user.\"},\n", + " {\"name\": \"country\", \"description\": \"Country of Residence of the user.\"},\n", + " {\"name\": \"registration_date\", \"description\": \"Date of registration.\"},\n", + " {\"name\": \"registration_month\", \"description\": \"Month of registration derived from registration_date.\"},\n", + "]\n", + "\n", + "for desc in feature_descriptions: \n", + " users_fg.update_feature_description(desc[\"name\"], desc[\"description\"])" + ] + }, + { + "cell_type": "markdown", + "id": "6bcc04ea", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/1_b_video_fg_backfil.ipynb b/advanced_tutorials/tiktok_recsys/python/Jupyter/1_b_video_fg_backfil.ipynb new file mode 100644 index 00000000..4cb01ad3 --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/1_b_video_fg_backfil.ipynb @@ -0,0 +1,189 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "74d2c263", + "metadata": {}, + "source": [ + "## 📝 Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d06a1e5", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import great_expectations as ge\n", + "from great_expectations.core import ExpectationSuite, ExpectationConfiguration" + ] + }, + { + "cell_type": "markdown", + "id": "538080dd", + "metadata": {}, + "source": [ + "## 🎥 Fetch Content Data\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4e2188c", + "metadata": {}, + "outputs": [], + "source": [ + "data_video_df = pd.read_parquet('https://repo.hops.works/dev/davit/tiktok_recsys/videos.parquet')" + ] + }, + { + "cell_type": "markdown", + "id": "022e16a5", + "metadata": {}, + "source": [ + "## 👮🏻‍♂️ Great Expectations " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b5cc2d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Create 
a Great Expectations DataFrame from the pandas DataFrame\n",
+    "ge_video_df = ge.from_pandas(data_video_df)\n",
+    "\n",
+    "# Initialize the expectation suite\n",
+    "expectation_suite_videos = ge_video_df.get_expectation_suite()\n",
+    "expectation_suite_videos.expectation_suite_name = \"video_data_suite\"\n",
+    "\n",
+    "# Expectation: video_length should be non-negative\n",
+    "for column in [\"video_length\"]:\n",
+    "    expectation_suite_videos.add_expectation(\n",
+    "        ExpectationConfiguration(\n",
+    "            expectation_type=\"expect_column_values_to_be_between\",\n",
+    "            kwargs={\"column\": column, \"min_value\": 0, \"max_value\": None}\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "# Expectation: Valid date format for upload_date\n",
+    "expectation_suite_videos.add_expectation(\n",
+    "    ExpectationConfiguration(\n",
+    "        expectation_type=\"expect_column_values_to_be_dateutil_parseable\",\n",
+    "        kwargs={\"column\": \"upload_date\"}\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "06368b8b",
+   "metadata": {},
+   "source": [
+    "## 🔮 Connect to Hopsworks Feature Store "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "602d3d75",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import hopsworks\n",
+    "\n",
+    "project = hopsworks.login()\n",
+    "\n",
+    "fs = project.get_feature_store()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5a19a172",
+   "metadata": {},
+   "source": [
+    "## 🪄 Feature Group Creation \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1612d635",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "videos_fg = fs.get_or_create_feature_group(\n",
+    "    name=\"videos\",\n",
+    "    version=1,\n",
+    "    description=\"Videos data.\",\n",
+    "    primary_key=[\"video_id\"],\n",
+    "    partition_key=[\"upload_month\"],\n",
+    "    online_enabled=True,\n",
+    "    event_time=\"upload_date\",\n",
+    "    expectation_suite=expectation_suite_videos,\n",
+    "    statistics_config={\n",
+    "        \"enabled\": True,\n",
+    "        \"histograms\": True,\n",
+    "        \"correlations\": True,\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "videos_fg.insert(data_video_df)\n",
+    "print('Done ✅')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "85b5390b-02a3-4a18-b08f-d6d910115464",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "feature_descriptions = [\n",
+    "    {\"name\": \"video_id\", \"description\": \"Identifier for the video.\"},\n",
+    "    {\"name\": \"category_id\", \"description\": \"Id of the video category.\"},\n",
+    "    {\"name\": \"category\", \"description\": \"Name of the video category.\"},\n",
+    "    {\"name\": \"video_length\", \"description\": \"Video length in seconds.\"},\n",
+    "    {\"name\": \"upload_date\", \"description\": \"Date of upload for the video.\"},\n",
+    "    {\"name\": \"upload_month\", \"description\": \"Month of upload for the video, derived from upload_date.\"},\n",
+    "]\n",
+    "\n",
+    "for desc in feature_descriptions:\n",
+    "    videos_fg.update_feature_description(desc[\"name\"], desc[\"description\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6bcc04ea",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
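Note: both backfill notebooks above attach their expectation suite at feature-group creation time, so Hopsworks validates every subsequent insert against it. For a quick local sanity check before inserting, the suite can also be run directly against the DataFrame with the legacy Great Expectations API used here (a sketch reusing `ge_video_df` and `expectation_suite_videos` from the notebook above; the exact call may vary across Great Expectations versions):

    # Validate locally before calling videos_fg.insert(...)
    validation_result = ge_video_df.validate(expectation_suite=expectation_suite_videos)
    print(validation_result.success)  # True only if every expectation passed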
diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/1_c_interactions_fg_backfil.ipynb b/advanced_tutorials/tiktok_recsys/python/Jupyter/1_c_interactions_fg_backfil.ipynb
new file mode 100644
index 00000000..7c180dcb
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/1_c_interactions_fg_backfil.ipynb
@@ -0,0 +1,161 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "74d2c263",
+   "metadata": {},
+   "source": [
+    "## 📝 Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d06a1e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from datetime import datetime, timezone"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a2b34c80",
+   "metadata": {},
+   "source": [
+    "## 🔗 Fetch historical interactions dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "69b8bd5e-2ede-40f0-af3d-2829c0e46790",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fetch historical interactions dataset and backfill interactions feature group\n",
+    "data_interactions_df = pd.read_parquet('https://repo.hops.works/dev/davit/tiktok_recsys/interactions.parquet')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "06368b8b",
+   "metadata": {},
+   "source": [
+    "## 🔮 Connect to Hopsworks Feature Store "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "602d3d75",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import hopsworks\n",
+    "\n",
+    "project = hopsworks.login()\n",
+    "\n",
+    "fs = project.get_feature_store()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5a19a172",
+   "metadata": {},
+   "source": [
+    "## 🪄 Feature Group Creation "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "239ed3c2-a9f1-4cef-a36f-fe48daa8ddd1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "interactions_fg = fs.get_or_create_feature_group(\n",
+    "    name=\"interactions\",\n",
+    "    version=1,\n",
+    "    description=\"Interactions data.\",\n",
+    "    primary_key=[\"interaction_id\", \"user_id\", \"video_id\"],\n",
+    "    partition_key=[\"interaction_month\"],\n",
+    "    online_enabled=True,\n",
+    "    event_time=\"interaction_date\",\n",
+    "    statistics_config={\n",
+    "        \"enabled\": True,\n",
+    "        \"histograms\": True,\n",
+    "        \"correlations\": True,\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "interactions_fg.insert(data_interactions_df)\n",
+    "print('Done ✅')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "198209c2-9c8c-45d7-b568-dc44174ad684",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "feature_descriptions = [\n",
+    "    {\"name\": \"id\", \"description\": \"Unique id for the interaction.\"},\n",
+    "    {\"name\": \"user_id\", \"description\": \"Unique identifier for each user.\"},\n",
+    "    {\"name\": \"video_id\", \"description\": \"Identifier for the video.\"},\n",
+    "    {\"name\": \"category_id\", \"description\": \"Id of the video category.\"},\n",
+    "    {\"name\": \"interaction_type\", \"description\": \"Type of interaction.\"},\n",
+    "    {\"name\": \"watch_time\", \"description\": \"Time in seconds the user watched the video.\"},\n",
+    "    {\"name\": \"interaction_date\", \"description\": \"Date of interaction.\"},\n",
+    "    {\"name\": \"interaction_month\", \"description\": \"Month of interaction, derived from interaction_date.\"}\n",
+    "]\n",
+    "\n",
+    "for desc in feature_descriptions:\n",
+    "    interactions_fg.update_feature_description(desc[\"name\"], desc[\"description\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fefa5a02-aad7-4649-b71e-104b2ea53b4d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "interactions_fg.materialization_job.schedule(cron_expression=\"0 */15 * ? * *\",\n",
+    "                                             start_time=datetime.now(tz=timezone.utc))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6bcc04ea",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/1_d_video_window_agg_feature_group.py b/advanced_tutorials/tiktok_recsys/python/Jupyter/1_d_video_window_agg_feature_group.py
new file mode 100644
index 00000000..5a03202a
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/1_d_video_window_agg_feature_group.py
@@ -0,0 +1,62 @@
+import hopsworks
+
+from hsfs.feature import Feature
+from datetime import datetime, timezone
+
+project = hopsworks.login()
+fs = project.get_feature_store()
+
+# Schema of the per-video 1h window aggregations produced by the streaming job.
+features = [
+    Feature(name="video_id", type="bigint"),
+    Feature(name="category_id", type="bigint"),
+
+    Feature(name="like_count", type="bigint"),
+    Feature(name="dislike_count", type="bigint"),
+    Feature(name="view_count", type="bigint"),
+    Feature(name="comment_count", type="bigint"),
+    Feature(name="share_count", type="bigint"),
+    Feature(name="skip_count", type="bigint"),
+    Feature(name="total_watch_time", type="bigint"),
+
+    Feature(name="interaction_month", type="string"),
+    Feature(name="window_end_time", type="timestamp"),
+]
+
+video_window_agg_1h_fg = fs.create_feature_group(
+    "video_window_agg_1h",
+    version=1,
+    primary_key=["video_id"],
+    partition_key=["interaction_month"],
+    event_time="window_end_time",
+    online_enabled=True,
+    stream=True,
+    statistics_config={
+        "enabled": True,
+        "histograms": True,
+        "correlations": True,
+    }
+)
+
+video_window_agg_1h_fg.save(features)
+
+video_window_agg_1h_fg.materialization_job.schedule(cron_expression="0 */15 * ? * *",
+                                                    start_time=datetime.now(tz=timezone.utc))
+
+feature_descriptions = [
+    {"name": "video_id", "description": "Identifier for the video."},
+    {"name": "category_id", "description": "Id of the video category."},
+    {"name": "window_end_time", "description": "End of the specified time window where interactions were aggregated."},
+    {"name": "interaction_month",
+     "description": "Month of the end of the specified time window where interactions were aggregated. Derived from window_end_time."},
+    {"name": "like_count", "description": "Number of likes the video got over a specified time window."},
+    {"name": "dislike_count", "description": "Number of dislikes the video got over a specified time window."},
+    {"name": "view_count", "description": "Number of views the video got over a specified time window."},
+    {"name": "comment_count", "description": "Number of comments the video got over a specified time window."},
+    {"name": "share_count", "description": "Number of shares the video got over a specified time window."},
+    {"name": "skip_count", "description": "Number of times the video was skipped over a specified time window."},
+    {"name": "total_watch_time",
+     "description": "Total time in seconds the video was watched over a specified time window."},
+]
+
+for desc in feature_descriptions:
+    video_window_agg_1h_fg.update_feature_description(desc["name"], desc["description"])
diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/1_e_interactions_month_sincos_fg.ipynb b/advanced_tutorials/tiktok_recsys/python/Jupyter/1_e_interactions_month_sincos_fg.ipynb
new file mode 100644
index 00000000..7fa569bd
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/1_e_interactions_month_sincos_fg.ipynb
@@ -0,0 +1,168 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "74d2c263",
+   "metadata": {},
+   "source": [
+    "## 📝 Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d06a1e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from datetime import datetime, timezone\n",
+    "\n",
+    "from features.interactions import month_sine, month_cosine"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "06368b8b",
+   "metadata": {},
+   "source": [
+    "## 🔮 Connect to Hopsworks Feature Store "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "602d3d75",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import hopsworks\n",
+    "\n",
+    "project = hopsworks.login()\n",
+    "\n",
+    "fs = project.get_feature_store()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8840b016-2f1f-4db8-a831-b76ae42cf561",
+   "metadata": {},
+   "source": [
+    "## 🔗 Fetch interactions feature group "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5fae1dde-ebe8-44ef-839b-6fe691ed405b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "interactions_fg = fs.get_feature_group(\n",
+    "    name=\"interactions\",\n",
+    "    version=1)\n",
+    "\n",
+    "data_interactions_df = interactions_fg.read()\n",
+    "data_interactions_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "be778ba1-edd9-4d3e-912c-fbf6d30c2b84",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_interactions_df = data_interactions_df[[\"id\", \"interaction_date\", \"interaction_month\"]]\n",
+    "\n",
+    "# Calculate the sine and cosine components of the interaction month\n",
+    "data_interactions_df[\"month_sin\"] = data_interactions_df.interaction_date.map(lambda x: month_sine(x))\n",
+    "data_interactions_df[\"month_cos\"] = data_interactions_df.interaction_date.map(lambda x: month_cosine(x))\n",
+    "data_interactions_df"
+   ]
+  },
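+  {
+   "cell_type": "markdown",
+   "id": "month-sincos-sketch-md",
+   "metadata": {},
+   "source": [
+    "*Aside:* `month_sine` and `month_cosine` come from the tutorial's `features/interactions.py` module, which is not part of this diff. A minimal sketch of what such cyclical-encoding helpers typically look like, assuming a date-like input, is below; the actual implementations may differ:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "month-sincos-sketch-code",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import math\n",
+    "\n",
+    "def month_sine_sketch(ts) -> float:\n",
+    "    # Map months 1-12 onto the unit circle so December and January end up adjacent.\n",
+    "    month = pd.Timestamp(ts).month\n",
+    "    return math.sin(2 * math.pi * month / 12)\n",
+    "\n",
+    "def month_cosine_sketch(ts) -> float:\n",
+    "    month = pd.Timestamp(ts).month\n",
+    "    return math.cos(2 * math.pi * month / 12)"
+   ]
+  },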
name=\"interactions_month_sincos\",\n", + " version=1,\n", + " description=\"Ondeamand Features for Interactions data such month sine and cosine.\", \n", + " primary_key=[\"id\"],\n", + " partition_key = [\"interaction_month\"],\n", + " online_enabled=True,\n", + " event_time=\"interaction_date\",\n", + " parents=[interactions_fg],\n", + " statistics_config = {\n", + " \"enabled\": True,\n", + " \"histograms\": True,\n", + " \"correlations\": True,\n", + " } \n", + ")\n", + "\n", + "interactions_month_sincos_fg.insert(data_interactions_df)\n", + "print('Done ✅')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "198209c2-9c8c-45d7-b568-dc44174ad684", + "metadata": {}, + "outputs": [], + "source": [ + "feature_descriptions = [\n", + " {\"name\": \"id\", \"description\": \"Unique id for the interaction\"},\n", + " {\"name\": \"month_sin\", \"description\": \"Sine of the month of interaction date.\"},\n", + " {\"name\": \"month_cos\", \"description\": \"Cosine of the month of interaction date.\"}, \n", + " {\"name\": \"interaction_date\", \"description\": \"Date of inteaction.\"},\n", + " {\"name\": \"interaction_month\", \"description\": \"Month of interaction, derived from interaction_date.\"}\n", + "]\n", + "\n", + "for desc in feature_descriptions:\n", + " interactions_month_sincos_fg.update_feature_description(desc[\"name\"], desc[\"description\"])" + ] + }, + { + "cell_type": "markdown", + "id": "6bcc04ea", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/1_f_user_window_agg_feature_group.py b/advanced_tutorials/tiktok_recsys/python/Jupyter/1_f_user_window_agg_feature_group.py new file mode 100644 index 00000000..84aff12d --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/1_f_user_window_agg_feature_group.py @@ -0,0 +1,69 @@ +import hopsworks + +from hsfs.feature import Feature +from datetime import datetime, timedelta, timezone + +project = hopsworks.login() +fs = project.get_feature_store() + +features = [ + Feature(name="user_id", type="bigint"), + Feature(name="category_id", type="bigint"), + + Feature(name="like_count", type="bigint"), + Feature(name="dislike_count", type="bigint"), + Feature(name="view_count", type="bigint"), + Feature(name="comment_count", type="bigint"), + Feature(name="share_count", type="bigint"), + Feature(name="skip_count", type="bigint"), + Feature(name="total_watch_time", type="bigint"), + + Feature(name="interaction_month", type="string"), + Feature(name="window_end_time", type="timestamp"), +] + +user_window_agg_1h_fg = fs.create_feature_group( + "user_window_agg_1h", + version=1, + primary_key=["user_id"], + partition_key=["interaction_month"], + event_time="window_end_time", + online_enabled=True, + stream=True, + statistics_config = { + "enabled": True, + "histograms": True, + "correlations": True, + } +) + +user_window_agg_1h_fg.save(features) + +user_window_agg_1h_fg.materialization_job.schedule(cron_expression="0 */15 * ? 
* *", + start_time=datetime.now(tz=timezone.utc)) + +feature_descriptions = [ + {"name": "user_id", "description": "Unique identifier for each user."}, + {"name": "category_id", "description": "Id of the video category."}, + {"name": "window_end_time", "description": "End of the specified time window where interaction were aggregated."}, + {"name": "interaction_month", + "description": "Month of the end of the specified time window where interaction were aggregated. Derived from window_end_time"}, + {"name": "like_count", + "description": "Number of likes video category got from the user during a specified time window."}, + {"name": "dislike_count", + "description": "Number of dislikes video category got from the user during a specified time window."}, + {"name": "view_count", + "description": "Number of views over video category got from the user during a specified time window."}, + {"name": "comment_count", + "description": "Number of comments video category got from the user during a specified time window."}, + {"name": "share_count", + "description": "Number of likes over video category got from the user during a specified time window."}, + {"name": "skip_count", + "description": "Number of times video category was skiped by the user during a specified time window."}, + {"name": "total_watch_time", + "description": "Total time in seconds video category was watched by the user during a specified time window."}, +] + +for desc in feature_descriptions: + user_window_agg_1h_fg.update_feature_description(desc["name"], desc["description"]) + diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/1_g_ranking_fg.ipynb b/advanced_tutorials/tiktok_recsys/python/Jupyter/1_g_ranking_fg.ipynb new file mode 100644 index 00000000..767e6b2d --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/1_g_ranking_fg.ipynb @@ -0,0 +1,198 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "74d2c263", + "metadata": {}, + "source": [ + "## 📝 Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d06a1e5", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "06368b8b", + "metadata": {}, + "source": [ + "## 🔮 Connect to Hopsworks Feature Store " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "602d3d75", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store()" + ] + }, + { + "cell_type": "markdown", + "id": "5a19a172", + "metadata": {}, + "source": [ + "## 🪄 Fetch Feature Groups " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee7dee55", + "metadata": {}, + "outputs": [], + "source": [ + "users_fg = fs.get_feature_group(\n", + " name=\"users\",\n", + " version=1,\n", + ")\n", + "\n", + "videos_fg = fs.get_feature_group(\n", + " name=\"videos\",\n", + " version=1,\n", + ")\n", + "\n", + "interactions_fg = fs.get_feature_group(\n", + " name=\"interactions\",\n", + " version=1,\n", + ")\n", + "\n", + "interactions_month_sincos_fg = fs.get_feature_group(\n", + " name=\"interactions_month_sincos\",\n", + " version=1,\n", + ")\n", + "\n", + "query = interactions_fg.select_except(['interaction_id', 'watch_time', 'interaction_date', 'category_id'])\\\n", + " .join(interactions_month_sincos_fg.select(['month_cos', 'month_sin']))\\\n", + " .join(users_fg.select_except(['upload_date', 'upload_month'])) \\\n", + " 
.join(videos_fg.select_except(['registration_date', 'registration_month']))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2463f23e-3640-4718-90de-0a9a394f821b", + "metadata": {}, + "outputs": [], + "source": [ + "ranking_df = query.read()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ba00730-3bfc-4aff-9473-8e9b50427326", + "metadata": {}, + "outputs": [], + "source": [ + "ranking_df['label'] = np.where(\n", + " ranking_df.interaction_type.isin(['view', 'like', 'share', 'comment']), \n", + " 1, \n", + " 0,\n", + ")\n", + "\n", + "ranking_df = ranking_df[[\"user_id\", \"video_id\", \"category_id\", \"interaction_month\", \"video_length\", \"gender\", \"age\", \"country\", \"label\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "20a31c2b", + "metadata": {}, + "source": [ + "## 🪄 Ranking Feature Group " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08a30c03", + "metadata": {}, + "outputs": [], + "source": [ + "ranking_fg = fs.get_or_create_feature_group(\n", + " name=\"ranking\",\n", + " version=1,\n", + " description=\"Ranking Data.\",\n", + " primary_key=[\"user_id\", \"video_id\"],\n", + " partition_key = [\"interaction_month\"], \n", + " online_enabled=False, \n", + " parents=[users_fg, videos_fg, interactions_fg, interactions_month_sincos_fg],\n", + " statistics_config = {\n", + " \"enabled\": True,\n", + " \"histograms\": True,\n", + " \"correlations\": True,\n", + " }\n", + ")\n", + "\n", + "ranking_fg.insert(ranking_df)\n", + "print('Done ✅')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "745641e4-75e8-43cb-b7f8-41a1c994c673", + "metadata": {}, + "outputs": [], + "source": [ + "feature_descriptions = [\n", + " {\"name\": \"user_id\", \"description\": \"Unique identifier for each user.\"},\n", + " {\"name\": \"video_id\", \"description\": \"Identifier for the video.\"},\n", + " {\"name\": \"category_id\", \"description\": \"Id of the video category.\"}, \n", + " {\"name\": \"interaction_month\", \"description\": \"Month of interaction, derived from interaction_date.\"}, \n", + " {\"name\": \"video_length\", \"description\": \"Video length in sconds.\"},\n", + " {\"name\": \"gender\", \"description\": \"Gender of the user.\"},\n", + " {\"name\": \"age\", \"description\": \"Age of the user.\"},\n", + " {\"name\": \"country\", \"description\": \"Country of Residence of the user.\"},\n", + " {\"name\": \"label\", \"description\": \"Label indicating whether the article was purchased (1) or not (0).\"},\n", + "]\n", + "\n", + "for desc in feature_descriptions: \n", + " ranking_fg.update_feature_description(desc[\"name\"], desc[\"description\"])" + ] + }, + { + "cell_type": "markdown", + "id": "6bcc04ea", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/2_retrieval_model_training.ipynb b/advanced_tutorials/tiktok_recsys/python/Jupyter/2_retrieval_model_training.ipynb new file mode 100644 index 00000000..05b18bd9 --- /dev/null +++ 
diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/2_retrieval_model_training.ipynb b/advanced_tutorials/tiktok_recsys/python/Jupyter/2_retrieval_model_training.ipynb
new file mode 100644
index 00000000..05b18bd9
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/2_retrieval_model_training.ipynb
@@ -0,0 +1,772 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a802df49",
+   "metadata": {},
+   "source": [
+    "## 🧬 Train Retrieval Model \n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ad717bc4",
+   "metadata": {},
+   "source": [
+    "## 📝 Imports "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bfa1ec75",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.keras.layers import StringLookup, Normalization\n",
+    "\n",
+    "import tensorflow_recommenders as tfrs\n",
+    "\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "27878d7f",
+   "metadata": {},
+   "source": [
+    "## 🔮 Connect to Hopsworks Feature Store "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cde7ee5a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import hopsworks\n",
+    "\n",
+    "project = hopsworks.login()\n",
+    "\n",
+    "fs = project.get_feature_store()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4a0aa637",
+   "metadata": {},
+   "source": [
+    "## 🔪 Feature Selection \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9563ca98",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "users_fg = fs.get_feature_group(\n",
+    "    name=\"users\",\n",
+    "    version=1,\n",
+    ")\n",
+    "\n",
+    "videos_fg = fs.get_feature_group(\n",
+    "    name=\"videos\",\n",
+    "    version=1,\n",
+    ")\n",
+    "\n",
+    "interactions_fg = fs.get_feature_group(\n",
+    "    name=\"interactions\",\n",
+    "    version=1,\n",
+    ")\n",
+    "\n",
+    "interactions_month_sincos_fg = fs.get_feature_group(\n",
+    "    name=\"interactions_month_sincos\",\n",
+    "    version=1,\n",
+    ")\n",
+    "\n",
+    "user_window_agg_1h_fg = fs.get_feature_group(\n",
+    "    name=\"user_window_agg_1h\",\n",
+    "    version=1,\n",
+    ")\n",
+    "\n",
+    "video_window_agg_1h_fg = fs.get_feature_group(\n",
+    "    name=\"video_window_agg_1h\",\n",
+    "    version=1,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7472aa57",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "QUERY_FEATURES = [\"user_id\", \"gender\", \"age\", \"country\"]\n",
+    "QUERY_AGG_FEATURES = [\"like_count\", \"dislike_count\", \"view_count\", \"total_watch_time\"]\n",
+    "\n",
+    "CANDIDATE_FEATURES = [\"video_id\", \"category\", \"video_length\"]\n",
+    "CANDIDATE_AGG_FEATURES = [\"like_count\", \"dislike_count\", \"view_count\", \"total_watch_time\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "05a7017a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Select features for training data\n",
+    "selected_features = interactions_fg.select([\"id\"])\\\n",
+    "    .join(users_fg.select(QUERY_FEATURES), on=\"user_id\")\\\n",
+    "    .join(videos_fg.select(CANDIDATE_FEATURES), on=\"video_id\")\\\n",
+    "    .join(video_window_agg_1h_fg.select(CANDIDATE_AGG_FEATURES), on=\"video_id\", prefix=\"vid_\")\\\n",
+    "    .join(user_window_agg_1h_fg.select(QUERY_AGG_FEATURES), on=[\"user_id\", \"category_id\"], prefix=\"user_\")\\\n",
+    "    .join(interactions_month_sincos_fg.select([\"month_sin\", \"month_cos\"]), on=\"id\")\n",
+    "\n",
+    "# Uncomment this if you would like to view your selected features\n",
+    "#selected_features.show(5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "041203aa",
+   "metadata": {},
+   "source": [
+    "## ⚙️ Feature View Creation \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "464a7b55",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "feature_view = fs.get_or_create_feature_view(\n",
+    "    name='retrieval',\n",
+    "    version=1,\n",
+    "    query=selected_features,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ca2ba800",
+   "metadata": {},
+   "source": [
+    "## 🏋️ Training Dataset \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7461e500",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_df, val_df, test_df, _, _, _ = feature_view.train_validation_test_split(\n",
+    "    validation_size=0.1,\n",
+    "    test_size=0.1,\n",
+    "    description='Retrieval dataset splits',\n",
+    ")\n",
+    "train_df.head(3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "633b4c65",
+   "metadata": {},
+   "source": [
+    "You will train your retrieval model with a subset of features.\n",
+    "\n",
+    "For the query embedding you will use:\n",
+    "- `user_id`: ID of a user.\n",
+    "- `gender`: Gender of a user.\n",
+    "- `age`: Age of a user.\n",
+    "- `country`: Country of a user.\n",
+    "- `month_sin`: Sine of the month of interaction date.\n",
+    "- `month_cos`: Cosine of the month of interaction date.\n",
+    "- `user_like_count`: Number of times the user liked the video category.\n",
+    "- `user_dislike_count`: Number of times the user disliked the video category.\n",
+    "- `user_view_count`: Number of times the user viewed the video category.\n",
+    "- `user_total_watch_time`: Total time in seconds the user watched the video category.\n",
+    "\n",
+    "For the candidate embedding you will use:\n",
+    "- `video_id`: ID of a video.\n",
+    "- `category`: Video category.\n",
+    "- `vid_like_count`: Number of times video was liked by users.\n",
+    "- `vid_dislike_count`: Number of times video was disliked by users.\n",
+    "- `vid_view_count`: Number of times video was viewed by users.\n",
+    "- `vid_total_watch_time`: Total time in seconds video was watched. 
\n", + "- `video_length`: Length of video.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4588de29", + "metadata": {}, + "outputs": [], + "source": [ + "def df_to_ds(df):\n", + " return tf.data.Dataset.from_tensor_slices({col: df[col] for col in df})\n", + "\n", + "BATCH_SIZE = 2048\n", + "train_ds = df_to_ds(train_df).batch(BATCH_SIZE).cache().shuffle(BATCH_SIZE*10)\n", + "val_ds = df_to_ds(val_df).batch(BATCH_SIZE).cache()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39e001ab", + "metadata": {}, + "outputs": [], + "source": [ + "train_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "414f2cf6", + "metadata": {}, + "outputs": [], + "source": [ + "# Query Features \n", + "user_id_list = train_df[\"user_id\"].unique().tolist()\n", + "countries_list = train_df[\"country\"].unique().tolist()\n", + "gender_list = train_df[\"gender\"].unique().tolist()\n", + "\n", + "# Item Features\n", + "video_id_list = train_df[\"video_id\"].unique().tolist()\n", + "category_list = train_df[\"category\"].unique().tolist()\n", + "\n", + "print(f\"⛳️ Number of users: {len(user_id_list)}\")\n", + "print(f\"⛳️ Number of items: {len(video_id_list)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "56e9f3ce", + "metadata": {}, + "source": [ + "## 🏰 Two Tower Model \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91cd9402", + "metadata": {}, + "outputs": [], + "source": [ + "EMB_DIM = 16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5214794e", + "metadata": {}, + "outputs": [], + "source": [ + "class QueryTower(tf.keras.Model):\n", + "\n", + " def __init__(self):\n", + " super().__init__()\n", + "\n", + " self.emb_dim = EMB_DIM\n", + " self.user_embedding = tf.keras.Sequential([\n", + " StringLookup(\n", + " vocabulary=user_id_list,\n", + " mask_token=None\n", + " ),\n", + " tf.keras.layers.Embedding(\n", + " # You add an additional embedding to account for unknown tokens.\n", + " len(user_id_list) + 1,\n", + " self.emb_dim\n", + " )\n", + " ])\n", + "\n", + " self.normalized_age = Normalization(axis=None)\n", + " self.normalized_sin = Normalization(axis=None)\n", + " self.normalized_cos = Normalization(axis=None)\n", + " \n", + " # Converts strings into integer indices (scikit-learn LabelEncoder analog)\n", + " self.gender_tokenizer = StringLookup(\n", + " vocabulary=gender_list,\n", + " mask_token=None,\n", + " )\n", + " \n", + " self.country_tokenizer = StringLookup(\n", + " vocabulary=countries_list, \n", + " mask_token=None,\n", + " )\n", + "\n", + " self.normalized_views = Normalization(axis=None)\n", + " self.normalized_dislikes = Normalization(axis=None)\n", + " self.normalized_likes = Normalization(axis=None)\n", + " self.normalized_video_watch_time = Normalization(axis=None)\n", + " \n", + " self.fnn = tf.keras.Sequential([\n", + " tf.keras.layers.Dense(self.emb_dim, activation=\"relu\"),\n", + " tf.keras.layers.Dense(self.emb_dim)\n", + " ])\n", + "\n", + " def call(self, inputs):\n", + " gender_embedding = tf.one_hot(\n", + " self.gender_tokenizer(inputs[\"gender\"]),\n", + " len(gender_list),\n", + " )\n", + " \n", + " country_embedding = tf.one_hot(\n", + " self.country_tokenizer(inputs[\"country\"]),\n", + " len(countries_list),\n", + " )\n", + " \n", + " concatenated_inputs = tf.concat([\n", + " self.user_embedding(inputs[\"user_id\"]),\n", + " tf.reshape(self.normalized_age(inputs[\"age\"]), (-1,1)),\n", + " tf.reshape(self.normalized_sin(inputs[\"month_sin\"]), 
(-1,1)),\n", + " tf.reshape(self.normalized_cos(inputs[\"month_cos\"]), (-1,1)),\n", + "\n", + " tf.reshape(self.normalized_views(inputs[\"user_view_count\"]), (-1,1)),\n", + " tf.reshape(self.normalized_dislikes(inputs[\"user_dislike_count\"]), (-1,1)),\n", + " tf.reshape(self.normalized_likes(inputs[\"user_like_count\"]), (-1,1)),\n", + " tf.reshape(self.normalized_video_watch_time(inputs[\"user_total_watch_time\"]), (-1,1)),\n", + " \n", + " gender_embedding,\n", + " country_embedding,\n", + " ], axis=1)\n", + "\n", + " outputs = self.fnn(concatenated_inputs)\n", + "\n", + " return outputs\n", + "\n", + "\n", + "query_model = QueryTower()\n", + "\n", + "query_model.normalized_age.adapt(train_ds.map(lambda x : x[\"age\"]))\n", + "\n", + "# Initialize model with inputs.\n", + "query_df = train_df[QUERY_FEATURES + [\"user_\" + i for i in QUERY_AGG_FEATURES] + [\"month_sin\", \"month_cos\"]] \n", + "query_ds = df_to_ds(query_df).batch(1)\n", + "query_model(next(iter(query_ds)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b8381c8", + "metadata": {}, + "outputs": [], + "source": [ + "class ItemTower(tf.keras.Model):\n", + "\n", + " def __init__(self):\n", + " super().__init__()\n", + "\n", + " self.emb_dim = EMB_DIM\n", + " self.video_embedding = tf.keras.Sequential([\n", + " StringLookup(\n", + " vocabulary=video_id_list,\n", + " mask_token=None\n", + " ),\n", + " tf.keras.layers.Embedding(\n", + " # You add an additional embedding to account for unknown tokens.\n", + " len(video_id_list) + 1,\n", + " self.emb_dim,\n", + " )\n", + " ])\n", + " \n", + " # Converts strings into integer indices (scikit-learn LabelEncoder analog)\n", + " self.category_tokenizer = StringLookup(\n", + " vocabulary=category_list, \n", + " mask_token=None,\n", + " )\n", + " \n", + " self.normalized_views = Normalization(axis=None)\n", + " self.normalized_dislikes = Normalization(axis=None)\n", + " self.normalized_likes = Normalization(axis=None)\n", + " self.normalized_video_length = Normalization(axis=None)\n", + " self.normalized_video_watch_time = Normalization(axis=None)\n", + "\n", + " self.fnn = tf.keras.Sequential([\n", + " tf.keras.layers.Dense(self.emb_dim, activation=\"relu\"),\n", + " tf.keras.layers.Dense(self.emb_dim)\n", + " ])\n", + "\n", + " def call(self, inputs):\n", + " category_embedding = tf.one_hot(\n", + " self.category_tokenizer(inputs[\"category\"]),\n", + " len(category_list),\n", + " )\n", + "\n", + " concatenated_inputs = tf.concat([\n", + " self.video_embedding(inputs[\"video_id\"]),\n", + " category_embedding,\n", + " tf.reshape(self.normalized_views(inputs[\"vid_view_count\"]), (-1,1)),\n", + " tf.reshape(self.normalized_dislikes(inputs[\"vid_dislike_count\"]), (-1,1)),\n", + " tf.reshape(self.normalized_likes(inputs[\"vid_like_count\"]), (-1,1)),\n", + " tf.reshape(self.normalized_video_length(inputs[\"video_length\"]), (-1,1)),\n", + " tf.reshape(self.normalized_video_watch_time(inputs[\"vid_total_watch_time\"]), (-1,1)),\n", + " ], axis=1)\n", + "\n", + " outputs = self.fnn(concatenated_inputs)\n", + "\n", + " return outputs\n", + "\n", + " \n", + "item_model = ItemTower()\n", + "\n", + "item_df = train_df[CANDIDATE_FEATURES+ [\"vid_\" + i for i in CANDIDATE_AGG_FEATURES]]\n", + "item_df.drop_duplicates(subset=\"video_id\", inplace=True)\n", + "item_ds = df_to_ds(item_df)\n", + "\n", + "item_model(next(iter(item_ds.batch(1))))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "319705d8", + "metadata": {}, + "outputs": [], + 
"source": [ + "class TwoTowerModel(tf.keras.Model):\n", + " def __init__(self, query_model, item_model):\n", + " super().__init__()\n", + " self.query_model = query_model\n", + " self.item_model = item_model\n", + " self.task = tfrs.tasks.Retrieval(\n", + " metrics=tfrs.metrics.FactorizedTopK(\n", + " candidates=item_ds.batch(BATCH_SIZE).map(self.item_model)\n", + " )\n", + " )\n", + "\n", + " def train_step(self, batch) -> tf.Tensor:\n", + " # Set up a gradient tape to record gradients.\n", + " with tf.GradientTape() as tape:\n", + "\n", + " # Loss computation.\n", + " user_embeddings = self.query_model(batch)\n", + " item_embeddings = self.item_model(batch)\n", + " loss = self.task(\n", + " user_embeddings, \n", + " item_embeddings,\n", + " compute_metrics=False,\n", + " )\n", + "\n", + " # Handle regularization losses as well.\n", + " regularization_loss = sum(self.losses)\n", + "\n", + " total_loss = loss + regularization_loss\n", + "\n", + " gradients = tape.gradient(total_loss, self.trainable_variables)\n", + " self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))\n", + "\n", + " metrics = {\n", + " \"loss\": loss,\n", + " \"regularization_loss\": regularization_loss,\n", + " \"total_loss\": total_loss\n", + " }\n", + "\n", + " return metrics\n", + "\n", + " def test_step(self, batch) -> tf.Tensor:\n", + " # Loss computation.\n", + " user_embeddings = self.query_model(batch)\n", + " item_embeddings = self.item_model(batch)\n", + "\n", + " loss = self.task(\n", + " user_embeddings, \n", + " item_embeddings,\n", + " compute_metrics=False,\n", + " )\n", + "\n", + " # Handle regularization losses as well.\n", + " regularization_loss = sum(self.losses)\n", + "\n", + " total_loss = loss + regularization_loss\n", + "\n", + " metrics = {metric.name: metric.result() for metric in self.metrics}\n", + " metrics[\"loss\"] = loss\n", + " metrics[\"regularization_loss\"] = regularization_loss\n", + " metrics[\"total_loss\"] = total_loss\n", + "\n", + " return metrics" + ] + }, + { + "cell_type": "markdown", + "id": "e0aa3b5a", + "metadata": {}, + "source": [ + "### 🏃🏻‍♂️ Model Training \n", + "\n", + "You'll train our model using the AdamW optimizer, which applies weight regularization during training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a73e2a4", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a TwoTowerModel with the specified query_model and item_model\n", + "model = TwoTowerModel(query_model, item_model)\n", + "\n", + "# Define an optimizer using AdamW with a learning rate of 0.01\n", + "#optimizer = tf.keras.optimizers.AdamW(\n", + "optimizer = tf.keras.optimizers.Adam(\n", + " weight_decay=0.001, \n", + " learning_rate=0.01,\n", + ")\n", + "\n", + "# Compile the model using the specified optimizer\n", + "model.compile(optimizer=optimizer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ae7140f", + "metadata": {}, + "outputs": [], + "source": [ + "model.fit(\n", + " train_ds, \n", + " validation_data=val_ds, \n", + " epochs=5,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "75379fa6", + "metadata": {}, + "source": [ + "## 🗄️ Upload Model to Model Registry \n", + "\n", + "One of the features in Hopsworks is the model registry. This is where you can store different versions of models and compare their performance. 
+  {
+   "cell_type": "markdown",
+   "id": "75379fa6",
+   "metadata": {},
+   "source": [
+    "## 🗄️ Upload Model to Model Registry \n",
+    "\n",
+    "One of the features in Hopsworks is the model registry. This is where you can store different versions of models and compare their performance. Models from the registry can then be served as API endpoints.\n",
+    "\n",
+    "Let's connect to the model registry using the [HSML library](https://docs.hopsworks.ai/machine-learning-api/latest) from Hopsworks."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "37b0582a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mr = project.get_model_registry()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "04743dca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class QueryModelModule(tf.Module):\n",
+    "    def __init__(self, query_model):\n",
+    "        self.query_model = query_model\n",
+    "\n",
+    "    @tf.function()\n",
+    "    def compute_emb(self, instances):\n",
+    "        query_emb = self.query_model(instances)\n",
+    "        return {\n",
+    "            \"user_id\": instances[\"user_id\"],\n",
+    "            \"gender\": instances[\"gender\"],\n",
+    "            \"age\": instances[\"age\"],\n",
+    "            \"country\": instances[\"country\"],\n",
+    "            \"month_sin\": instances[\"month_sin\"],\n",
+    "            \"month_cos\": instances[\"month_cos\"],\n",
+    "            \"query_emb\": query_emb,\n",
+    "        }\n",
+    "\n",
+    "# wrap query_model: query_model -> query_model_module\n",
+    "query_model = QueryModelModule(model.query_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac391bfa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define the input specifications for the instances\n",
+    "instances_spec = {\n",
+    "    'user_id': tf.TensorSpec(shape=(None,), dtype=tf.string, name='user_id'),  # Specification for user IDs\n",
+    "    'gender': tf.TensorSpec(shape=(None,), dtype=tf.string, name='gender'),  # Specification for gender\n",
+    "    'country': tf.TensorSpec(shape=(None,), dtype=tf.string, name='country'),  # Specification for country\n",
+    "    'age': tf.TensorSpec(shape=(None,), dtype=tf.int64, name='age'),  # Specification for age\n",
+    "    'month_sin': tf.TensorSpec(shape=(None,), dtype=tf.float32, name='month_sin'),  # Specification for month_sin\n",
+    "    'month_cos': tf.TensorSpec(shape=(None,), dtype=tf.float32, name='month_cos'),  # Specification for month_cos\n",
+    "    'user_view_count': tf.TensorSpec(shape=(None,), dtype=tf.int64, name='user_view_count'),  # Specification for view_count\n",
+    "    'user_dislike_count': tf.TensorSpec(shape=(None,), dtype=tf.int64, name='user_dislike_count'),  # Specification for dislike_count\n",
+    "    'user_like_count': tf.TensorSpec(shape=(None,), dtype=tf.int64, name='user_like_count'),  # Specification for like_count\n",
+    "    'user_total_watch_time': tf.TensorSpec(shape=(None,), dtype=tf.int64, name='user_total_watch_time'),  # Specification for total_watch_time\n",
+    "}\n",
+    "\n",
+    "# Get the concrete function for the query_model's compute_emb function using the specified input signatures\n",
+    "signatures = query_model.compute_emb.get_concrete_function(instances_spec)\n",
+    "\n",
+    "# Save the query_model along with the concrete function signatures\n",
+    "tf.saved_model.save(\n",
+    "    query_model,  # The model to save\n",
+    "    \"query_model\",  # Path to save the model\n",
+    "    signatures=signatures,  # Concrete function signatures to include\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2e69f5ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf.saved_model.save(\n",
+    "    
model.item_model, # The model to save\n", + " \"candidate_model\", # Path to save the model\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdd40c96", + "metadata": {}, + "outputs": [], + "source": [ + "from hsml.schema import Schema\n", + "from hsml.model_schema import ModelSchema\n", + "\n", + "# Infer input schema from data.\n", + "query_model_input_schema = Schema(query_df)\n", + "\n", + "# Manually specify output schema.\n", + "query_model_output_schema = Schema([{\n", + " \"name\": \"query_embedding\",\n", + " \"type\": \"float32\",\n", + " \"shape\": [EMB_DIM],\n", + "}])\n", + "\n", + "query_model_schema = ModelSchema(\n", + " input_schema=query_model_input_schema,\n", + " output_schema=query_model_output_schema,\n", + ")\n", + "\n", + "query_model_schema.to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a482bc52", + "metadata": {}, + "outputs": [], + "source": [ + "# Sample a query example from the query DataFrame\n", + "query_example = query_df.sample().to_dict(\"records\")\n", + "\n", + "# Create a tensorflow model for the query_model in the Model Registry \n", + "mr_query_model = mr.tensorflow.create_model(\n", + " name=\"query_model\", # Name of the model\n", + " description=\"Model that generates query embeddings from user features\", # Description of the model\n", + " input_example=query_example, # Example input for the model\n", + " model_schema=query_model_schema, # Schema of the model\n", + ")\n", + "\n", + "# Save the query_model to the Model Registry\n", + "mr_query_model.save(\"query_model\") # Path to save the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4db480d", + "metadata": {}, + "outputs": [], + "source": [ + "# Define the input schema for the candidate_model based on item_df\n", + "candidate_model_input_schema = Schema(item_df)\n", + "\n", + "# Define the output schema for the candidate_model, specifying the shape and type of the output\n", + "candidate_model_output_schema = Schema([{\n", + " \"name\": \"candidate_embedding\", # Name of the output feature\n", + " \"type\": \"float32\", # Data type of the output feature\n", + " \"shape\": [EMB_DIM], # Shape of the output feature\n", + "}])\n", + "\n", + "# Combine the input and output schemas to create the overall model schema for the candidate_model\n", + "candidate_model_schema = ModelSchema(\n", + " input_schema=candidate_model_input_schema, # Input schema for the model\n", + " output_schema=candidate_model_output_schema, # Output schema for the model\n", + ")\n", + "\n", + "# Sample a candidate example from the item DataFrame\n", + "candidate_example = item_df.sample().to_dict(\"records\")\n", + "\n", + "# Create a tensorflow model for the candidate_model in the Model Registry\n", + "mr_candidate_model = mr.tensorflow.create_model(\n", + " name=\"candidate_model\", # Name of the model\n", + " description=\"Model that generates candidate embeddings from video features\", # Description of the model\n", + " input_example=candidate_example, # Example input for the model\n", + " model_schema=candidate_model_schema, # Schema of the model\n", + ")\n", + "\n", + "# Save the candidate_model to the Model Registry\n", + "mr_candidate_model.save(\"candidate_model\") # Path to save the model" + ] + }, + { + "cell_type": "markdown", + "id": "4af8f206", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" 
+ }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/3_embeddings_creation.ipynb b/advanced_tutorials/tiktok_recsys/python/Jupyter/3_embeddings_creation.ipynb new file mode 100644 index 00000000..794478af --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/3_embeddings_creation.ipynb @@ -0,0 +1,318 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e9f877c8", + "metadata": {}, + "source": [ + "## 👨🏻‍🏫 Build Index \n", + "\n", + "In this notebook you will create a feature group for your candidate embeddings." + ] + }, + { + "cell_type": "markdown", + "id": "9f0949e2", + "metadata": {}, + "source": [ + "## 📝 Imports " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4958aa5b", + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import pprint\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "markdown", + "id": "8e8fc8ff", + "metadata": {}, + "source": [ + "## 🔮 Connect to Hopsworks Feature Store " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "080e6b00", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store()\n", + "mr = project.get_model_registry()" + ] + }, + { + "cell_type": "markdown", + "id": "8febd6fa", + "metadata": {}, + "source": [ + "## 🎯 Compute Candidate Embeddings \n", + "\n", + "You start by computing candidate embeddings for all items in the training data.\n", + "\n", + "First, you load your candidate model. Recall that you uploaded it to the Hopsworks Model Registry in the previous notebook. If you don't have the model locally you can download it from the Model Registry using the following code:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7bae7bd", + "metadata": {}, + "outputs": [], + "source": [ + "model = mr.get_model(\n", + " name=\"candidate_model\",\n", + " version=1,\n", + ")\n", + "model_path = model.download()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8c3373b", + "metadata": {}, + "outputs": [], + "source": [ + "candidate_model = tf.saved_model.load(model_path)" + ] + }, + { + "cell_type": "markdown", + "id": "6ba8c137", + "metadata": {}, + "source": [ + "Next you compute the embeddings of all candidate videos that were used to train the retrieval model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d01209f", + "metadata": {}, + "outputs": [], + "source": [ + "feature_view = fs.get_feature_view(\n", + " name=\"retrieval\",\n", + " version=1,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53786dd5", + "metadata": {}, + "outputs": [], + "source": [ + "train_df, val_df, test_df, _, _, _ = feature_view.get_train_validation_test_split(1)\n", + "train_df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d7816e7", + "metadata": {}, + "outputs": [], + "source": [ + "# Get the list of input features for the candidate model from the model schema\n", + "model_schema = model.model_schema['input_schema']['columnar_schema']\n", + "candidate_features = [feat['name'] for feat in model_schema]\n", + "\n", + "# Select the candidate features from the training DataFrame\n", + "item_df = train_df[candidate_features]\n", + "\n", + "# Drop duplicate rows based on the 'article_id' column to get unique candidate items\n", + "item_df.drop_duplicates(subset=\"video_id\", inplace=True)\n", + "\n", + "item_df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "257abe7d", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a TensorFlow dataset from the item DataFrame\n", + "item_ds = tf.data.Dataset.from_tensor_slices(\n", + " {col: item_df[col] for col in item_df})\n", + "\n", + "# Compute embeddings for all candidate items using the candidate_model\n", + "candidate_embeddings = item_ds.batch(2048).map(\n", + " lambda x: (x[\"video_id\"], candidate_model(x))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2ad6ea91", + "metadata": {}, + "source": [ + "## ⚙️ Data Preparation \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f51cbbfc", + "metadata": {}, + "outputs": [], + "source": [ + "# Concatenate all article IDs and embeddings from the candidate_embeddings dataset\n", + "all_article_ids = tf.concat([batch[0] for batch in candidate_embeddings], axis=0)\n", + "all_embeddings = tf.concat([batch[1] for batch in candidate_embeddings], axis=0)\n", + "\n", + "# Convert tensors to numpy arrays\n", + "all_article_ids_np = all_article_ids.numpy()\n", + "all_embeddings_np = all_embeddings.numpy()\n", + "\n", + "# Convert numpy arrays to lists\n", + "items_ids_list = all_article_ids_np.tolist()\n", + "embeddings_list = all_embeddings_np.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd1b7d49", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a DataFrame\n", + "data_emb = pd.DataFrame({\n", + " 'video_id': items_ids_list, \n", + " 'embeddings': embeddings_list,\n", + "})\n", + "data_emb['video_id'] = data_emb['video_id'].str.decode('utf-8')\n", + "\n", + "data_emb.head()" + ] + }, + { + "cell_type": "markdown", + "id": "3c131a8b", + "metadata": {}, + "source": [ + "## 🪄 Feature Group Creation \n", + "\n", + "Now you are ready to create a feature group for your candidate embeddings.\n", + "\n", + "To begin with, you need to create your Embedding Index where you will specify the name of the embeddings feature and the embeddings length.\n", + "Then you attach this index to the FG." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbe6db98", + "metadata": {}, + "outputs": [], + "source": [ + "from hsfs import embedding\n", + "\n", + "# Create the Embedding Index\n", + "emb = embedding.EmbeddingIndex()\n", + "\n", + "emb.add_embedding(\n", + " \"embeddings\", # Embeddings feature name\n", + " len(data_emb[\"embeddings\"].iloc[0]), # Embeddings length\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d94a8821", + "metadata": {}, + "outputs": [], + "source": [ + "# Get or create the 'candidate_embeddings_fg' feature group\n", + "candidate_embeddings_fg = fs.get_or_create_feature_group(\n", + " name=\"candidate_embeddings_fg\",\n", + " embedding_index=emb, # Specify the Embedding Index\n", + " primary_key=['video_id'],\n", + " version=1,\n", + " description='Embeddings for each video',\n", + " online_enabled=True,\n", + ")\n", + "\n", + "candidate_embeddings_fg.insert(data_emb)" + ] + }, + { + "cell_type": "markdown", + "id": "9b8b7d0e", + "metadata": {}, + "source": [ + "## 🪄 Feature View Creation \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f886cff3", + "metadata": {}, + "outputs": [], + "source": [ + "# Get or create the 'candidate_embeddings' feature view\n", + "feature_view = fs.get_or_create_feature_view(\n", + " name=\"candidate_embeddings\",\n", + " version=1,\n", + " description='Embeddings of each article',\n", + " query=candidate_embeddings_fg.select([\"video_id\"]),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a3dd3246", + "metadata": {}, + "source": [ + "---\n", + "## ⏩️ Next Steps \n", + "\n", + "At this point you have a recommender system that is able to generate a set of candidate videos for a user. However, many of these could be poor, as the candidate model was trained with only a few subset of the features. In the next notebook, you'll create a ranking dataset to train a *ranking model* to do more fine-grained predictions." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/4_train_ranking_model.ipynb b/advanced_tutorials/tiktok_recsys/python/Jupyter/4_train_ranking_model.ipynb new file mode 100644 index 00000000..b380f5bb --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/4_train_ranking_model.ipynb @@ -0,0 +1,373 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "18fd1ed3", + "metadata": {}, + "source": [ + "## 👨🏻‍🏫 Train Ranking Model \n", + "\n", + "In this notebook, you will train a ranking model using gradient boosted trees. 
" + ] + }, + { + "cell_type": "markdown", + "id": "a8839b46", + "metadata": {}, + "source": [ + "## 📝 Imports " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05197280", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from catboost import CatBoostClassifier, Pool\n", + "from sklearn.metrics import classification_report, precision_recall_fscore_support\n", + "import joblib" + ] + }, + { + "cell_type": "markdown", + "id": "229c5069", + "metadata": {}, + "source": [ + "## 🔮 Connect to Hopsworks Feature Store " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23ea515e", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3de921ac", + "metadata": {}, + "outputs": [], + "source": [ + "users_fg = fs.get_feature_group(\n", + " name=\"users\",\n", + " version=1,\n", + ")\n", + "\n", + "user_window_agg_1h_fg = fs.get_feature_group(\n", + " name=\"user_window_agg_1h\",\n", + " version=1,\n", + ")\n", + "\n", + "\n", + "videos_fg = fs.get_feature_group(\n", + " name=\"videos\",\n", + " version=1,\n", + ")\n", + "\n", + "rank_fg = fs.get_feature_group(\n", + " name=\"ranking\",\n", + " version=1,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "d854c2d2", + "metadata": {}, + "source": [ + "## ⚙️ Feature View Creation " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "017d4302", + "metadata": {}, + "outputs": [], + "source": [ + "# Select features\n", + "selected_features_customers = users_fg.select_except([\"registration_month\", \"registration_date\", \"registration_month\", \"user_id\"])\\\n", + " .join(user_window_agg_1h_fg.select([\"view_count\", \"dislike_count\", \"like_count\", \"total_watch_time\"]),\n", + " on=[\"user_id\"], prefix= \"user_\")\n", + "\n", + "fs.get_or_create_feature_view( \n", + " name='users',\n", + " query=selected_features_customers,\n", + " version=1,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65a8bc35", + "metadata": {}, + "outputs": [], + "source": [ + "# Select features\n", + "selected_features_articles = videos_fg.select_except([\"upload_month\", \"upload_date\", \"upload_month\", \"video_id\"])\n", + "\n", + "fs.get_or_create_feature_view(\n", + " name='videos',\n", + " query=selected_features_articles,\n", + " version=1,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26a85948", + "metadata": {}, + "outputs": [], + "source": [ + "# Select features\n", + "selected_features_ranking = rank_fg.select_except([\"user_id\", \"video_id\", \"interaction_month\", \"registration_date\", \"upload_month\", \"upload_date\", \"interaction_type\", \"registration_month\"])\n", + "\n", + "feature_view_ranking = fs.get_or_create_feature_view(\n", + " name='ranking',\n", + " query=selected_features_ranking,\n", + " labels=[\"label\"],\n", + " version=1,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0a474e88", + "metadata": {}, + "source": [ + "## 🗄️ Train Data loading " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a72b384", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_val, y_train, y_val = feature_view_ranking.train_test_split(\n", + " test_size=0.1,\n", + " description='Ranking training dataset',\n", + ")\n", + "\n", + "X_train.head(3)" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "9dd2dbba", + "metadata": {}, + "outputs": [], + "source": [ + "y_train.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "00f10105", + "metadata": {}, + "source": [ + "## 🏃🏻‍♂️ Model Training \n", + "\n", + "Let's train a model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8531f5b", + "metadata": {}, + "outputs": [], + "source": [ + "cat_features = list(\n", + " X_train.select_dtypes(include=['string', 'object']).columns\n", + ")\n", + "\n", + "pool_train = Pool(X_train, y_train, cat_features=cat_features)\n", + "pool_val = Pool(X_val, y_val, cat_features=cat_features)\n", + "\n", + "model = CatBoostClassifier(\n", + " learning_rate=0.2,\n", + " iterations=100,\n", + " depth=10,\n", + " scale_pos_weight=10,\n", + " early_stopping_rounds=5,\n", + " use_best_model=True,\n", + ")\n", + "\n", + "model.fit(\n", + " pool_train, \n", + " eval_set=pool_val,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9c007b37", + "metadata": {}, + "source": [ + "## 👮🏻‍♂️ Model Validation \n", + "\n", + "Next, you'll evaluate how well the model performs on the validation data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "412b3cc2", + "metadata": {}, + "outputs": [], + "source": [ + "preds = model.predict(pool_val)\n", + "\n", + "precision, recall, fscore, _ = precision_recall_fscore_support(y_val, preds, average=\"binary\")\n", + "\n", + "metrics = {\n", + " \"precision\" : precision,\n", + " \"recall\" : recall,\n", + " \"fscore\" : fscore,\n", + "}\n", + "print(classification_report(y_val, preds))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd127abe", + "metadata": {}, + "outputs": [], + "source": [ + "feat_to_score = {\n", + " feature: score \n", + " for feature, score \n", + " in zip(\n", + " X_train.columns, \n", + " model.feature_importances_,\n", + " )\n", + "}\n", + "\n", + "feat_to_score = dict(\n", + " sorted(\n", + " feat_to_score.items(),\n", + " key=lambda item: item[1],\n", + " reverse=True,\n", + " )\n", + ")\n", + "feat_to_score" + ] + }, + { + "cell_type": "markdown", + "id": "b599b46a", + "metadata": {}, + "source": [ + "It can be seen that the model places high importance on user and item embedding features. Consequently, better trained embeddings could yield a better ranking model.\n", + "\n", + "Finally, you'll save your model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab234527", + "metadata": {}, + "outputs": [], + "source": [ + "joblib.dump(model, 'ranking_model.pkl')" + ] + }, + { + "cell_type": "markdown", + "id": "26ad7e59", + "metadata": {}, + "source": [ + "### 💾 Upload Model to Model Registry \n", + "\n", + "You'll upload the model to the Hopsworks Model Registry." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbcba206", + "metadata": {}, + "outputs": [], + "source": [ + "# Connect to Hopsworks Model Registry\n", + "mr = project.get_model_registry()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4185f67f", + "metadata": {}, + "outputs": [], + "source": [ + "from hsml.schema import Schema\n", + "from hsml.model_schema import ModelSchema\n", + "\n", + "input_example = X_train.sample().to_dict(\"records\")\n", + "input_schema = Schema(X_train)\n", + "output_schema = Schema(y_train)\n", + "model_schema = ModelSchema(input_schema, output_schema)\n", + "\n", + "ranking_model = mr.python.create_model(\n", + " name=\"ranking_model\", \n", + " metrics=metrics,\n", + " model_schema=model_schema,\n", + " input_example=input_example,\n", + " description=\"Ranking model that scores item candidates\",\n", + ")\n", + "ranking_model.save(\"ranking_model.pkl\")" + ] + }, + { + "cell_type": "markdown", + "id": "2b4032da", + "metadata": {}, + "source": [ + "---\n", + "## ⏩️ Next Steps \n", + "\n", + "Now you have trained both a retrieval and a ranking model, which will allow you to generate recommendations for users. In the next notebook, you'll take a look at how you can deploy these models with the `HSML` library." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/5_create_deployments.ipynb b/advanced_tutorials/tiktok_recsys/python/Jupyter/5_create_deployments.ipynb new file mode 100644 index 00000000..c8c567ca --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/5_create_deployments.ipynb @@ -0,0 +1,785 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7a62c0de", + "metadata": {}, + "source": [ + "## 👨🏻‍🏫 Create Deployment \n", + "\n", + "In this notebook, you'll create a deployment for your recommendation system.\n", + "\n", + "**NOTE Currently the transformer scripts are not implemented.**" + ] + }, + { + "cell_type": "markdown", + "id": "9326c452", + "metadata": {}, + "source": [ + "## 📝 Imports " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "159659de", + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install -r requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "214a333d", + "metadata": {}, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "markdown", + "id": "ef743d42", + "metadata": {}, + "source": [ + "## 🔮 Connect to Hopsworks Feature Store " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4729a4f9", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "# Connect to Hopsworks Model Registry\n", + "mr = project.get_model_registry()\n", + "\n", + "dataset_api = project.get_dataset_api()" + ] + }, + { + "cell_type": "markdown", + "id": "d064e89f", + "metadata": {}, + "source": [ + "## 🚀 Ranking Model Deployment \n" + ] + }, + { + "cell_type": "markdown", + "id": "b1879f08", + "metadata": {}, + "source": [ + "You start by deploying your ranking model. 
Since it is a CatBoost model you need to implement a `Predict` class that tells Hopsworks how to load the model and how to use it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea029818", + "metadata": {}, + "outputs": [], + "source": [ + "ranking_model = mr.get_best_model(\n", + " name=\"ranking_model\", \n", + " metric=\"fscore\", \n", + " direction=\"max\",\n", + ")\n", + "ranking_model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb425246", + "metadata": {}, + "outputs": [], + "source": [ + "ranking_model.model_schema[\"input_schema\"][\"columnar_schema\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de1f5c67", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile ranking_transformer.py\n", + "\n", + "import os\n", + "import pandas as pd\n", + "\n", + "import hopsworks\n", + "from opensearchpy import OpenSearch\n", + "\n", + "import logging\n", + "\n", + "\n", + "class Transformer(object):\n", + " \n", + " def __init__(self):\n", + " # Connect to Hopsworks\n", + " project = hopsworks.connection().get_project()\n", + " self.fs = project.get_feature_store()\n", + " \n", + " # Retrieve the 'videos' feature view\n", + " self.videos_fv = self.fs.get_feature_view(\n", + " name=\"videos\", \n", + " version=1,\n", + " )\n", + " \n", + " # Get list of feature names for videos\n", + " self.video_features = [feat.name for feat in self.videos_fv.schema]\n", + " \n", + " # Retrieve the 'users' feature view\n", + " self.users_fv = self.fs.get_feature_view(\n", + " name=\"users\", \n", + " version=1,\n", + " )\n", + "\n", + " # Retrieve the 'candidate_embeddings' feature view\n", + " self.candidate_index = self.fs.get_feature_view(\n", + " name=\"candidate_embeddings\", \n", + " version=1,\n", + " )\n", + "\n", + " # Retrieve ranking model\n", + " mr = project.get_model_registry()\n", + " model = mr.get_model(\n", + " name=\"ranking_model\", \n", + " version=1,\n", + " )\n", + " \n", + " # Extract input schema from the model\n", + " input_schema = model.model_schema[\"input_schema\"][\"columnar_schema\"]\n", + " \n", + " # Get the names of features expected by the ranking model\n", + " self.ranking_model_feature_names = [feat[\"name\"] for feat in input_schema]\n", + " \n", + " def preprocess(self, inputs):\n", + " # Extract the input instance\n", + " inputs = inputs[\"instances\"][0]\n", + "\n", + " # Extract customer_id from inputs\n", + " user_id = inputs[\"user_id\"]\n", + " month_sin = inputs[\"month_sin\"]\n", + " month_cos = inputs[\"month_cos\"]\n", + " \n", + " # Search for candidate items\n", + " neighbors = self.candidate_index.find_neighbors(\n", + " inputs[\"query_emb\"], \n", + " k=100,\n", + " )\n", + " neighbors = [neighbor[0] for neighbor in neighbors]\n", + " \n", + " # Get IDs of items already bought by the customer\n", + " already_seen_videos_ids = self.fs.sql(\n", + " f\"SELECT video_id from interactions_1 WHERE user_id = '{user_id}'\", \n", + " online=True).values.reshape(-1).tolist()\n", + " \n", + " # Filter candidate items to exclude those already bought by the customer\n", + " video_id_list = [\n", + " video_id\n", + " for video_id \n", + " in neighbors \n", + " if video_id\n", + " not in already_seen_videos_ids\n", + " ]\n", + " \n", + " # Retrieve Article data for candidate items\n", + " videos_data = [\n", + " self.videos_fv.get_feature_vector({\"video_id\": video_id}) \n", + " for video_id \n", + " in video_id_list\n", + " ]\n", + "\n", + " ranking_model_inputs = 
pd.DataFrame(\n", + " data=videos_data, \n", + " columns=self.video_features,\n", + " )\n", + " \n", + " # Join candidate items with their features\n", + " ranking_model_inputs[\"video_id\"] = video_id_list\n", + " \n", + " # Add customer features\n", + " user_features = self.users_fv.get_feature_vector(\n", + " {\"user_id\": user_id}, \n", + " return_type=\"pandas\",\n", + " )\n", + " \n", + " ranking_model_inputs[\"gender\"] = user_features[\"gender\"].values[0] \n", + " ranking_model_inputs[\"age\"] = user_features[\"age\"].values[0] \n", + " ranking_model_inputs[\"country\"] = user_features[\"country\"].values[0] \n", + " ranking_model_inputs[\"month_sin\"] = month_sin\n", + " ranking_model_inputs[\"month_cos\"] = month_cos\n", + " \n", + " # Select only the features required by the ranking model\n", + " ranking_model_inputs = ranking_model_inputs[self.ranking_model_feature_names]\n", + " \n", + " return { \n", + " \"inputs\" : [{\"ranking_features\": ranking_model_inputs.values.tolist(), \"video_ids\": video_id_list}]\n", + " }\n", + "\n", + " def postprocess(self, outputs):\n", + " # Extract predictions from the outputs\n", + " preds = outputs[\"predictions\"]\n", + " \n", + " # Merge prediction scores and corresponding article IDs into a list of tuples\n", + " ranking = list(zip(preds[\"scores\"], preds[\"video_ids\"]))\n", + " \n", + " # Sort the ranking list by score in descending order\n", + " ranking.sort(reverse=True)\n", + " \n", + " # Return the sorted ranking list\n", + " return { \n", + " \"ranking\": ranking,\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "155add21", + "metadata": {}, + "outputs": [], + "source": [ + "# Copy transformer file into Hopsworks File System \n", + "uploaded_file_path = dataset_api.upload(\n", + " \"ranking_transformer.py\", # File name to be uploaded\n", + " \"Resources\", # Destination directory in Hopsworks File System \n", + " overwrite=True, # Overwrite the file if it already exists\n", + ") \n", + "\n", + "# Construct the path to the uploaded transformer script\n", + "transformer_script_path = os.path.join(\n", + " \"/Projects\", # Root directory for projects in Hopsworks\n", + " project.name, # Name of the current project\n", + " uploaded_file_path, # Path to the uploaded file within the project\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e76f3ab", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile ranking_predictor.py\n", + "\n", + "import os\n", + "import joblib\n", + "import numpy as np\n", + "\n", + "import logging\n", + "\n", + "class Predict(object):\n", + " \n", + " def __init__(self):\n", + " self.model = joblib.load(os.environ[\"ARTIFACT_FILES_PATH\"] + \"/ranking_model.pkl\")\n", + "\n", + " def predict(self, inputs):\n", + " # Extract ranking features and article IDs from the inputs\n", + " features = inputs[0].pop(\"ranking_features\")\n", + " video_ids = inputs[0].pop(\"video_ids\")\n", + " \n", + " # Log the extracted features\n", + " logging.info(\"predict -> \" + str(features))\n", + "\n", + " # Predict probabilities for the positive class\n", + " scores = self.model.predict_proba(features).tolist()\n", + " \n", + " # Get scores of positive class\n", + " scores = np.asarray(scores)[:,1].tolist() \n", + "\n", + " # Return the predicted scores along with the corresponding article IDs\n", + " return {\n", + " \"scores\": scores, \n", + " \"video_ids\": video_ids,\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "43e26068", + "metadata": {}, + "outputs": [], + "source": [ + "# Upload predictor file to Hopsworks\n", + "uploaded_file_path = dataset_api.upload(\n", + " \"ranking_predictor.py\", \n", + " \"Resources\", \n", + " overwrite=True,\n", + ")\n", + "\n", + "# Construct the path to the uploaded script\n", + "predictor_script_path = os.path.join(\n", + " \"/Projects\", \n", + " project.name, \n", + " uploaded_file_path,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "17ac4124", + "metadata": {}, + "source": [ + "With that in place, you can finally deploy your model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "913b28d2", + "metadata": {}, + "outputs": [], + "source": [ + "from hsml.transformer import Transformer\n", + "\n", + "ranking_deployment_name = \"rankingdeployment\"\n", + "\n", + "# Define transformer\n", + "ranking_transformer=Transformer(\n", + " script_file=transformer_script_path, \n", + " resources={\"num_instances\": 1},\n", + ")\n", + "\n", + "# Deploy ranking model\n", + "ranking_deployment = ranking_model.deploy(\n", + " name=ranking_deployment_name,\n", + " description=\"Deployment that search for video candidates and scores them based on user metadata\",\n", + " script_file=predictor_script_path,\n", + " resources={\"num_instances\": 1},\n", + " transformer=ranking_transformer,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9151e785", + "metadata": {}, + "outputs": [], + "source": [ + "# Start the deployment\n", + "ranking_deployment.start()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f267cd6e", + "metadata": {}, + "outputs": [], + "source": [ + "# Check logs in case of failure\n", + "#ranking_deployment.get_logs(component=\"predictor\", tail=200)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "391032d2", + "metadata": {}, + "outputs": [], + "source": [ + "def get_top_recommendations(ranked_candidates, k=3):\n", + " return [candidate[-1] for candidate in ranked_candidates['ranking'][:k]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d786e76e", + "metadata": {}, + "outputs": [], + "source": [ + "# Define a test input example\n", + "test_ranking_input = {\"instances\": [{\n", + " \"user_id\": \"ED267E\",\n", + " \"month_sin\": 1.2246467991473532e-16,\n", + " \"month_cos\": -1.0,\n", + " \"query_emb\": [0.214135289,\n", + " 0.571055949,\n", + " 0.330709577,\n", + " -0.225899458,\n", + " -0.308674961,\n", + " -0.0115124583,\n", + " 0.0730511621,\n", + " -0.495835781,\n", + " 0.625569344,\n", + " -0.0438038409,\n", + " 0.263472944,\n", + " -0.58485353,\n", + " -0.307070434,\n", + " 0.0414443575,\n", + " -0.321789205,\n", + " 0.966559],\n", + "}]}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f548e470", + "metadata": {}, + "outputs": [], + "source": [ + "# Test ranking deployment\n", + "ranked_candidates = ranking_deployment.predict(test_ranking_input)\n", + "\n", + "# Retrieve article ids of the top recommended items\n", + "recommendations = get_top_recommendations(ranked_candidates, k=3)\n", + "recommendations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1f2888e", + "metadata": {}, + "outputs": [], + "source": [ + "# Check logs in case of failure\n", + "#ranking_deployment.get_logs(component=\"transformer\",tail=200)" + ] + }, + { + "cell_type": "markdown", + "id": "2ebce4de", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": 
"5b8dafe6", + "metadata": {}, + "source": [ + "## 🚀 Query Model Deployment \n", + "\n", + "Next, you'll deploy your query model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08e5295a", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve the 'query_model' from the Model Registry\n", + "query_model = mr.get_model(\n", + " name=\"query_model\",\n", + " version=1,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8b4889d", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile querymodel_transformer.py\n", + "\n", + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "from datetime import datetime\n", + "\n", + "import hopsworks\n", + "\n", + "import logging\n", + "import json\n", + "\n", + "\n", + "# Calculate ondemand features the sine and cosine of the month of interaction date\n", + "def month_sine(interaction_date): \n", + " # Calculate a coefficient for adjusting the periodicity of the month\n", + " coef = np.random.uniform(0, 2 * np.pi) / 12\n", + "\n", + " #month_of_purchase = datetime.strptime(transaction_date, \"%Y-%m-%dT%H:%M:%S\").month\n", + " month_of_interaction = interaction_date.month \n", + " \n", + " # Calculate the sine and cosine components for the month_of_purchase\n", + " return float(np.sin(month_of_interaction * coef)) \n", + "\n", + "def month_cosine(interaction_date): \n", + " # Calculate a coefficient for adjusting the periodicity of the month\n", + " coef = np.random.uniform(0, 2 * np.pi) / 12\n", + "\n", + " #month_of_purchase = datetime.strptime(transaction_date, \"%Y-%m-%dT%H:%M:%S\").month\n", + " month_of_interaction = interaction_date.month \n", + " \n", + " # Calculate the sine and cosine components for the month_of_purchase\n", + " return float(np.cos(month_of_interaction * coef))\n", + "\n", + " \n", + "class Transformer(object):\n", + " \n", + " def __init__(self): \n", + " # Connect to the Hopsworks\n", + " project = hopsworks.connection().get_project()\n", + " ms = project.get_model_serving()\n", + " \n", + " # Retrieve the 'users' feature view\n", + " fs = project.get_feature_store()\n", + " self.users_fv = fs.get_feature_view(\n", + " name=\"users\", \n", + " version=1,\n", + " )\n", + " # Retrieve the ranking deployment \n", + " self.ranking_server = ms.get_deployment(\"rankingdeployment\")\n", + " \n", + " self.logger = logging.getLogger(__name__)\n", + "\n", + " \n", + " \n", + " def preprocess(self, inputs):\n", + " # Check if the input data contains a key named \"instances\"\n", + " # and extract the actual data if present\n", + " inputs = inputs[\"instances\"] if \"instances\" in inputs else inputs\n", + " \n", + " # Extract customer_id from the inputs\n", + " user_id = inputs[\"user_id\"]\n", + " interaction_date = inputs.pop(\"interaction_date\")\n", + "\n", + " # Get customer features\n", + " user_features = self.users_fv.get_feature_vector(\n", + " {\"user_id\": user_id}, \n", + " return_type=\"pandas\",\n", + " )\n", + "\n", + " # Enrich inputs with customer age\n", + " inputs[\"gender\"] = user_features['gender'].values[0]\n", + " inputs[\"age\"] = user_features['age'].values[0] \n", + "\n", + " # Calculate the sine and cosine of the month_of_purchase\n", + " interaction_date = datetime.strptime(interaction_date, \"%Y-%m-%d %H:%M:%S\")\n", + " \n", + " # Calculate the sine and cosine components for the month_of_purchase\n", + " inputs[\"month_sin\"] = month_sine(interaction_date)\n", + " inputs[\"month_cos\"] = 
month_cosine(interaction_date)\n", + " \n", + " inputs[\"country\"] = user_features['country'].values[0]\n", + " inputs[\"user_dislike_count\"] = user_features['user_dislike_count'].values[0]\n", + " inputs[\"user_like_count\"] = user_features['user_like_count'].values[0]\n", + " inputs[\"user_total_watch_time\"] = user_features['user_total_watch_time'].values[0]\n", + " inputs[\"user_view_count\"] = user_features['user_view_count'].values[0]\n", + " \n", + " return {\n", + " \"instances\" : [inputs]\n", + " }\n", + " \n", + " def postprocess(self, outputs):\n", + " # Return ordered ranking predictions\n", + " return {\n", + " \"predictions\": self.ranking_server.predict({ \"instances\": outputs[\"predictions\"]}),\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96011659", + "metadata": {}, + "outputs": [], + "source": [ + "# Copy transformer file into Hopsworks File System\n", + "uploaded_file_path = dataset_api.upload(\n", + " \"querymodel_transformer.py\", \n", + " \"Models\", \n", + " overwrite=True,\n", + ")\n", + "\n", + "# Construct the path to the uploaded script\n", + "transformer_script_path = os.path.join(\n", + " \"/Projects\", \n", + " project.name, \n", + " uploaded_file_path,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9da61600", + "metadata": {}, + "outputs": [], + "source": [ + "from hsml.transformer import Transformer\n", + "\n", + "query_model_deployment_name = \"querydeployment\"\n", + "\n", + "# Define transformer\n", + "query_model_transformer=Transformer(\n", + " script_file=transformer_script_path, \n", + " resources={\"num_instances\": 1},\n", + ")\n", + "\n", + "# Deploy the query model\n", + "query_model_deployment = query_model.deploy(\n", + " name=query_model_deployment_name,\n", + " description=\"Deployment that generates query embeddings from user and video features using the query model\",\n", + " resources={\"num_instances\": 1},\n", + " transformer=query_model_transformer,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ed2dcc62", + "metadata": {}, + "source": [ + "At this point, you have registered your deployment. 
To start it up you need to run:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee977d2f", + "metadata": {}, + "outputs": [], + "source": [ + "# Start the deployment\n", + "query_model_deployment.start()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a42f4d7f", + "metadata": {}, + "outputs": [], + "source": [ + "# Check logs in case of failure\n", + "# query_model_deployment.get_logs(component=\"transformer\", tail=20)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f410c3d", + "metadata": {}, + "outputs": [], + "source": [ + "# Define a test input example\n", + "#data = {\"instances\": {\"user_id\": \"ED267E\"}}\n", + "\n", + "# Define a test input example\n", + "data = {\"instances\": {\"user_id\": \"ED267E\", \"interaction_date\": \"2024-02-10 15:33:11\"}}\n", + "\n", + "\n", + "# Test the deployment\n", + "ranked_candidates = query_model_deployment.predict(data)\n", + "\n", + "# Retrieve article ids of the top recommended items\n", + "recommendations = get_top_recommendations(\n", + " ranked_candidates['predictions'], \n", + " k=3,\n", + ")\n", + "recommendations\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2f56f00", + "metadata": {}, + "outputs": [], + "source": [ + "# Check logs in case of failure\n", + "#query_model_deployment.get_logs(component=\"transformer\",tail=200)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c38a70e5", + "metadata": {}, + "outputs": [], + "source": [ + "#ranking_deployment.get_logs(component=\"transformer\",tail=200)" + ] + }, + { + "cell_type": "markdown", + "id": "280d386f", + "metadata": {}, + "source": [ + "Stop the deployment when you're not using it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a44ce70d", + "metadata": {}, + "outputs": [], + "source": [ + "# Stop the ranking model deployment\n", + "ranking_deployment.stop()\n", + "\n", + "# Stop the query model deployment\n", + "query_model_deployment.stop()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82be7ad5", + "metadata": {}, + "outputs": [], + "source": [ + "inputs = data[\"instances\"][0]\n", + "\n", + "# Extract customer_id from the inputs\n", + "user_id = inputs[\"user_id\"]\n", + "interaction_date = inputs[\"interaction_date\"]\n" + ] + }, + { + "cell_type": "markdown", + "id": "6d72050a", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/README.md b/advanced_tutorials/tiktok_recsys/python/Jupyter/README.md new file mode 100644 index 00000000..40beeb04 --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/README.md @@ -0,0 +1,7 @@ +*This is an auto-generated README.md file for your Dataset!* +To replace it, go into your DataSet and edit the README.md file. + +*Jupyter* DataSet +=== + +## Contains Jupyter notebooks. 
\ No newline at end of file diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/feature_monitoring.ipynb b/advanced_tutorials/tiktok_recsys/python/Jupyter/feature_monitoring.ipynb new file mode 100644 index 00000000..fabe9b85 --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/feature_monitoring.ipynb @@ -0,0 +1,262 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9018b425", + "metadata": {}, + "source": [ + "![Screenshot from 2022-06-16 14-24-57.png](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAfgAAABsCAYAAACGqKCeAAAABHNCSVQICAgIfAhkiAAAABl0RVh0U29mdHdhcmUAZ25vbWUtc2NyZWVuc2hvdO8Dvz4AAAAmdEVYdENyZWF0aW9uIFRpbWUAdG9yIDE2IGp1biAyMDIyIDE0OjI1OjAzyRXP1gAAIABJREFUeJzsnXd8HNX1t5+Z2Srtqsu94V5wxXLBxphqehJKINTQCYTQXkIJhF4DIRQDAUI1HWzTm8GAaS6yZRv3brnJltW10u5Oef+YveNZaZtciMNvvp9sLHZnbr+n3XPOlQzDMHDgwIEDBw4c/Kog/7cb4MCBAwcOHDjY+3AYvAMHDhw4cPArhMPgHThw4MCBg18hHAbvwIEDBw4c/ArhMHgHDhw4cODgVwiHwTtw4MCBAwe/QjgM3oEDBw4cOPgVwmHwDhw4cODAwa8QDoN34MCBAwcOfoVwGLwDBw4cOHDwK4TD4B04cODAgYNfIRwG78CBAwcOHPwK4TB4Bw4cOHDg4FcIh8E7cODAgQMHv0K4/tsNSATDMJAkCftNtqnutJXAel6SpH3ePgcOHDhw4GB/h/Tfvg8+GTPXY/+tSFJapm0YBpqhI0syiZ50mL4DBw4cOPi/hv8agxfVSpKEbhjohpGUmYc1lYiu0qypRHTNZP4SuCUZr+Im3+OPe14zdABkJEezd+DAgQMH/yfxi5vo7czWQDB2GTn2XW2kiVX1lSyq3cKyuu1sDtXSoEZo1CI0qhEa1bCl3XsVNwGXhx7Z+YzI78IhxT0ZktcRt6wAMUZvGAi93mH0Dhw4cODg/wp+MQ2+pcZuGAaKbPr4VYYbmVmxmg+3LGVu1UYa1AghNUpYV1EkGUWSkCUJWZJjWrlZptD8w7qKjES2y0O/YDGTOvbjyA59GZHfxXoOQJZA9NZh9A4cOHDg4NeMX4TB28/ZdRtjX1VXyZQNpXy4ZSlrGnYiAT7FHWPoplZvYBD7H/H/j3XeLksShmFq7BFdI6RGKPJlc3i7PlzUazQHF/UAQNU1FEl2zPYOHDhw4OBXj33O4AUj1Q0DA9Mcv725gUdXfstrGxZQFQnhk134FDcAegt/+UwYsf0ZCZAlmWiM0XtkhVO6DOb6AYdzQKAAzdCRkGJCgcPkHThw4MDBrxO/qAYP8MaGBdy39CvWNVYRdHtwSwq6qafvNYYrypGR0DGojzbTwRfkhgGHcX7P0QBouo4iy3FHBw4cOHDgwMGvBfucweu6jiRJbAzVcN/SL3mrfCFuScbvcqPFzuL3FXMVZSuSRFjTaNKinNNjBHcPPo5cjw9V13DFHPIcbd6BAwcOHPyasM8z2QmT+9cVq/nP2jn4FTdZLg+qbnq470umKkmmV52q67hlmVy3jxfXzeU3s55nfvVmXLKCqmsJY/EdOHDgwIGD/2XsMw3ezjQNw0CWZd7cWMb1ZR/QpEbJdnv2uQafqD2KJFEfDVPgyWLyQSdzdMd+aLqOLDkx8w4cOHDg4NeDvc7g7WfahmGgxxzrhDl81va1XLVgOmvqd5Lr8QFmGJuERMI0dHsZRizuPqxFkSSZ+4cex3kHlKDqupVox2HyDhw4cODgfx17lcFbWjugGzqKZJ4AaLG/w5qKV3GxOVTLHT9/ztvlC/EqLnyK65fT5g3ToU+WJFRdp0mLcseBk/hLv0McJu/AgQMHDn412Ctn8IaNORuGga6bDF03dJ5bM5sTvvkPMytW41VcRHWNzlm5PDPqNJ4ceQpFnmyqw00YBpZAsE/PwmMMXDcMXLJMwOXhtp8/4/k1c3DJMpqhO2fyDhw4cODgfx57rMHbNV3T1G6a53+oXM9dS77gp8oNAARcXm4aeASX9zkYgKim4lZcbAnV8fCKr3ltw3zCmka222OFtwlte1962csxZh/RNf4z6jRO7HygkxDHgQMHDhz8z2OPGLz9VZFT3sDgsVWzeGDpTJo1laDbC5KZRa4hGuGM7sO4/cBJdPTnENFUPIqZDv/7Het4cPlMfqrcQETXCLi9yEhWvvp9zeSjMR+BNw8+h4OLeljmenBi5PcWWvpnJIIjVLVGonFzxumXQyqLnjMP/5v4vzKnu83g7QMgksbURZu5qnQ6UzctJuD24BJJbKzEM1AbbeaA7AJuHXQUJ3cdAmCdzQN8tnU5L66bx8ztq2nWVPyKG6+i7NMzepPJyzRpEbpl5fHRhIvp4A/GJcPZ15PdMuogE0gZ+AvsTrkA8h72O1Fdog2GYSDJspWy2MBA1/Q45iU+4r/3NCHR7o5DKtjbsreIQkvCYx8zAEVR0PXWY2X/O9VY7c44iCiYTMvWdT2zcollnkyTcGp35y6T/dGyHvFesvGXJAlFUcwrqjWtVV0t1+yerIe2jmem2JM27m06taf7MlMa2PK/7fW5XLvuXNO0XWHT9vm01/e/hD3W4EVu+W1NdVw69x2+3rGGPLc/jrGLZ0WYWpMaRTV0/tBtONf0m0CvYBGGYaAaOm7Z3Dyl1Zt4c0MZ723+mW3N9eS6/SDtIgp7G4Zh4JJkaqJNnNh5EC+MOgNJ2pX69tci0e1rtIqiMMyjFkVRWj2r6zq6ruNSXEknVTyjKMpeYfT7AqqqIsm7bkQUyLSN9i0Y58eSYMxSQTD+dMx4X6xlO6Fua9mZCKh72q50z8Cusdc0DVmWkeXELkrpylRV1ZoDO/avNashydJur9m9gZbC7J4K74m+F+WKPSVJUqt5tQvMifacpmn/s8x+txi8ETtrF6bzejXM2T++ylfbV1PkzY5d0wq0HABjlwc7QF20mWJvgMt7H8xFvcaY5nzDQDNsF9LU7+CldfN4cd1cs8ES7Kt4OiN2zFAdCXHn4GO4ut8E6zzerHvfTahYpJs2bWLDxo0pibt4duCAAeTk5GREIHVdZ+GiRTQ1NSUlXGAu5pycHAYfeOAeSfctN1NdXR0LFy1iydKlrFq1isrKnYRCIXTDZPB+v5+srCxycnPo1KED48aNo2+fPgSDwbi22ZlXW9qm6zqyLFO+aRMb04xvqv
653G5yAgECgQDBYJBAIGD1URAKUXZbNMdEjD0UCrF8xQpmz5nDunXrqKmppaGhHq/Xh9vtpqioiO7dujF06BD69+tHXl6eVZ7ob8uxEn9rmsaCsjLUqJpyO0mSRCQSoX+/fhQVFcVpNy37IFC+aRObyjehuJSURzHRaJROHTvSs2dPa72kKrdy505Wr15tCtwkJ1u6buDzeRk6ZIilbWdi4WopVNXV1bFi5Uq+/+EHtlVUUFdbRygUoqmpCUVRyA5kk52VTU4wSP/+/Rk/bhwdO3bA7Y7dq5FA4Nqd/bT455+pr6tPOZ6ZlFVcVEx+fh65ubmW1trWNorfVVVl/oIFFvNLBk3T6N6tG126dGm1Du1lyrJM2cKFhEKhlPTJfMG0+jQ1N9G7Vy86d+7cqt125i4sIKLcrVu3Ma90HnPnzWPr1m3U1NRYzwdzgrQrLmbAgIGUHHQQPXp0Jzs72xorezn/C0pfmxm8nWEYEqi6zgWz3+TDrUspcPuJGjrYrnRNVYaZQlYlpEYZlNue8w4o4ZSuQyjymgMa1TTcsQ137k+v8eHmpQTc3hRbe+9A13VcsszHh17MoNwOqLqOSzZvrNtXE6qqKi6Xi0cff4J77r2XnJycpKY53TD9Bl6fMoXRo0ehaVpShiUIfTgc5uhjj2P16tX4fL6EhEKWZerq6hgzZjTvT5tmtSkdWm4mO5MrnT+fTz79lBlffsXatWsxDINoNGoxfztxFW1VFAW3280BPXowevQoxowezVFHHUWW32/1qaV5PB2i0Shut5t/PfYY99x7H7m5uW02fQrrkWBEXTp3pscBPRg6eAgTJx7K8GHDgLYJIi1NlLIsU1lZyTtTpzFt+jRWrVpNJBJB0zSLAdkZuMvlQpZlOnXqxIRDDuHYYyYx8dBDrXGCeBO4vb6LL7mUDz/+mEAgkJRxKIpCZWUld9x+G1decUVK7Vasl/MuuIBPPvk05RqWZZnaujomHXkkLzz/H2u9JCpXzN2DDz3EAw/+g/z8/KTlSpJEY2MjBx88lnfefNP6LpXgECdUNTXxxRdf8ONPP/HT7Dms37CBaDSKHht/0XfDMNB03VwPsoxLUQgEAgwdMoRRo0o4fOJEhtnWg1jvbdFWxXie+JvfMnvOnJTjmQ5i3vNycxkwYADDhw/n8MMmMmL4cKuNmaxZMQY1NTWMm3Ao9Q0NuJLQHkFPbvnbzfzlz39OSE/Edx98+CFXXPmXtPRG0JZQKESfPn149eWXLAafiPHaaeM3s2Yxbdo0Znz5FXV1dURVFUPMvXheVUGScLtcuFwu+vTuzeGHH8ZJJ5zIwIEDrDbvzxZFO9JT7xawiJFkhrXdv/wrPtiyhAJPFlFDz6ij4hkzhaxCvtfF6oad3LjoI55a/QOndxvGuT0OoqM/B1XXkCWJgMuLOZYS7EMWbxgGLlmhTm3mnqUzeGX0mciwT5k77CpbkWXcbjculyvpZhYLWCzoVO2yn7e5XC6r7GQMXvyerlx7W8Sz9s30/Y8/8tRTTzN33jyqq6vJzs7GH2PQwoLTsgX2DaPrOuvWr+fnJUuY8trr9O7Vi3PPPosz//AHvF5vK4KU6TjIGYxvJjAMgw0bN7Jq9Wo++eRTnv73M4wdM5rr/991DBo0yGLYydrXUigS8/nm22/z6GOPs3btWjweDz6fqa3bzxrtYyX+u6Kighdffpm33n6bUaNGcdkll3DYRJPRtxQAVVXF7XZz+OGH8f6HH+HxeOLOk+2QZZlAIMC80vlAYhOmaIeiKFRVVbF27Tr8fj+KoiTVxiRJIicYZMWqVVRWVtKxY0e0BPMhmK+mqiz5eSkerzfl3AkG/JsTT0KW5YRMJZHWHmpqYsqrr/Laa6+zZt06ws3NZGdn43G78Xm9cWNu74O9/6qq8v0PP/DVzJk88+yzjBo1ij9ffjmjSkri5sG+NlJB/G7ft3t6Hl9bV8c3337LjC+/5NnnnmPkQQdxzVV/YeTIka3GJlH7WtETlyupciH2Wks6Zd/jLpeLjRs3csedd1vPp9I5ZVkmEolQWFjIE4/+iy5duli0wF62fW63bdvG/Q8+yHvvf0BTUzOBgEmLslrsKdFGMT+GYbBi5UoWLlrEK1NeZdKko/nTpZfSp3dvq/1CaNtftfk2xcGLjmgxU/ZnW5fzyIpvyXX70TA197ZADKaq6/gUFzkuHxXNDdy6+BP+sfxrZElGkWQ03eDn2m24FSWlaW5vQJIkNEMn6PLy2dYVfLptObIsW05+DnbBbq4VxKuquoqbbrmFc879I1/NnImu6xQWFuLxeHatH11HjWlE4qNpmqWlCiLm8XgoLCwky+9n7dq13Py3Wznz7HOYV1oap8kmMvslb/Te6bskSXg8HoLBIAUFBei6xuczZnDyab/n2ef+YzHuZKZn8Z34PRqNcv0NN3D1tdeyZcsW8vLyLIFI9FNVVatM8bcYU5fLRUF+Pm63m++//55zzjuPK678C5s2bWrFVAQxHFVSQudOHQlHIkn7qes6brebn39eTFV1dSvBQvRF9GPpsmVs2bLFmu9kEG3eum0b6zdsMJ9NsMeEZra9spIly5bi9/lSMjlN18nKymLM6NFW21qag1tamn6aPZszzjyLv992O+vWryfL77fWrBiDRGtUVVXrv0W52dnZFBYWoqoaM2Z8yVnnnMsdd91FXV2dKajEBKmW7fqloMQsDQUFBWiaxtfffMNpp5/Bvx591GqbmMt91T572aqqcvMtt7Jl6xZ8aeYWTCFJ03X+8cD9DB48OM7foaXQrCgKC8rKOPnU03j9jTdxu93k5+dZTLnlnmq5rwB8Ph8FBQWEw2Fee+11fnfqaTwxebJVb7L9vb+gTQxeDKIiSdRGmrjj589jOdyFhrsbLZB2JZ7RMfApLrIUL12y8qxNsyPcwKZQDS5JZl9q77uaJCFhtun5tXOIaGqsjw6TF7BL+tFoFEVRmD1nDif99mReeOFFFEUmJyfHYv4txy0Rw0tUh6qqGIaBz+cjNy+X2XPmcObZ5/DE5Cctba1Nm2wv7kPDMCyiL8lmfzVN45bbbuPue++LY6wt144gMKKP113/V1548SVyc3Lxer1xgo7V9DTHEqqqAhAIBMjOzmbqtGn89pRT+fiTT+KEATFmffr0oX///oTD4ZRn1C6Xi+rqGubMmZNQW7FrTStXrqSmpiaplahlf3RNY+7cudb7ycpdvnw5mzdvTik4yLJMY2MjB40YQffu3VqtCztzF9//69FHOfucc1kwfz55eXnW8ZVYd4nanOhvAbEe5Nh6AHjy6X/zu1NPswRTu6f2f4Oe2NsYDAZxeTzcc9/9/O3WW6353VdM3l6+oig8+dRTzPjyK3JjeyedFa6hsZFbb76JYyZNamUqb6lsLFy4kAsvvoTyTZsoLCwEdu0Re5mp/lsIdrIsk5+fT1MoxN333sfZ553H6jVr4vbx/oiMGbxFpDAH4Zk1P/Fz7TayFPdeiVPftQkhamh08udYE7eusYqIrsVo8y8jKWmGQdDt5dsda/lp50Yz+Y60/561/JJoaQZzu
918MWMG551/ARs2bCA/P9/aaHsKMd6C4QUCAXRd56577uWqa64h1BSyfs9IANvL+9C+HgRhycvN5fHJk3nhxZessDb7s/bxUxSFx554gjffeovi4uI4xt7WtWY/dzQMg7y8PHburOLSP11O6fz5ccKQGK/x48ejxwhrMoYmyzKhUIj58xck1OBFPyRJomzhIktAyfS4rqxsYasjCPGbINyzZn2XUeimoeuMHj3K0gYTMXcwCf2NN9/MfQ88iCTLZMfW1e6Ovb3NAoJhFeTns3LVKs4651ymv/++ZWq3r4NfEq3aCBQWFvLiSy/z1NNPt+kYoS2wz4HL5eKHH3/k0cefIBgMmP4MKYRMRVGoqa3l/PPO5aILL0zI3AU9kmWZ7du3c/W117F9+3YCgQDRaLRV3zOB/XlVVXEpCoWFhXz++Rfccuutez2EcW8jcw1e2uXctaGxmmfW/ETA7UXby4tAkuLjTyVJok4N7/V6MmmHhISqa7xdHiNAxi+gxe+fgmAc7OZlRVGY8eWX/OnyKwiHw2RnZ1tS8t5dF/EhTPn5ebz2+hv87ZZb45hOujr35dzZGWcwEOD+Bx9k8eLFCWPXxdiVLVzIU089TW5ubpxmt6ftECbvpuYQvz/tVIYOGWIxHHsdRxx+GIFgMK32pCgKPy9ZQiQSsRhAyzpDoSZKS0vxer0ZjbOwDqxeu5bq6uq488xd82l+t6CsLDVjj62NQCDAIePHx2l0on12IfDmW27lxZdfiRNG9zZDE2Wpqkp2VhbRaJSrr7mWL2bMsNbEf9u8a2eMgUCARx59jLKFC1ut2b1Vl+hzVVUVf7v170Sj0Tjnw5YQa6SmpoYjDz+cv99yS0KHwJaC4YMPPcyy5csJBoNmGOteGmMDaG5upkOH9lx15V8sn4H9VfHLiMEbhoFkSBiS2ZGnVn/P9nA9bmnvnYmb1gGzNJ/sItftswatIRpGz9CBb+/BXDRuWeGnnevZGW5ElpI7n+zFavdrCAIptNUlS5ZwxZV/QdN1y/mtLUw2EYFLttntWp4423/t9Td5fPLkOIKUiihlMnd2Rmz/pGufeMcwDNxuF/X19Tz62ONW1IBdYxPE7rn//IfGUChlOJe9PeJYwh5n3bI9QuOpb2hg1MgS7rrjDrxer/W+vT8H9OjBgQMH0tTUlHRsdF0nOzubhYsWsXnzllZ9Ef+uXr2aysrKhAJAsnLdbjfbtm1j5apVSBJxTM8UgmTWr19P+aZNKR2wJMn0Y+jRozsjhg+3+moXFoRQ9ehjj/Piyy+Tn5cXd36eSoNsWVem60E8r2kaHo8HSZK4+trrWLpsWZy5fk8Z6d5Ysy6Xi1AoxGOPP9F235YM2yiOiG67406WL19OVnZWUmuPWMcNDQ0MGNCfRx/5Jz6feQNpS2uOXXhYtmw5H3z0UVrmLvpl31Op9pWop7m5mZtvvJGxY8dYwsb/hInevtBaEmENHQWZTaEaPtyyDL+8d0zzoi5JknAjUxVpon9OO0YXdjeTpECrZAy/FAzJwK+4WVNfxQ+V60Fi3zvb7ecM3i41Nzc3c+ttt9PY2IjP50vL3MW4Ce9qXdeJRCJWGJj4LZm5uGUbdF0nNzeHfz7yL7759ltcLleclro7sDPfSCRCOBwhHAlbJj57+1IxG1XVCQaDfDlzJstXrIhjNsKMuHbdOmZ+/Q3Z2dkpiZz4vqmpidraWmpr66ivr6epuTmOkYnnRVhku3btePihf5CVldWKEAmm4/P5GDNmTMq5s0ykNTUsWba0hYa9y+xdtrCMUCxGPFPzvNvtpqqqipUrV2IY8V7p4mMKFpuTWgZEn0OhEIdOmGA6xdr8Puxm4Z9+ms0TTz5JXixMMq3JP/a78BTXNI1IJGKGztnCOsWzqfqqaRper5fa2lquv+FGGhsb9zhjpL2NqqoRDoetPRUOhy2hJh0TEu3Lzs7m21mzWLBgQdyRzp5A1Cva8uprr/Pu1Kmmz4qaeN3Z13FBQQFPPv44RUVFcZEILQUY8d20996jrrY2o1wiuq5TX18f21e11Dc0EA5HWjnuCQGorq6Oc885h7PPOivOkvA/ocG3NGkJ2Afv063L2Riqxqu4MaQ9Z3SGYXrkS8DOSBND8zry+IjfUewLoOkm0Q+6vGae+19YSjKN9BIRQ6O0ehMSkqXF77M691NJUMAuhT//4ot8/8MPBAKBtGYwwSQwDOrq6qitrcXn81FUVES7du3Iipkwq6qqrPM18V4i2AUNTdO474EHaYwlydgToiQ2vcfjoUP79nTs2IGOHTqSl5eHpmlUVVdbhD3NSCFJEuFwmC9mzIjbQxbjWriQqqqq1A5psX5EImFKSkq49567ufuuOzjzD3+gb58+hEJNhGIWAPu46LrO3XfcQZ/evS2P35b9FM8eMn4cfr8/oTOkvUxJkvjxx5/iaIRd+11QtpBwJNImgdwwDDweD3PnlQJGnCAk2rxgQVnKs3FJktB0nUB2NoeMH2+Va7cESJJEczjMPffdRyQSSctY7cKoqqpUVVURjUbJysqiXbt25MciFurr66mrqwPImIkGAgHmzZvHc88/n1ZYzASij7m5OXTp0pkO7dvToX17OnXqhM/ns9qezrIiBJn6+no+/2KGNQ570jbhswVmVMySpUu557778Pv9ZrkphFohgN1/773079/fWsfJhDxFUYhEIpZwIvqUrHxN03C73Zx7zjk8+sg/ufuuO/ntSSfSoX076uvrLTpkMneF2tpaRo0q4bZbb0HT9Lg9tL8iLkDULvHaF79gwqqu8fHWZXgkF0aM/e0uRPkuWaZRNbWjC3uWcPOgIyn2Bojqu2Ib8zx+6yrXPamzrZAw/Q78iovZlRvjctPvszr3U0kQdjnVybLMpk2beObZ5yynt0yYeygUwuP1csLxxzNx4qEM6N+fosJCFJeLHTt2UFFRQen8BUydOo1NWzYTDARarUU7BBEIZGezoKyMt995hz+ee66ZhlNKbu5ONXvCMjF48IFMfuyxmFkYautqKS8v5+tvvuXtd94hFGrC50vuTNaSKV579dWtGOLCRYvSMhpFMj2HTzj+OJ547LG4mO5IJMLnn3/B5KefoqxsoRm37fFQVVXF9dddx3HHHWtpPIn2tNA+RgwfzgE9erAmFnufDLIss6CsjHA4bJnLxdzW1dWxfPlyfF6vZXnLBEKYWrhoEdFoNC6cUsSxzy2dlzKESsxZzx49rGRDdg1MtPH1119n7rxS8vJy01osJEkCyYwb79a1K5decjHDhw2nfft2FBcXE24Os6NyB+Xl5Xzy6Wd8NXMmqqpa1pJkZVtrNpDN8y+8yCm/+x2dO3e29tXu7H+Rf+CmG/7KqaecYlpRZBlZUdi0aROzZn3HlNdeY8vmLSlN4gIul8uaZzEfu9MuwzCQbFarxlCIm27+G3V1dQQCgaTjJKxStbW13HnH7Rx7zCSi0SgulyspPRDrpbKyko2byi1Hz0QQ72qaxj8euJ8Tjj/e+u3C889nx44dvPHWWzz/wotUVFSQm5tLc3MzHTt25J8PPWRZK2U5dZbE
/QFxYv3mzZuJJIiJFVJYRXMDP1VuQJbicxhnyvDsz7lkGU3XqQ430SdQxH9G/Z5HRvyWYm/ASoAjmHknf05Mg//lfdAMycxRv6ahkrpoc4xB/N9j8HbtU5IkXp4yhW3btmUU7yy0gn59+/LS88/zzNNPceYZZzB82DC6du1Kp44dGTpkCEcfdRQ33fBX3ps2lXPPPpvm5ua4ehNBaG8+r5fXXns9lkZ0lxaY6PlUIyzq8rg9tG/fnuLiYtq1KzYzWh12GHfefhtvv/EGvXr2oLm5Oa0mrygK23fsoKa2tpX2UVtba9WZDLph4Ha7Oe3UU60kH1rM29vj8XDCCcczfepU/n7rLXi9XrZs2cIJxx/P1Vf9JS6HdktCJL4TZvqJhx5qpQlNpiF5PB7KyzeyctUqy1IixrmiooI1a9eaZvQ2eBYL02dFRQVr1qy1mIH4rby8nPXr16fNcKZFo4wYMcLK9mZn7rIsU19fz0svv5JSKBN1inebmpo556yzeG/qu9xw/fUcfdSRDB0yhE4dO3LAAT0YVVLCKSefzHPP/JsXn3+efv36UV9fn1ZTNgwDt9tDRUUFb7z5Vtz87I7ysEuDzyU3N5cO7dvTrl07igoLGTZ0KFf++Qreev01Ro4cQSiWqjpZPXrMl2bdunXs2FG5RxqqJEkYkoShm3Pw8MP/ZM7cuQRTOHWK+aqpqeH88//IxTGP+VTM3arLMIhEozTH/ElStbu5uZl+/fpx+GGHoWka0WjUymlQXFzMlVdcwTtvvcnJv/sdNbW16IbOfffcTa+ePU1LQpoUyPsL4qhTcXGxlUfZDkmS0A2dQm82dw0+ljy3n+pIE2BqGCJXezLCAKY27JJlK768Khwiy+Xhmn4T+HDCBZzUeRDRmInQJctsDpmDClDkzabAk2We+f83zPSSRNTQ2R5usL7bZ/Xtp+YeQUQURaG6uppPPv0srVOa/wADAAAgAElEQVSdydxN4jrhkPG889abjDt4bKskIfZEIqqq0qlTRx64715u/dvNlsCZzkLg9XpZvmIFX3/zDdA6na392VQjbCe04pxVtFMQggMPHMRNN920q7wkcyaYV21tLVu3bGlFdDI9p45EIqxcuWrXeR+7zMGqquL1ePjTpZfywnPP8rvf/Ja777zD0tozIYqGYTBqVImpJSd5VjD4bdsqWL58ufWe6M/in3/exdzS9iq+DUosfr20tNSqSzD573/4kfr6hqRM0zAMK7fl0Ucdac2v3TwPMOPLL1m9Zm1aD3/R90gkwi0338SD999Hhw4dUq5XXdfN9f3mGxwyflxaJi/a5fV6+fiTT6wkOHukKUu7IgHEuhXjGIlE6N69O/fec09aC4MREygrtm+nvqF+z8zzsfpz83L57vvvee75561cEcmedykKtXV1HHH44dwWC0NreR6eDGLOpQzuDlEU0+Te0NAQlybZvq969ezJE489yu233spfr7uOo486apdFbD+l0y0Rx+C9Xm/CswtZMs+ivbLChb1G896E8/l912GoukZ1pImQZhJhlywjI8V9FMn8Lqrr1ESaaVTD+GUX5/ccxQcTLuD2wZMo8GQR1lTcikLU0Lht8Wec9eOraLFB9MguBuS0I2qov6iJHkzBRDYkVF2nJiI8jf9vavCCac6ePYf16zckzWkvIMsyoaYm+vfvz5OTJ5Obm2ud1btiKS6FOU78KzyLVVXj4osu4rJLLqa+vj6tFi+0289nzNijs7G4dd/Cs1ZRFMtb/5Bx4+jerRuRSCQlsZRlmebmZhobG1sJA37frkx1ySAY64svvcS80vlW6s+Wuek1TWPUqFG88PxzdOjQIc4En8pcLPb76FGj6NGjO80xx71EzwqBZfacuXHvimOIZMcN9v4l6qtwplq6bJn1rmj3/AULCIfDSR33JEkiHI3StWtXhg8bZglAdu1dkiS+/GomqhpNOR6i7fX19fzp0ku59OKLUVXNIur2dSr+FZqlqqrk5uby5BOT6dOnd9pLnQzDTN60eu1a5sydF8eg24qWYybWrPjO5XajaRr9+vZldElJygtdRBmqqlJXl3rfpYNhGPi8XpYtW86tt92eMte9sPQ1NDbSv29fHnn4Iby24wH70VayusR8yykseOJZ0xpVzr33P0BDQ4N15CTmwB6Vc+klF3PF5ZfH+d7YfQv2Z2TkRW//W9U1BuS059lRp/H+IRdwSc8xDMrtgGRAVbiJOrU57lMbNf9t5wtwRPve3DvkOL4+4nIeHfFbBuS0J6yZyf29iotFNVs4/fspPLzia8pDNVRFQuakSRLD87sQ1jRkee+GbmQCQzKz7DXrUfHNL1Z3KqiqSjQazfiTaG4zhbnBzOUyZ95cIpFwyjND6+xTVrj977eSn5cXlxfcsuy0IE4gzk/NM7Krr7qKkpEjMyKYXp+PRQsXUV1dnXGoVqZoSWRcLhdFscQ0qRioIJbNsWxxdq25Z6+eaU2zghht37GDs849h7vuuZfNW7ZYwoZgDMJk3jKBSjrhQbyXl5fHgQceaF62kQS6ruPz+Zg9Z451PCHOyZctX77bBE+PWWCWLltmOQyKY53ly5enPH+XZZlQYyMjDzqIdu3atYqRFueyZQsX4vf7UyYmkWWZpqYmSkpKuOaqq2JlxQsyLderGF+Xy4WqqhQU5HPHbbfFOQsmgni/uamJeaXzrO/29pqFmC9RbJ127NjRighJBrEmt23bGldOWyHW7tRp09iwYYM1/omYuxDy8vLzefKJJ2jXrl1cMpt0bRDP5Ofl0bF9h4SOpfZnRejnO+++y6mnn8FnX3wBMSHefvwEWMJ0y/n/X0DcwZa90XEdkLBM6y7ZNGfohk5JYTdKCrtRG2lic1Mdy+oq2NBYRUiLUh8NU+jNooMvh77BIjpn5dHVn2eFgUV1Dbes4FVcbG2qY/Kq75myvpR6NUyRJ5smLUpZ9WaO6dgfgIPyO5Pl8qDquul1L0voBtZ5eEup0CLGSMRkAivEre2TI4FhWBaFfWmiz8RCIfogco9nArfbHWcGbCshMRc3sWtGF1rm+WTtM889Gzj2mKM5ZPz4uPAWUV6quoT3st/v55KLL+aSyy6zNmYyE7LX42Ht2rVs3mzmck9m0t0dEirGTWx8TdPYuXNn2iQdQJxGJfonSRLDhw61POhTCQnivF1TNSY/+SQffvQREyYcwmknn8KwYcPweHZdTyrqs7c5GcRvghGdcNxxTJs2Pek4C82moqKCFStWMmTIYGRZZvWaNWzZujWhP4Z9vWmalrC/QptdvmIFW7dupVevXgBsLC9nxcqVKS1FgomMO3hsqz6LfpWXm1cwZyVhMOJZIaxcctFFeH3eNkVziLHRNI1DJ0zgsIkT+ezzz1M6oRoxwWbRokUZRma0DXZBTwg9O3bsSOvPINrarl27uHLaCjudcbvdSdeUEIINDB5+8AEGDOgf58Fub1OqukSYX8+ePSmdP5+srKy0Am4gEGDJkiVc9qfLGTp0KL896USOPuooOnXqlLI//ytIe5ucecZlXsCiyDJRXWVTqJYDAoXW77keP7k
iDl9RUrH/tasU/g12wCkb0i/1YElQulMIsGFBpz8dK1ZecEILfHX0f73UdQ64pBxKUFZ9JMGglKmkKoWkD2HUypTAJBrzX3QZWhQKrHRNdhXiz45+wp1h4MJW8QBQfxY5IEEuLJ+KhxtUosjo0l7izqbPrJ69MztSBWIvkTHbwmZYLYEiq3WTqN3b/+HroB3d8GcOpChtMOwiCMKI7eAYrO0Z1b+xXHcd/D4BWL/b5SKkN43dDmZDJhK29DwA0w3JHyh0pWXnsXvHvQP/9+HejvDdjTJsNBW0ezKLdU/UB/WeZlpesXPb7cM1T8Z9nW7d05aeah5K901RlGj/nm8AEK3r288Pu4/BHw/BY7RApzTgvfECMYHr+OIy156oLBhnPtB2EzWBSUr2qoWEB6H5SsDSt8WdcFsGA97qPob3Pj2KbEwYi4IKSiXjl5CE4jWaIiFURUXXFZSACgmIEFBRbJszHzfVLYDOaIKoLGP0EeTagn+Sz7Ujpvj9S5WZbhn4AJnPTiR+Y+sE63PXIpjyiXDTk+2d8v5h2SJcVr79ew6GKz6Q+2Vquj0S/PB2TbHy/S+dWxn6mWogNRz2Go92HMg8kK3c455PhqFu68uPvAaR3GR2J+edMkRCLXlB9wD8NetXVS7K9e4rCCEFEElGZkwezoKwdDgc68FlvF4yCAbJ2Wp+chFUnITARAzrCQbx44pC2q19WUodyWy7CsgSiWEEp10PxmY/KMrzhPlQ7CvDY7Mtxx9Rlan57xbaAUjriuzJOIvoBzP6f7B/7+xedTNqKt9fIkk1/5e/g88FA71T/nc87CYFuCACZymgP+bXANZlCqeLGVmnP1677NOhFR6hXjTyX5YoPyiZFpDKeaz+IiKQktKly5OOKyukIiCElxC1RrLHDsoiucB9yTVb8e/0i/GnBN3BBaR0kWVlaCLpAPKPlBXI4HA6Hk4ykbnKiJOFkOKAIeJL8/D0ZlCp+5i5Tf6IZb6QXfjEEh8mJaIaqfj0Spcg12fDC8Y/wXvcxzCqogExlXDNhHl5o/xj/8LVDgACRShhnL8DK8slYP24mKuyK1SczpmP143A4HA7ni0CCgNcsz9luV3dmPhAyoTALRpTaXNoOub2PaQKy3b/3Lxi6IyHM94zHBKcHsmpxn2fOwe1TlmHzn5/AuXlluKi0HheV1iPf0u+DLxDCLeU5HA6H84Uk6Q6eylSL/Z7Kgj4d+m+bB5MUhgWmEQT0REKodxfikZmXItdsU3yF1XP0BUXVeOfL22A3mjW/SUn1y2SuDWeTpTyHw+FwOKeLBPNMZddsgMdiV4QlzcLClSoLgojcH1M635IDkWaecpCqW32TIKAnGsI5Tg9+PmttbMx49Fv7O00WCIRAVH0W9aFsuXDncDgczheVJAJe2cW7TVbISC+YY1xDoAjlXjGKI4FOzU/SabQgx2BWMtSlqQgLssOi5J0KBTEnvxJPNq1DnasQkizBwAJI0FhfVEqpkppWt2PnhnQcDofDlk2JdgAAAptJREFU+SKT4AdPlXg18FjsIJTAJAhKmFpVOlNQze+cCEqIv6gsISSLiEgSxlhyUOcq0oRrsdWFXLMVYUmEQSCIV/frBbKBEPSKUYhUwvrxM/G9yV9GgcWuBqZJTNKQzt+Tw+FwOJwvMgmR7GQqgwgCapxj0B3yK2p6dXesxJ8X1N24EjtbohQFFjtqnUVYUlyDr1U1YpyjQDnDB1DjGAOPxYFPA50wCv2R5/TGfIQAEUmCT4pgvKMAO2oXYm3luQCUM3WDIMQsLDgcDofD4aQnQcALqtZ+WUkt9jatQ2e4F0d7u9Ed7UNPJISuaB/KbS6YBQM8FidqnAWY4PRgRl45zAalOKoKd4nKyLPYMMVdjJZABwwQ+kPdEAIZFAExjKgso8jqwFU1c7G5eg5KbS4tE12MLz6X7xwOh8PhZERCspm0Km5KIarR4JLBLNiVADUEoizBQAT8vvVDrP2f/4TTaIUMqgbTAexGE6a4S7CgsAaXVUzDOEe+Vo4AbgXP4XA4HM5gSZoulglWUe9yhsQkCxT97nTJBDLLxOOLhnDPwVfQGemDkRAUWZWd/2R3CcY7CmBTM8jpXdxOR5IPDofD4XBGKwkCnhGfwjJtIUPcaYuypMTAT1Iuh8PhcDic7Ekp4IcD/SIhlZOcXqXP1fEcDofD4QwPIyrgORwOh8PhnBmySzTM4XA4HA7ncwEX8BwOh8PhjEK4gOdwOBwOZxTCBTyHw+FwOKMQLuA5HA6HwxmFcAHP4XA4HM4ohAt4DofD4XBGIVzAczgcDoczCuECnsPhcDicUQgX8BwOh8PhjEK4gOdwOBwOZxTCBTyHw+FwOKMQLuA5HA6HwxmFcAHP4XA4HM4ohAt4DofD4XBGIVzAczgcDoczCvl/5UgOH6e06/cAAAAASUVORK5CYII=)" + ] + }, + { + "cell_type": "markdown", + "id": "ed9b966b", + "metadata": {}, + "source": [ + "# 📈 Feature Monitoring\n", + "\n", + "This tutorial aims to help data science teams to implement feature monitoring in their production pipeline. ML-models rely heavily on stable data trends over time to keep making high quality predictions. It is however often cumbersome for data science teams to keep an eye on whether their assumptions hold true. To solve this problem, Hopsworks provides a way to schedule regular jobs on snapshots of data to test those assumptions. The regular collection of aggregated statistics on Feature data is an additional tool for data teams to help keep data quality high in the feature store, building confidence to re-use features across relevant projects. 
+ { + "cell_type": "code", + "execution_count": null, + "id": "0693b884-859b-4280-9c54-28ce253d7745", + "metadata": {}, + "outputs": [], + "source": [ + "fg_watch_time_monitoring_reference_sliding = interactions_fg.create_feature_monitoring(\n", + "    name=\"fg_watch_time_monitoring_reference_sliding\",\n", + "    feature_name=\"watch_time\",\n", + "    cron_expression=\"0 0 * ? * * *\",  # hourly for demo purposes; \"0 0 12 ? * * *\" would run daily at noon\n", + "    description=\"Compute descriptive statistics on the watch_time Feature on a daily basis and compare them to the same statistics computed on the same day of the previous week\",\n", + ").with_detection_window(\n", + "    time_offset=\"1d\",  # fetch data inserted throughout the last day\n", + "    row_percentage=0.2,\n", + ").with_reference_window(\n", + "    time_offset=\"1w1d\",  # fetch data from the start of the same day of the previous week\n", + "    window_length=\"1d\",  # limit the reference window to that single day\n", + "    row_percentage=0.2,\n", + ").compare_on(\n", + "    metric=\"mean\",\n", + "    threshold=0.1,  # allow a 10% difference between the two windows before triggering an alert\n", + ").save()" + ] + },
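The `time_offset`/`window_length` pair can be non-obvious at first. Purely as an illustration of what the configuration above describes (assuming offsets are resolved relative to the moment the monitoring job runs), the two windows can be sketched in plain Python:

```python
from datetime import datetime, timedelta

# Illustration only: approximate the windows described by the configuration above,
# assuming offsets are resolved relative to the job's execution time.
now = datetime.now()

# Detection window: time_offset="1d" and no window_length,
# i.e. everything from the offset up to now
detection_start = now - timedelta(days=1)
detection_end = now

# Reference window: time_offset="1w1d" with window_length="1d",
# i.e. the same day one week before the detection window
reference_start = now - timedelta(weeks=1, days=1)
reference_end = reference_start + timedelta(days=1)

print(f"detection window: {detection_start} -> {detection_end}")
print(f"reference window: {reference_start} -> {reference_end}")
```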
* * *\",\n", + " description=\"Compute and compare descriptive statistics on the watch_time Feature on a daily basis to the same statistics computed in the previous week\",\n", + ").with_detection_window(\n", + " time_offset=\"1d\", # fetch data from inserted throughout the last day\n", + " row_percentage=0.2,\n", + ").with_reference_window(\n", + " time_offset=\"1w1d\", # fetch data from the start of same day of the previous week\n", + " window_length=\"1d\", # limit the reference window to the same day of the previous week\n", + " row_percentage=0.2,\n", + ").compare_on(\n", + " metric=\"mean\",\n", + " threshold=0.1, # allow for a 10% difference between the two windows before triggering an alert\n", + ").save()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53e5cd5a-204a-4432-83d9-bb18cc88b50b", + "metadata": {}, + "outputs": [], + "source": [ + "retrieval_fv = fs.get_feature_view(\"retrieval\", 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5ecd419", + "metadata": {}, + "outputs": [], + "source": [ + "fv_age_monitoring_reference_td = retrieval_fv.create_feature_monitoring(\n", + " name=\"fv_vid_total_watch_time_monitoring_reference_td\",\n", + " feature_name=\"age\",\n", + " cron_expression= \"0 0 * ? * * *\",#\"0 0 12 ? * * *\",\n", + " description=\"Compute and compare descriptive statistics on the age Feature in the last hour of data inserted in the Feature View\",\n", + ").with_detection_window(\n", + " time_offset=\"1h\", # fetch data from the last hour\n", + " row_percentage=0.2,\n", + ").with_reference_training_dataset(\n", + " training_dataset_version=1, # use the training dataset used to train your production model\n", + ").compare_on(\n", + " metric=\"mean\",\n", + " threshold=50,\n", + ").save()" + ] + }, + { + "cell_type": "markdown", + "id": "9e2b659e", + "metadata": {}, + "source": [ + "## 📉 Feature Monitoring results and Hopsworks Interactive Graph\n", + "\n", + "So far we have discussed how the python API allows you to quickly setup monitoring jobs and how to customize them to cater to your project needs. We want to focus now on getting the full values from the data points computed by the monitoring jobs. Each data point is stored in the database Feature Monitoring Result. A typical result has the following structure:\n", + " - Result Metadata, including the time at which the job was executed\n", + " - Detection Statistics like count or num_null_values, as well as metric specific to different data types (e.g mean for numerical value)\n", + " - Reference Statistics, if a reference window was defined\n", + " - Comparison information : data shift detected, difference, etc...\n", + "\n", + "Hopsworks UI is the easiest place to get started with monitoring results. You can select a Feature Group or Feature View and see the results of the monitoring jobs in the [Hopsworsk Interactive Graph](https://docs.hopsworks.ai/latest/user_guides/fs/feature_monitoring/interactive_graph). The results are displayed as a Time-series to visualise trend in the monitoring data.\n", + "\n", + "Additionally, you can use the python API to retrieve the monitoring results and plot them as you see fit." 
+ { + "cell_type": "markdown", + "id": "0a47a6f9", + "metadata": {}, + "source": [ + "## ⚙️ List Feature Monitoring configurations\n", + "\n", + "Finally, you can list the Feature Monitoring configurations created for your Feature Group or Feature View using the Python API." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58483800", + "metadata": {}, + "outputs": [], + "source": [ + "interactions_fg.get_feature_monitoring_configs()" + ] + }, + { + "cell_type": "markdown", + "id": "626aae5e", + "metadata": {}, + "source": [ + "or retrieve a specific configuration by providing its name." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3372c2d8", + "metadata": {}, + "outputs": [], + "source": [ + "interactions_fg.get_feature_monitoring_configs(name=\"fg_watch_time_monitoring_reference_sliding\")" + ] + }, + { + "cell_type": "markdown", + "id": "c13fbc94", + "metadata": {}, + "source": [ + "## Conclusion \n", + "\n", + "Hopsworks simplifies Feature Monitoring by letting you schedule monitoring jobs and store their results. In this notebook, we have shown how to get started with Feature Monitoring in Hopsworks. Starting early allows you to compound knowledge, as every new job provides new data points revealing hidden trends in your evolving data. Once you have a better understanding, you can use the Hopsworks customization options to refine and optimize feature monitoring to match your production context. \n", + "\n", + "The Hopsworks UI allows you to quickly visualise time series and monitor the evolution of your data. You can also use the Python API to retrieve the monitoring results and plot them for more advanced use cases."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a67d4e89-fd68-47d7-aad9-665ab038dc65", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "e1ddeae6eefc765c17da80d38ea59b893ab18c0c0904077a035ef84cfe367f83" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/features/interactions.py b/advanced_tutorials/tiktok_recsys/python/Jupyter/features/interactions.py new file mode 100644 index 00000000..559105f0 --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/features/interactions.py @@ -0,0 +1,213 @@ +from mimesis import Generic +from mimesis.locales import Locale +import random +from datetime import datetime, timedelta +from typing import List, Dict, Any +from streaming import config +import numpy as np + +def generate_interactions(num_interactions: int, users: List[Dict[str, str]], videos: List[Dict[str, str]]) -> List[ + Dict[str, Any]]: + """ + Generate a list of dictionaries, each representing an interaction between a user and a video. + + This function creates interaction data by randomly pairing users with videos and assigning + interaction details like interaction type, watch time, and whether the video was watched till the end. + The likelihood of a video being watched till the end is inversely proportional to its length. + + Args: + num_interactions (int): The number of interactions to generate. + users (List[Dict[str, str]]): A list of dictionaries, where each dictionary contains user data. + videos (List[Dict[str, str]]): A list of dictionaries, where each dictionary contains video data. + + Returns: + List[Dict[str, Any]]: A list of dictionaries, each containing interaction data. 
+ """ + generic = Generic(locale=Locale.EN) + interactions = [] # List to store generated interaction data + + for _ in range(num_interactions): + user = random.choice(users) + video = random.choice(videos) + + # Parse dates from strings + user_registration_date = datetime.strptime(user['registration_date'], config.DATE_TIME_FORMAT) + video_upload_date = datetime.strptime(video['upload_date'], config.DATE_TIME_FORMAT) + + # Determine the earliest possible date for the interaction + earliest_date = max(user_registration_date, video_upload_date) + + # Generate a random date for the interaction + days_since_earliest = (datetime.now() - earliest_date).days + random_days = random.randint(0, days_since_earliest) + interaction_date = earliest_date + timedelta(days=random_days) + + previous_interaction_date = interaction_date - timedelta(days=random.randint(0, random.randint(0, 90))) + + interaction_types = ['like', 'dislike', 'view', 'comment', 'share', 'skip'] + weights = [1.5, 0.2, 3, 0.5, 0.8, 10] + + # Generate watch time and determine if the video was watched till the end + watch_time = random.randint(1, video['video_length']) + + probability_watched_till_end = 1 - (watch_time / video['video_length']) + watched_till_end = random.random() < probability_watched_till_end + + if watched_till_end: + watch_time = video['video_length'] # Adjust watch time to video length if watched till the end + + # Constructing the interaction dictionary + interaction = { + 'interaction_id': generic.person.identifier(mask='####-##-####'), + 'user_id': user['user_id'], + 'video_id': video['video_id'], + 'category_id': video['category_id'], + 'interaction_type': random.choices(interaction_types, weights=weights, k=1)[0], + 'watch_time': watch_time, + 'interaction_date': interaction_date.strftime(config.DATE_TIME_FORMAT), + 'previous_interaction_date': previous_interaction_date.strftime(config.DATE_TIME_FORMAT), + 'interaction_month': interaction_date.strftime(config.MONTH_FORMAT), + } + + interactions.append(interaction) # Add the interaction to the list + + return interactions + + +def generate_user_interactions_window_agg(num_interactions: int, users: List[Dict[str, str]], + videos: List[Dict[str, str]]) -> List[Dict[str, Any]]: + """ + Generate a list of dictionaries, each representing an interaction between a user and a video. + + This function creates interaction data by randomly pairing users with videos and assigning + interaction details like interaction type, watch time, and whether the video was watched till the end. + The likelihood of a video being watched till the end is inversely proportional to its length. + + Args: + num_interactions (int): The number of interactions to generate. + users (List[Dict[str, str]]): A list of dictionaries, where each dictionary contains user data. + videos (List[Dict[str, str]]): A list of dictionaries, where each dictionary contains video data. + + Returns: + List[Dict[str, Any]]: A list of dictionaries, each containing interaction data. 
+ """ + generic = Generic(locale=Locale.EN) + interactions = [] # List to store generated interaction data + + for _ in range(num_interactions): + user = random.choice(users) + video = random.choice(videos) + + # Parse dates from strings + user_registration_date = datetime.strptime(user['registration_date'], config.DATE_TIME_FORMAT) + video_upload_date = datetime.strptime(video['upload_date'], config.DATE_TIME_FORMAT) + + # Determine the earliest possible date for the interaction + earliest_date = max(user_registration_date, video_upload_date) + + # Generate interaction + interaction_types = ['like', 'dislike', 'view', 'comment', 'share', 'skip'] + weights = [1.5, 0.2, 3, 0.5, 0.8, 10] + + # Constructing the interaction dictionary + interaction_date = video_upload_date + timedelta(hours=random.randint(0, 100)) + interaction = { + 'user_id': user['user_id'], + 'category_id': video['category_id'], + + 'window_end_time': interaction_date.strftime(config.DATE_TIME_FORMAT), + 'interaction_month': interaction_date.strftime(config.MONTH_FORMAT), + + "like_count": random.randint(0, 100), + "dislike_count": random.randint(0, 100), + "view_count": random.randint(0, 100), + "comment_count": random.randint(0, 100), + "share_count": random.randint(0, 100), + "skip_count": random.randint(0, 100), + "total_watch_time": random.randint(0, 100), + } + + interactions.append(interaction) # Add the interaction to the list + + return interactions + + +def generate_video_interactions_window_agg(num_interactions: int, users: List[Dict[str, str]], + videos: List[Dict[str, str]]) -> List[Dict[str, Any]]: + """ + Generate a list of dictionaries, each representing an interaction between a user and a video. + + This function creates interaction data by randomly pairing users with videos and assigning + interaction details like interaction type, watch time, and whether the video was watched till the end. + The likelihood of a video being watched till the end is inversely proportional to its length. + + Args: + num_interactions (int): The number of interactions to generate. + users (List[Dict[str, str]]): A list of dictionaries, where each dictionary contains user data. + videos (List[Dict[str, str]]): A list of dictionaries, where each dictionary contains video data. + + Returns: + List[Dict[str, Any]]: A list of dictionaries, each containing interaction data. 
+ """ + generic = Generic(locale=Locale.EN) + interactions = [] # List to store generated interaction data + + for _ in range(num_interactions): + user = random.choice(users) + video = random.choice(videos) + + # Parse dates from strings + user_registration_date = datetime.strptime(user['registration_date'], config.DATE_TIME_FORMAT) + video_upload_date = datetime.strptime(video['upload_date'], config.DATE_TIME_FORMAT) + + # Determine the earliest possible date for the interaction + earliest_date = max(user_registration_date, video_upload_date) + + # Generate interaction + interaction_types = ['like', 'dislike', 'view', 'comment', 'share', 'skip'] + weights = [1.5, 0.2, 3, 0.5, 0.8, 10] + + # Constructing the interaction dictionary + interaction_date = video_upload_date + timedelta(hours=random.randint(0, 100)) + interaction = { + 'video_id': video['video_id'], + 'category_id': video['category_id'], + + 'window_end_time': interaction_date.strftime(config.DATE_TIME_FORMAT), + 'interaction_month': interaction_date.strftime(config.MONTH_FORMAT), + + "like_count": random.randint(0, 100), + "dislike_count": random.randint(0, 100), + "view_count": random.randint(0, 100), + "comment_count": random.randint(0, 100), + "share_count": random.randint(0, 100), + "skip_count": random.randint(0, 100), + "total_watch_time": random.randint(0, 100), + + } + + interactions.append(interaction) # Add the interaction to the list + + return interactions + +# Calculate ondemand features the sine and cosine of the month of interaction date +def month_sine(interaction_date): + # Calculate a coefficient for adjusting the periodicity of the month + coef = np.random.uniform(0, 2 * np.pi) / 12 + + #month_of_purchase = datetime.strptime(transaction_date, "%Y-%m-%dT%H:%M:%S").month + month_of_interaction = interaction_date.month + + # Calculate the sine and cosine components for the month_of_purchase + return float(np.sin(month_of_interaction * coef)) + +def month_cosine(interaction_date): + # Calculate a coefficient for adjusting the periodicity of the month + coef = np.random.uniform(0, 2 * np.pi) / 12 + + #month_of_purchase = datetime.strptime(transaction_date, "%Y-%m-%dT%H:%M:%S").month + month_of_interaction = interaction_date.month + + # Calculate the sine and cosine components for the month_of_purchase + return float(np.cos(month_of_interaction * coef)) + diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/features/users.py b/advanced_tutorials/tiktok_recsys/python/Jupyter/features/users.py new file mode 100644 index 00000000..a4c4c8cc --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/features/users.py @@ -0,0 +1,42 @@ +from mimesis import Generic +from mimesis.locales import Locale +import random +from datetime import datetime, timedelta +from typing import List, Dict +from streaming import config + +def generate_users(num_users: int, historical=False) -> List[Dict[str, str]]: + """ + Generate a list of dictionaries, each representing a user with various attributes. + + The function creates fake user data including user ID, gender, age, and country + using the mimesis library. The user ID is generated based on a specified mask. + + Args: + num_users (int): The number of user profiles to generate. + + Returns: + List[Dict[str, str]]: A list of dictionaries, each containing details of a user. 
+ """ + generic = Generic(locale=Locale.EN) + users = [] # List to store generated user data + + for _ in range(num_users): + if historical: + days_ago = random.randint(0, 730) # Choose a random number of days up to two years + registration_date = datetime.now() - timedelta(days=days_ago) # Compute the date of registration + else: + registration_date = datetime.now() + + # Generate each user's details + user = { + 'user_id': generic.person.identifier(mask='@@###@'), # Unique user identifier + 'gender': generic.person.gender(), # Randomly generated gender + 'age': random.randint(12, 90), # Randomly generated age between 12 and 90 + 'country': generic.address.country(), # Randomly generated country name + 'registration_date': registration_date.strftime(config.DATE_TIME_FORMAT), + 'registration_month': registration_date.strftime(config.MONTH_FORMAT), + } + users.append(user) # Add the user to the list + + return users diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/features/videos.py b/advanced_tutorials/tiktok_recsys/python/Jupyter/features/videos.py new file mode 100644 index 00000000..867aa4c5 --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/features/videos.py @@ -0,0 +1,52 @@ +from mimesis import Generic +from mimesis.locales import Locale +import random +from datetime import datetime, timedelta +from typing import List, Dict, Any +from streaming import config + +def generate_video_content(num_videos: int, historical=False) -> List[Dict[str, str]]: + """ + Generate a list of dictionaries, each representing video content with various attributes. + + Each video includes details such as a unique video ID, category, + video length in seconds, and the upload date. The function uses the mimesis library + for generating random data and Python's random module for numerical attributes. + + Args: + num_videos (int): The number of video entries to generate. + + Returns: + List[Dict[str, str]]: A list of dictionaries, each containing details of a video. 
+ """ + generic = Generic(locale=Locale.EN) + videos = [] # List to store generated video data + + for _ in range(num_videos): + if historical: + days_ago = random.randint(0, 730) # Choose a random number of days up to two years + upload_date = datetime.now() - timedelta(days=days_ago) # Compute the upload date + + else: + upload_date = datetime.now() + + categories = ['Education', 'Entertainment', 'Lifestyle', 'Music', 'News', 'Sports', 'Technology', 'Dance', 'Cooking', 'Comedy', 'Travel'] + categories_dict = {'Education': 1, 'Entertainment': 2, 'Lifestyle': 3, 'Music': 4, 'News': 5, 'Sports': 6, 'Technology': 7, 'Dance': 8, 'Cooking': 9, 'Comedy': 10, 'Travel': 11} + + video_length_seconds = random.randint(10, 250) # Video length in seconds + video_category = random.choice(categories) + + video = { + 'video_id': generic.person.identifier(mask='#@@##@'), # Unique video identifier + 'category_id': categories_dict[video_category], + 'category': video_category, + 'video_length': video_length_seconds, + 'upload_date': upload_date.strftime(config.DATE_TIME_FORMAT), + 'upload_month': upload_date.strftime(config.MONTH_FORMAT) + } + + videos.append(video) # Add the video to the list + + return videos + + diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/1_kafka_topic.py b/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/1_kafka_topic.py new file mode 100644 index 00000000..960415a5 --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/1_kafka_topic.py @@ -0,0 +1,77 @@ +import config +import hopsworks + +# Login to Hopsworks project +project = hopsworks.login() + +# Access Kafka API +kafka_api = project.get_kafka_api() + +# Define the schema for Kafka messages +schema = { + "type": "record", + "name": config.SCHEMA_NAME, + "namespace": "ai.hopsworks.examples.bytewax.interactions", + "fields": [ + { + "name": "interaction_id", + "type": [ + "null", + "string" + ] + }, + { + "name": "user_id", + "type": [ + "null", + "string" + ] + }, + { + "name": "video_id", + "type": [ + "null", + "string" + ] + }, + { + "name": "interaction_type", + "type": [ + "null", + "string" + ] + }, + { + "name": "watch_time", + "type": [ + "null", + "long" + ] + }, + { + "name": "interaction_date", + "type": [ + "null", + { + "type": "long", + "logicalType": "timestamp-micros" + } + ] + } + ] +} + +# Create schema in Hopsworks +kafka_api.create_schema( + config.SCHEMA_NAME, + schema, +) + +# Create Kafka topic +kafka_api.create_topic( + config.KAFKA_TOPIC_NAME, + config.SCHEMA_NAME, + 1, + partitions=1, + replicas=1, +) diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/2_simulation.py b/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/2_simulation.py new file mode 100644 index 00000000..6b54bbc4 --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/2_simulation.py @@ -0,0 +1,57 @@ +import json +import sys +sys.path.insert(1, '../') + +import pandas as pd +from tqdm import tqdm +import hopsworks +from confluent_kafka import Producer + +import config +from utils.hsfs_bytewax import get_kafka_config +from features.users import generate_users +from features.videos import generate_video_content +from features.interactions import generate_interactions + +def simulate_interactions(): + # Generate data for users + user_data = generate_users(config.USERS_AMOUNT_PIPELINE) + data_users_df = pd.DataFrame(user_data) + + # Generate data for videos + video_data = generate_video_content(config.VIDEO_AMOUNT_PIPELINE) + data_video_df 
= pd.DataFrame(video_data) + + # Generate interactions + interactions = generate_interactions( + config.INTERACTIONS_AMOUNT_PIPELINE, + user_data, + video_data, + ) + data_interactions_df = pd.DataFrame(interactions) + + data_interactions_df['json'] = data_interactions_df.apply(lambda x: x.to_json(), axis=1) + + return [json.loads(i) for i in data_interactions_df.json.values] + + +# Connect to Hopsworks +project = hopsworks.login() +fs = project.get_feature_store() + +kafka_api = project.get_kafka_api() +kafka_config = get_kafka_config(fs.id) + +print(kafka_config) +producer = Producer(kafka_config) + +# Simulate interactions +interactions_data = simulate_interactions() + +# Send to source topic +for interaction in tqdm(interactions_data, desc="Sending messages"): + producer.produce( + config.KAFKA_TOPIC_NAME, + json.dumps(interaction) + ) + producer.flush() diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/3_streaming_pipeline.py b/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/3_streaming_pipeline.py new file mode 100644 index 00000000..ebe7df7f --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/3_streaming_pipeline.py @@ -0,0 +1,106 @@ +import json +from datetime import datetime, timedelta, timezone +import statistics + +from bytewax.dataflow import Dataflow +from bytewax import operators as op +from bytewax.operators.window import EventClockConfig, TumblingWindow +from bytewax.connectors.kafka import operators as kop +import bytewax.operators.window as win +import hopsworks + +from utils.hsfs_bytewax import get_kafka_config, serialize_with_key, sink_kafka +import config + +def parse_value(msg): + """Parse the JSON payload from a Kafka message into a Python dictionary.""" + return json.loads(msg.value.decode('utf-8')) + +def get_event_time(event): + """Retrieve and convert the event's datetime from the input to a timezone-aware datetime object.""" + return datetime.fromisoformat(event["interaction_date"]).replace(tzinfo=timezone.utc) + +def accumulate(acc, event): + """Accumulate watch times for each event to compute mean later.""" + acc.append(event["watch_time"]) + return acc + +def format_event(event): + """Calculate and format the aggregated results for output.""" + key, (metadata, data) = event + mean_watch_time = statistics.mean(data) if data else 0 + return { + "video_id": key, + "week_start": metadata.start.isoformat(), + "mean_watch_time": mean_watch_time, + "interaction_count": len(data) + } + +def setup_dataflow(feature_group_name, feature_group_version, hopsworks_host, hopsworks_project, hopsworks_api_key): + """Configure and return a Bytewax dataflow for aggregating video interaction data.""" + # Connect to hopsworks + project = hopsworks.login( + host=hopsworks_host, + project=hopsworks_project, + api_key_value=hopsworks_api_key + ) + fs = project.get_feature_store() + + # Get feature group and its topic configuration + feature_group = fs.get_feature_group(feature_group_name, feature_group_version) + + flow = Dataflow("video_interaction_aggregation") + + # Setup Kafka source + kafka_config = get_kafka_config(feature_store_id=fs.id) + stream = kop.input( + "kafka_in", + flow, + brokers=[kafka_config['bootstrap.servers']], + topics=[config.KAFKA_TOPIC_NAME], + ) + + # Parse messages from Kafka + parsed_stream = op.map("parse_value", stream.oks, parse_value) + keyed_stream = op.key_on("key_on_video", parsed_stream, lambda e: e["video_id"]) + + # Configure weekly windows + clock = EventClockConfig( + get_event_time, + 
wait_for_system_duration=timedelta(seconds=10), + ) + week_window = TumblingWindow( + length=timedelta(days=7), + offset=timedelta(days=-datetime.utcnow().weekday()), + ) + + # Window aggregation for mean watch time + windowed_stream = win.fold_window( + "aggregate_watch_time", + keyed_stream, + clock, + week_window, + list, + accumulate, + ) + formatted_stream = op.map( + "format_event", + windowed_stream, + format_event, + ) + + # Output the formatted stream to another Kafka topic + kop.output( + "kafka_out", + formatted_stream, + brokers=[kafka_config['bootstrap.servers']], + topic=feature_group._online_topic_name, + add_config=kafka_config, + ) + + return flow + +# This module is normally launched via `python -m bytewax.run`, which calls +# setup_dataflow() with explicit arguments (see test.ipynb). When executed +# directly, read the required parameters from environment variables instead. +if __name__ == "__main__": +    import os +    from bytewax.testing import run_main + +    flow = setup_dataflow( +        os.environ["FEATURE_GROUP_NAME"], +        int(os.environ["FEATURE_GROUP_VERSION"]), +        os.environ["HOPSWORKS_HOST"], +        os.environ["HOPSWORKS_PROJECT_NAME"], +        os.environ["HOPSWORKS_API_KEY"], +    ) +    run_main(flow)  # run the dataflow on a single worker in this process diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/config.py b/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/config.py new file mode 100644 index 00000000..24ebcbc4 --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/config.py @@ -0,0 +1,14 @@ +USERS_AMOUNT_HISTORICAL = 100_000 +VIDEO_AMOUNT_HISTORICAL = 100_000 +INTERACTIONS_AMOUNT_HISTORICAL = 2_000_000 + +USERS_AMOUNT_PIPELINE = 1_000 +VIDEO_AMOUNT_PIPELINE = 1_000 +INTERACTIONS_AMOUNT_PIPELINE = 10_000 + +DATE_TIME_FORMAT = '%Y-%m-%d %H:%M:%S' +DAY_FORMAT = '%Y-%m-%d' +MONTH_FORMAT = '%Y-%m' + +KAFKA_TOPIC_NAME = "interactions_streaming_test_trial1" +SCHEMA_NAME = "interactions_streaming_test_trial_schema1" diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/test.ipynb b/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/test.ipynb new file mode 100644 index 00000000..eb79b241 --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/test.ipynb @@ -0,0 +1,256 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "155601e7", + "metadata": {}, + "source": [ + "---\n", + "export HOPSWORKS_HOST=60342400-fd68-11ee-a374-5db5bf1f1917.cloud.hopsworks.ai\n", + "\n", + "export HOPSWORKS_PROJECT_NAME=Bytewax_pipeline\n", + "\n", + "export HOPSWORKS_API_KEY=NJ9njtdzAdDmoAdL.VcPV9NacRISOXnsRVOglr8fQM9HhsiujewZJDzOzeBHlPyTjNV7Z73tYL7BxNGXJ\n", + "\n", + "export FEATURE_GROUP_NAME=interactions\n", + "\n", + "export FEATURE_GROUP_VERSION=1\n", + "\n", + "---\n", + "python 1_kafka_topic.py\n", + "\n", + "python 2_simulation.py\n", + "\n", + "python -m bytewax.run \"3_streaming_pipeline:setup_dataflow('$FEATURE_GROUP_NAME', $FEATURE_GROUP_VERSION, '$HOPSWORKS_HOST', '$HOPSWORKS_PROJECT_NAME', '$HOPSWORKS_API_KEY')\" \n", + "\n", + "RUST_BACKTRACE=1 python -m bytewax.run \"3_streaming_pipeline:setup_dataflow('$FEATURE_GROUP_NAME', $FEATURE_GROUP_VERSION, '$HOPSWORKS_HOST', '$HOPSWORKS_PROJECT_NAME', '$HOPSWORKS_API_KEY')\"\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "48340d75", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'interaction_id': '1422-91-0556',\n", + " 'user_id': 'OU595D',\n", + " 'video_id': '2ZU94X',\n", + " 'interaction_type': 'skip',\n", + " 'watch_time': 87,\n", + " 'interaction_date': '2024-04-18 14:20:59'},\n", + " {'interaction_id': '1892-80-4966',\n", + " 'user_id': 'GC019W',\n", + " 'video_id': '4WO41I',\n", + " 'interaction_type': 'skip',\n", + " 'watch_time': 122,\n", + " 'interaction_date': '2024-04-18 14:20:59'},\n", + " {'interaction_id': '3044-36-7740',\n", + " 'user_id': 'IG087J',\n", + " 'video_id': '1KS47H',\n", + " 'interaction_type': 'view',\n", + " 'watch_time': 20,\n", + "
'interaction_date': '2024-04-18 14:20:59'}]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import json\n", + "import sys\n", + "sys.path.insert(1, '../')\n", + "\n", + "import pandas as pd\n", + "from tqdm import tqdm\n", + "import hopsworks\n", + "from confluent_kafka import Producer\n", + "\n", + "import config\n", + "from utils.hsfs_bytewax import get_kafka_config\n", + "from features.users import generate_users\n", + "from features.videos import generate_video_content\n", + "from features.interactions import generate_interactions\n", + "\n", + "def simulate_interactions():\n", + " # Generate data for users\n", + " user_data = generate_users(config.USERS_AMOUNT_PIPELINE)\n", + " data_users_df = pd.DataFrame(user_data)\n", + "\n", + " # Generate data for videos\n", + " video_data = generate_video_content(config.VIDEO_AMOUNT_PIPELINE)\n", + " data_video_df = pd.DataFrame(video_data)\n", + "\n", + " # Generate interactions\n", + " interactions = generate_interactions(\n", + " config.INTERACTIONS_AMOUNT_PIPELINE, \n", + " user_data, \n", + " video_data,\n", + " )\n", + " data_interactions_df = pd.DataFrame(interactions)\n", + " \n", + " data_interactions_df['json'] = data_interactions_df.apply(lambda x: x.to_json(), axis=1)\n", + " \n", + " return [json.loads(i) for i in data_interactions_df.json.values]\n", + "\n", + "data = simulate_interactions()\n", + "data[:3]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3538d673", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connection closed.\n", + "Connected. Call `.close()` to terminate connection gracefully.\n", + "\n", + "Logged in to project, explore it here https://60342400-fd68-11ee-a374-5db5bf1f1917.cloud.hopsworks.ai/p/119\n", + "Connected. 
Call `.close()` to terminate connection gracefully.\n", + "{'bootstrap.servers': '172.16.4.25:9091', 'security.protocol': 'SSL', 'ssl.endpoint.identification.algorithm': 'none', 'ssl.ca.location': '/tmp/kafka_sc_119_-1_ca_chain.pem', 'ssl.certificate.location': '/tmp/kafka_sc_119_-1_client_cert.pem', 'ssl.key.location': '/tmp/kafka_sc_119_-1_client_key.pem'}\n" + ] + } + ], + "source": [ + "# Connect to Hopsworks\n", + "project = hopsworks.login()\n", + "fs = project.get_feature_store()\n", + "\n", + "kafka_api = project.get_kafka_api()\n", + "kafka_config = get_kafka_config(fs.id)\n", + "\n", + "print(kafka_config)\n", + "producer = Producer(kafka_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f133a7bb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'interaction_id': '9812-78-8238',\n", + " 'user_id': 'RR951P',\n", + " 'video_id': '7YP51D',\n", + " 'interaction_type': 'skip',\n", + " 'watch_time': 157,\n", + " 'interaction_date': '2024-04-18 11:33:30'}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Simulate interactions\n", + "interactions_data = simulate_interactions()\n", + "interactions_data[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "d64e1222", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sending messages: 30%|███ | 3/10 [00:00<00:00, 21.78it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'interaction_id': '9812-78-8238', 'user_id': 'RR951P', 'video_id': '7YP51D', 'interaction_type': 'skip', 'watch_time': 157, 'interaction_date': '2024-04-18 11:33:30'}\n", + "{'interaction_id': '2438-26-4753', 'user_id': 'ZC342Y', 'video_id': '0HP06X', 'interaction_type': 'view', 'watch_time': 98, 'interaction_date': '2024-04-18 11:33:30'}\n", + "{'interaction_id': '6795-70-8245', 'user_id': 'AL852G', 'video_id': '0TD02F', 'interaction_type': 'skip', 'watch_time': 141, 'interaction_date': '2024-04-18 11:33:30'}\n", + "{'interaction_id': '1930-44-8804', 'user_id': 'IA528X', 'video_id': '8KA77T', 'interaction_type': 'like', 'watch_time': 22, 'interaction_date': '2024-04-18 11:33:30'}\n", + "{'interaction_id': '1584-76-4537', 'user_id': 'VZ873S', 'video_id': '6MY59E', 'interaction_type': 'skip', 'watch_time': 74, 'interaction_date': '2024-04-18 11:33:30'}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sending messages: 90%|█████████ | 9/10 [00:00<00:00, 21.51it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'interaction_id': '8360-83-9259', 'user_id': 'LF197M', 'video_id': '8GS13E', 'interaction_type': 'skip', 'watch_time': 85, 'interaction_date': '2024-04-18 11:33:30'}\n", + "{'interaction_id': '8936-72-7250', 'user_id': 'CW504L', 'video_id': '0KS93V', 'interaction_type': 'skip', 'watch_time': 40, 'interaction_date': '2024-04-18 11:33:30'}\n", + "{'interaction_id': '7234-31-3381', 'user_id': 'OP675X', 'video_id': '9YA05J', 'interaction_type': 'skip', 'watch_time': 35, 'interaction_date': '2024-04-18 11:33:30'}\n", + "{'interaction_id': '4106-34-5958', 'user_id': 'RF538A', 'video_id': '9HW87T', 'interaction_type': 'skip', 'watch_time': 43, 'interaction_date': '2024-04-18 11:33:30'}\n", + "{'interaction_id': '1759-83-8126', 'user_id': 'SG073W', 'video_id': '5FH42R', 'interaction_type': 'like', 'watch_time': 64, 'interaction_date': '2024-04-18 11:33:30'}\n" + ] + }, + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "Sending messages: 100%|██████████| 10/10 [00:00<00:00, 21.66it/s]\n" + ] + } + ], + "source": [ + "# Send to source topic\n", + "for interaction in tqdm(interactions_data[:10], desc=\"Sending messages\"):\n", + " print(interaction)\n", + " producer.produce(\n", + " config.KAFKA_TOPIC_NAME,\n", + " json.dumps(interaction),\n", + " )\n", + " producer.flush()" + ] + }, + { + "cell_type": "markdown", + "id": "29fff3cd", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/utils/hsfs_bytewax.py b/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/utils/hsfs_bytewax.py new file mode 100644 index 00000000..822d8805 --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/Jupyter/streaming/utils/hsfs_bytewax.py @@ -0,0 +1,60 @@ +from io import BytesIO +from hsfs import engine +from bytewax.connectors.kafka import KafkaSinkMessage + +def _get_feature_group_config(feature_group): + """ + fetches configuration for feature group online topic + :param feature_group: + :return: + """ + + if feature_group._kafka_producer is None: + offline_write_options = {} # {'internal_kafka': True} + producer, feature_writers, writer = engine.get_instance()._init_kafka_resources( + feature_group, offline_write_options + ) + feature_group._kafka_producer = producer + feature_group._feature_writers = feature_writers + feature_group._writer = writer + + return feature_group + + +def serialize_with_key(key_payload, feature_group): + key, row = key_payload + + feature_group = _get_feature_group_config(feature_group) + + # encode complex features + row = engine.get_instance()._encode_complex_features(feature_group._feature_writers, row) + + # encode feature row + with BytesIO() as outf: + feature_group._writer(row, outf) + encoded_row = outf.getvalue() + + # assemble key + key = "".join([str(row[pk]) for pk in sorted(feature_group.primary_key)]) + + return key, encoded_row + + +def sink_kafka(key, value, feature_group): # -> KafkaSinkMessage[Dict, Dict]: + + # encode complex features + headers = [ + ("projectId", str(feature_group.feature_store.project_id).encode("utf8")), + ("featureGroupId", str(feature_group._id).encode("utf8")), + ("subjectId", str(feature_group.subject["id"]).encode("utf8")) + ] + + return KafkaSinkMessage( + headers=headers, # List[Tuple[str, bytes]] = field(default_factory=list) + key=str({"identifier": key, "name": feature_group._online_topic_name}).encode('utf-8'), + value=value, + ) + + +def get_kafka_config(feature_store_id): + return engine.get_instance()._get_kafka_config(feature_store_id) diff --git a/advanced_tutorials/tiktok_recsys/python/requirements.txt b/advanced_tutorials/tiktok_recsys/python/requirements.txt new file mode 100644 index 00000000..4ebca888 --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/requirements.txt @@ -0,0 +1,6 @@ +mimesis==15.1.0 +tensorflow==2.13 +tensorflow-recommenders==0.7.2 +catboost==1.1.1 +hopsworks==3.7.6 +mimesis==15.1.0 diff --git a/advanced_tutorials/tiktok_recsys/python/setup/interactions_topic.py 
diff --git a/advanced_tutorials/tiktok_recsys/python/setup/interactions_topic.py b/advanced_tutorials/tiktok_recsys/python/setup/interactions_topic.py
new file mode 100644
index 00000000..a00910fc
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/python/setup/interactions_topic.py
@@ -0,0 +1,80 @@
+import hopsworks
+
+project = hopsworks.login()
+
+# Create the Kafka topic and register its Avro schema
+KAFKA_TOPIC_NAME = "live_interactions"
+SCHEMA_NAME = "live_interactions_schema"
+
+kafka_api = project.get_kafka_api()
+job_api = project.get_jobs_api()
+
+schema = {
+    "type": "record",
+    "name": SCHEMA_NAME,
+    "namespace": "io.hops.examples.flink.examples",
+    "fields": [
+        {
+            "name": "interaction_id",
+            "type": [
+                "null",
+                "long"
+            ]
+        },
+        {
+            "name": "user_id",
+            "type": [
+                "null",
+                "long"
+            ]
+        },
+        {
+            "name": "video_id",
+            "type": [
+                "null",
+                "string"
+            ]
+        },
+        {
+            "name": "category_id",
+            "type": [
+                "null",
+                "long"
+            ]
+        },
+        {
+            "name": "interaction_type",
+            "type": [
+                "null",
+                "string"
+            ]
+        },
+        {
+            "name": "watch_time",
+            "type": [
+                "null",
+                "long"
+            ]
+        },
+        {
+            "name": "interaction_date",
+            "type": [
+                "null",
+                {
+                    "type": "long",
+                    "logicalType": "timestamp-micros"
+                }
+            ]
+        },
+        {
+            "name": "interaction_month",
+            "type": [
+                "null",
+                "string"
+            ]
+        }
+    ]
+}
+
+kafka_api.create_schema(SCHEMA_NAME, schema)
+kafka_api.create_topic(KAFKA_TOPIC_NAME, SCHEMA_NAME, 1, replicas=1, partitions=20)
diff --git a/advanced_tutorials/tiktok_recsys/python/setup/tiktok_interactions_feature_group.py b/advanced_tutorials/tiktok_recsys/python/setup/tiktok_interactions_feature_group.py
new file mode 100644
index 00000000..ed02e994
--- /dev/null
+++ b/advanced_tutorials/tiktok_recsys/python/setup/tiktok_interactions_feature_group.py
@@ -0,0 +1,46 @@
+# Set up the feature groups for the Flink pipelines
+import hopsworks
+from hsfs.feature import Feature
+from datetime import datetime, timedelta, timezone
+
+project = hopsworks.login()
+fs = project.get_feature_store()
+
+features = [
+    Feature(name="interaction_month", type="string"),
+    Feature(name="id", type="bigint"),
+    Feature(name="user_id", type="bigint"),
+    Feature(name="video_id", type="bigint"),
+    Feature(name="category_id", type="bigint"),
+    Feature(name="interaction_type", type="string"),
+    Feature(name="watch_time", type="bigint"),
+    Feature(name="interaction_date", type="timestamp"),
+]
+
+interactions_fg = fs.get_or_create_feature_group(
+    name="interactions",
+    description="Interactions data.",
+    version=1,
+    primary_key=["id"],
+    partition_key=["interaction_month"],
+    online_enabled=True,
+    event_time="interaction_date"
+)
+
+interactions_fg.save(features)
+
+feature_descriptions = [
+    {"name": "id", "description": "Unique id for the interaction."},
+    {"name": "user_id", "description": "Unique identifier for each user."},
+    {"name": "video_id", "description": "Identifier for the video."},
+    {"name": "category_id", "description": "Id of the video category."},
+    {"name": "interaction_type", "description": "Type of interaction (e.g. like, view, skip)."},
+    {"name": "watch_time", "description": "Time in seconds the user watched the video."},
+    {"name": "interaction_date", "description": "Date of the interaction."},
+    {"name": "interaction_month", "description": "Month of interaction, derived from interaction_date."}
+]
+
+for desc in feature_descriptions:
+    interactions_fg.update_feature_description(desc["name"], desc["description"])
+
+# interactions_fg.materialization_job.schedule(cron_expression="0 */15 * ? * *", start_time=datetime.now(tz=timezone.utc))
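The setup script above only registers schema and metadata. As a quick sanity check that the declared feature types line up before wiring in the streaming job, one could insert a single hand-made row through the regular batch API. This smoke test is an illustrative assumption, not part of the tutorial scripts, and all values are made up.

# Hypothetical smoke test for the interactions feature group defined above.
import pandas as pd
import hopsworks
from datetime import datetime, timezone

project = hopsworks.login()
fs = project.get_feature_store()
interactions_fg = fs.get_feature_group("interactions", version=1)

sample = pd.DataFrame([{
    "interaction_month": "2024-04",                  # partition key (string)
    "id": 1,                                         # primary key (bigint)
    "user_id": 42,
    "video_id": 7,
    "category_id": 3,
    "interaction_type": "like",
    "watch_time": 55,                                # seconds
    "interaction_date": datetime.now(timezone.utc),  # event time (timestamp)
}])

interactions_fg.insert(sample)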
* *", start_time=datetime.now(tz=timezone.utc)) diff --git a/advanced_tutorials/tiktok_recsys/python/setup/tiktok_user_window_agg_feature_group.py b/advanced_tutorials/tiktok_recsys/python/setup/tiktok_user_window_agg_feature_group.py new file mode 100644 index 00000000..faa424ad --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/setup/tiktok_user_window_agg_feature_group.py @@ -0,0 +1,64 @@ +import hopsworks + +from hsfs.feature import Feature +from datetime import datetime, timedelta, timezone + +project = hopsworks.login() +fs = project.get_feature_store() + +features = [ + Feature(name="user_id", type="bigint"), + Feature(name="category_id", type="bigint"), + + Feature(name="like_count", type="bigint"), + Feature(name="dislike_count", type="bigint"), + Feature(name="view_count", type="bigint"), + Feature(name="comment_count", type="bigint"), + Feature(name="share_count", type="bigint"), + Feature(name="skip_count", type="bigint"), + Feature(name="total_watch_time", type="bigint"), + + Feature(name="interaction_month", type="string"), + Feature(name="window_end_time", type="timestamp"), +] + +user_window_agg_1h_fg = fs.create_feature_group( + "user_window_agg_1h", + version=1, + statistics_config=False, + primary_key=["user_id"], + partition_key=["interaction_month"], + event_time="window_end_time", + online_enabled=True, + stream=True, +) + +user_window_agg_1h_fg.save(features) + + +feature_descriptions = [ + {"name": "user_id", "description": "Unique identifier for each user."}, + {"name": "category_id", "description": "Id of the video category."}, + {"name": "window_end_time", "description": "End of the specified time window where interaction were aggregated."}, + {"name": "interaction_month", + "description": "Month of the end of the specified time window where interaction were aggregated. Derived from window_end_time"}, + {"name": "like_count", + "description": "Number of likes video category got from the user during a specified time window."}, + {"name": "dislike_count", + "description": "Number of dislikes video category got from the user during a specified time window."}, + {"name": "view_count", + "description": "Number of views over video category got from the user during a specified time window."}, + {"name": "comment_count", + "description": "Number of comments video category got from the user during a specified time window."}, + {"name": "share_count", + "description": "Number of likes over video category got from the user during a specified time window."}, + {"name": "skip_count", + "description": "Number of times video category was skiped by the user during a specified time window."}, + {"name": "total_watch_time", + "description": "Total time in seconds video category was watched by the user during a specified time window."}, +] + +for desc in feature_descriptions: + user_window_agg_1h_fg.update_feature_description(desc["name"], desc["description"]) + +# user_window_agg_1h_fg.materialization_job.schedule(cron_expression="0 */15 * ? 
* *", start_time=datetime.now(tz=timezone.utc)) diff --git a/advanced_tutorials/tiktok_recsys/python/setup/tiktok_video_window_agg_feature_group.py b/advanced_tutorials/tiktok_recsys/python/setup/tiktok_video_window_agg_feature_group.py new file mode 100644 index 00000000..cb881ac0 --- /dev/null +++ b/advanced_tutorials/tiktok_recsys/python/setup/tiktok_video_window_agg_feature_group.py @@ -0,0 +1,57 @@ +import hopsworks + +from hsfs.feature import Feature +from datetime import datetime, timedelta, timezone + +project = hopsworks.login() +fs = project.get_feature_store() + +features = [ + Feature(name="video_id", type="bigint"), + Feature(name="category_id", type="bigint"), + + Feature(name="like_count", type="bigint"), + Feature(name="dislike_count", type="bigint"), + Feature(name="view_count", type="bigint"), + Feature(name="comment_count", type="bigint"), + Feature(name="share_count", type="bigint"), + Feature(name="skip_count", type="bigint"), + Feature(name="total_watch_time", type="bigint"), + + Feature(name="interaction_month", type="string"), + Feature(name="window_end_time", type="timestamp"), +] + +video_window_agg_1h_fg = fs.create_feature_group( + "video_window_agg_1h", + version=1, + statistics_config=False, + primary_key=["video_id"], + partition_key=["interaction_month"], + event_time="window_end_time", + online_enabled=True, + stream=True, +) + +video_window_agg_1h_fg.save(features) + +feature_descriptions = [ + {"name": "video_id", "description": "Identifier for the video."}, + {"name": "category_id", "description": "Id of the video category."}, + {"name": "window_end_time", "description": "End of the specified time window where interaction were aggregated."}, + {"name": "interaction_month", + "description": "Month of the end of the specified time window where interaction were aggregated. Derived from window_end_time"}, + {"name": "like_count", "description": "Number of likes video got over a specified time window."}, + {"name": "dislike_count", "description": "Number of dislikes video got over a specified time window."}, + {"name": "view_count", "description": "Number of views video got over a specified time window."}, + {"name": "comment_count", "description": "Number of comments video got over a specified time window."}, + {"name": "share_count", "description": "Number of likes over got over a specified time window."}, + {"name": "skip_count", "description": "Number of times video was skiped over a specified time window."}, + {"name": "total_watch_time", + "description": "Total time in seconds video was watched over a specified time window."}, +] + +for desc in feature_descriptions: + video_window_agg_1h_fg.update_feature_description(desc["name"], desc["description"]) + +# video_window_agg_1h_fg.materialization_job.schedule(cron_expression="0 */15 * ? * *", start_time=datetime.now(tz=timezone.utc))