@@ -269,15 +269,13 @@ def offline_fg_materialization(
    read_options = engine.get_instance()._get_kafka_config(entity.feature_store_id, {})

-    # get offsets
+    # get starting offsets
    offset_location = entity.prepare_spark_location() + "/kafka_offsets"
    try:
        if initial_check_point_string:
-            offset_string = json.dumps(
-                _build_starting_offsets(initial_check_point_string)
-            )
+            starting_offset_string = json.dumps(_build_offsets(initial_check_point_string))
        else:
-            offset_string = spark.read.json(offset_location).toJSON().first()
+            starting_offset_string = spark.read.json(offset_location).toJSON().first()
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        # if all else fails read from the beginning
@@ -286,15 +284,26 @@ def offline_fg_materialization(
            offline_write_options={},
            high=False,
        )
-        offset_string = json.dumps(_build_starting_offsets(initial_check_point_string))
-    print(f"startingOffsets: {offset_string}")
+        starting_offset_string = json.dumps(_build_offsets(initial_check_point_string))
+    print(f"startingOffsets: {starting_offset_string}")
+
+    # get ending offsets
+    ending_offset_string = kafka_engine.kafka_get_offsets(
+        topic_name=entity._online_topic_name,
+        feature_store_id=entity.feature_store_id,
+        offline_write_options={},
+        high=True,
+    )
+    ending_offset_string = json.dumps(_build_offsets(ending_offset_string))
+    print(f"endingOffsets: {ending_offset_string}")

    # read kafka topic
    df = (
        spark.read.format("kafka")
        .options(**read_options)
        .option("subscribe", entity._online_topic_name)
-        .option("startingOffsets", offset_string)
+        .option("startingOffsets", starting_offset_string)
+        .option("endingOffsets", ending_offset_string)
        .option("includeHeaders", "true")
        .option("failOnDataLoss", "false")
        .load()
@@ -326,8 +335,8 @@ def offline_fg_materialization(
    entity.insert(deserialized_df, storage="offline")

    # update offsets
-    df_offsets = (df if limit > filtered_df.count() else filtered_df).groupBy("partition").agg(max("offset").alias("offset")).collect()
-    offset_dict = json.loads(offset_string)
+    df_offsets = (df if limit > filtered_df.count() else filtered_df).groupBy('partition').agg(max('offset').alias('offset')).collect()
+    offset_dict = json.loads(starting_offset_string)
    for offset_row in df_offsets:
        offset_dict[entity._online_topic_name][f"{offset_row.partition}"] = (
            offset_row.offset + 1
@@ -350,8 +359,7 @@ def update_table_schema_fg(spark: SparkSession, job_conf: Dict[Any, Any]) -> None
    entity.stream = False
    engine.get_instance().update_table_schema(entity)

-
-def _build_starting_offsets(initial_check_point_string: str):
+def _build_offsets(initial_check_point_string: str):
    if not initial_check_point_string:
        return ""
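For context, Spark's Kafka batch source accepts both startingOffsets and endingOffsets as JSON strings that map a topic name to per-partition offsets, which is the shape _build_offsets presumably serializes above. A minimal standalone sketch of such a bounded read follows; the topic name, broker address, and offset values are placeholders, not taken from this change:

    import json
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Offsets keyed by topic and partition number; in startingOffsets the
    # special values -2 (earliest) and -1 (latest) are also accepted.
    starting_offsets = json.dumps({"example_topic": {"0": 0, "1": 0}})
    ending_offsets = json.dumps({"example_topic": {"0": 120, "1": 98}})

    # Bounded batch read between the two offset ranges.
    bounded_df = (
        spark.read.format("kafka")
        .option("kafka.bootstrap.servers", "broker:9092")
        .option("subscribe", "example_topic")
        .option("startingOffsets", starting_offsets)
        .option("endingOffsets", ending_offsets)
        .load()
    )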