@@ -71,7 +71,7 @@ def _add_validation_metadata(failed_check_fields_df: pl.DataFrame, check: SBLChe
 def validate(
-    schema: pa.DataFrameSchema, submission_df: pl.DataFrame, row_start: int
+    schema: pa.DataFrameSchema, submission_df: pl.LazyFrame, row_start: int, process_errors: bool
 ) -> pl.DataFrame:
     """
     validate received dataframe with schema and return list of
@@ -95,7 +95,7 @@ def validate(
         # `list[dict[str,Any]]`, but it's actually of type `SchemaError`
         schema_error: SchemaError

-        #if process_errors:
+        # if process_errors:
         for schema_error in err.schema_errors:
             check = schema_error.check
             column_name = schema_error.schema.name
@@ -116,17 +116,22 @@ def validate(
                     f'Check {check} type on {column_name} column not supported. Must be of type {SBLCheck}'
                 ) from schema_error

-            schema_error = gather_errors(schema_error)
+            # schema_error = gather_errors(schema_error)
+            # check_output: pl.Series = gather_errors(schema_error)

             fields = _get_check_fields(check, column_name)
-            check_output: pl.Series | None = schema_error.check_output
+            check_output = gather_check_errors(schema_error.check_output)

             if check_output is not None:
                 # Filter data not associated with failed Check, and update index for merging with findings_df
                 check_output = check_output.with_columns(pl.col('index').add(row_start))
-                failed_records_df = _filter_valid_records(submission_df, check_output, fields)
-                failed_record_fields_df = _records_to_fields(failed_records_df)
-                findings = _add_validation_metadata(failed_record_fields_df, check)
+                if process_errors:
+                    failed_records_df = _filter_valid_records(submission_df, check_output, fields)
+                    failed_record_fields_df = _records_to_fields(failed_records_df)
+                    findings = _add_validation_metadata(failed_record_fields_df, check)
+                else:
+                    findings = _add_validation_metadata(check_output, check)
+                    findings = findings.with_columns(pl.lit(check.scope).alias("scope"), pl.lit(check.severity.value).alias("validation_type"))
                 check_findings.append(findings)
             else:
                 # The above exception handling _should_ prevent this from ever happening, but...just in case.
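Note on the new branch above: when `process_errors` is False (the max-error cap has been hit), `validate` skips the `_filter_valid_records`/`_records_to_fields` join entirely and only tags the raw check output so findings can still be counted. A minimal sketch of that tagging step, using a `SimpleNamespace` stand-in for the real `SBLCheck` (the `scope`/`severity` attributes are assumptions here, not the package's actual class):

```python
# Sketch only: a stand-in check object; the real SBLCheck comes from the validator package.
from types import SimpleNamespace
import polars as pl

check = SimpleNamespace(scope="single-field", severity=SimpleNamespace(value="Error"))

# Failed rows as produced by gather_check_errors(): an index plus the boolean check column.
check_output = pl.DataFrame({"index": [3, 7], "check_output": [False, False]})

# The counting-only path: tag rows with scope/severity instead of joining back to records.
findings = check_output.with_columns(
    pl.lit(check.scope).alias("scope"),
    pl.lit(check.severity.value).alias("validation_type"),
)
print(findings)  # 2 rows, now carrying "scope" and "validation_type" columns
```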
@@ -136,8 +141,9 @@ def validate(
     if check_findings:
         findings_df = pl.concat(check_findings)

-        updated_df = add_uid(findings_df, submission_df, row_start)
-        return updated_df
+        return add_uid(findings_df, submission_df, row_start) if process_errors else findings_df
+        # updated_df = add_uid(findings_df, submission_df, row_start)
+        # return updated_df


 # Add the uid for the record throwing the error/warning to the error dataframe
@@ -182,7 +188,7 @@ def validate_batch_csv(

     if not has_syntax_errors:
         register_schema = get_register_schema(context)
-        validation_results = validate(register_schema, pl.DataFrame({"uid": all_uids}), 0)
+        validation_results = validate(register_schema, pl.DataFrame({"uid": all_uids}), 0, True)
         if not validation_results.is_empty():
             validation_results = format_findings(
                 validation_results,
@@ -199,7 +205,6 @@ def validate_batch_csv(
         )
         yield results

-    print("Processing other logic errors")
     for validation_results, _ in validate_chunks(
         logic_schema, real_path, batch_size, batch_count, max_errors, logic_checks
     ):
@@ -222,7 +227,7 @@ def validate_chunks(schema, path, batch_size, batch_count, max_errors, checks):
     row_start = 0
     while batches:
         df = pl.concat(batches)
-        validation_results = validate(schema, df, row_start)
+        validation_results = validate(schema, df, row_start, process_errors)
         if not validation_results.is_empty():

             validation_results = format_findings(validation_results, schema.name.value, checks)
@@ -235,9 +240,9 @@ def validate_chunks(schema, path, batch_size, batch_count, max_errors, checks):
             findings=validation_results if process_errors else pl.DataFrame(),
             phase=schema.name,
         )
-        print(f"Findings height: {validation_results.height}", flush=True)
+        # print(f"Findings height: {validation_results.height}", flush=True)
         total_count += (error_counts.total_count + warning_counts.total_count)
-        print(f"Total Count: {total_count}", flush=True)
+        # print(f"Total Count: {total_count}", flush=True)
         if total_count > max_errors and process_errors:
             process_errors = False
             head_count = results.findings.height - (total_count - max_errors)
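The cap logic above trims findings so the reported total lands exactly on `max_errors`. A quick worked example of that arithmetic (numbers are illustrative only, not from the PR):

```python
# Hedged sketch of the head-count trimming; values are made up.
import polars as pl

max_errors = 100
total_count = 130  # running error/warning total after counting this chunk
findings = pl.DataFrame({"validation_id": ["E0001"] * 80})  # this chunk's findings

# Drop the overflow: keep 80 - (130 - 100) = 50 rows so the total stops at the cap.
head_count = findings.height - (total_count - max_errors)
findings = findings.head(head_count)
assert findings.height == 50
```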
@@ -284,9 +289,9 @@ def validate_lazy_frame(


 def validate_register_level(context: Dict[str, str] | None, all_uids: List[str]):
-    print("Processing register logic errors")
+    # print("Processing register logic errors")
     register_schema = get_register_schema(context)
-    validation_results = validate(register_schema, pl.DataFrame({"uid": all_uids}), 0)
+    validation_results = validate(register_schema, pl.DataFrame({"uid": all_uids}), 0, True)
     if not validation_results.is_empty():
         validation_results = format_findings(
             validation_results,
@@ -301,14 +306,14 @@ def validate_register_level(context: Dict[str, str] | None, all_uids: List[str])
         findings=validation_results,
         phase=register_schema.name,
     )
-    print(f"Register counts: {error_counts} {warning_counts}", flush=True)
+    # print(f"Register counts: {error_counts} {warning_counts}", flush=True)
     return results


 def validate_chunk(schema, df, total_count, row_start, max_errors, process_errors, checks):
-    print(f"Start UID: {df['uid'][0]}, Last UID: {df['uid'][-1]}", flush=True)
-    validation_results = validate(schema, df, row_start)
-    if not validation_results.is_empty():
+    # print(f"Start UID: {df['uid'][0]}, Last UID: {df['uid'][-1]}", flush=True)
+    validation_results = validate(schema, df, row_start, process_errors)
+    if process_errors and not validation_results.is_empty():
         validation_results = format_findings(
             validation_results, schema.name.value, checks
         )
@@ -323,19 +328,19 @@ def validate_chunk(schema, df, total_count, row_start, max_errors, process_error
     )

     total_count += (error_counts.total_count + warning_counts.total_count)
-    print(f"Counts: {error_counts} {warning_counts}", flush=True)
+    # print(f"Counts: {error_counts} {warning_counts}", flush=True)
     if total_count > max_errors and process_errors:
-        print("Reached max errors, adjusting results", flush=True)
+        # print("Reached max errors, adjusting results", flush=True)
         process_errors = False
         head_count = results.findings.height - (total_count - max_errors)
-        print(f"Results height: {results.findings.height}, total count: {total_count}, head count: {head_count}", flush=True)
+        # print(f"Results height: {results.findings.height}, total count: {total_count}, head count: {head_count}", flush=True)
         results.findings = results.findings.head(head_count)
-        print(f"Results height after heading {results.findings.height}", flush=True)
+        # print(f"Results height after heading {results.findings.height}", flush=True)

-    if not results.findings.is_empty():
-        result = results.findings.group_by("validation_id").agg([pl.count().alias("count")]).sort("validation_id")
-        result_dict = dict(zip(result["validation_id"], result["count"]))
-        print(f"{result_dict}\nTotal Results: {results.findings.height}", flush=True)
+    # if not results.findings.is_empty():
+    #     result = results.findings.group_by("validation_id").agg([pl.count().alias("count")]).sort("validation_id")
+    #     result_dict = dict(zip(result["validation_id"], result["count"]))
+    #     print(f"{result_dict}\nTotal Results: {results.findings.height}", flush=True)


     return results, total_count, process_errors
@@ -364,6 +369,9 @@ def get_real_file_path(path):
         return f.name
     return path

+def gather_check_errors(check_output: pl.DataFrame):
+    return check_output.with_row_index().filter(~pl.col("check_output"))
+

 # This function adds an index column (polars dataframes do not normally have one), and filters out
 # any row that did not fail a check.
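For context on the new helper: `with_row_index()` prepends an `index` column and the filter keeps only rows whose boolean `check_output` is False, i.e. the failing records. A toy run (my example, not from the PR):

```python
import polars as pl

def gather_check_errors(check_output: pl.DataFrame):
    return check_output.with_row_index().filter(~pl.col("check_output"))

df = pl.DataFrame({"check_output": [True, False, True, False]})
print(gather_check_errors(df))
# shape: (2, 2): keeps rows 1 and 3 with their original positions in "index",
# which validate() later shifts by row_start before merging with findings_df.
```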