Skip to content

Commit

Permalink
add blank row error
Browse files Browse the repository at this point in the history
  • Loading branch information
patrick-troy committed Feb 9, 2024
1 parent d2b5bd2 commit d4a7cee
Show file tree
Hide file tree
Showing 2 changed files with 152 additions and 25 deletions.
12 changes: 11 additions & 1 deletion liiatools/common/stream_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,14 +142,24 @@ def add_table_name(event, schema: DataSchema):
if not headers:
table_name = None
else:
if all(header == "" for header in headers):
return EventErrors.add_to_event(
event,
type="BlankHeaders",
message=f"Could not identify headers as first row is blank",
)
table_name = schema.get_table_from_headers(event.headers)

if table_name:
return event.from_event(
event, table_name=table_name, table_spec=schema.column_map[table_name]
)
else:
return event
return EventErrors.add_to_event(
event,
type="UnidentifiedTable",
message=f"Failed to identify table based on headers",
)


@streamfilter(check=type_check(events.Cell), fail_function=pass_event)
Expand Down
165 changes: 141 additions & 24 deletions tests/common/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,37 +73,84 @@ def get_table_name(headers):
stream = [events.StartTable(headers=headers)]
stream = stream_filters.add_table_name(stream, schema=schema)
event = list(stream)[0]
return getattr(event, "table_name", None)
table_name = getattr(event, "table_name", None)
errors = getattr(event, "errors", None)
return {"table_name": table_name, "errors": errors}

assert (
get_table_name(["CHILD", "SEX", "DOB", "ETHNIC", "UPN", "MOTHER", "MC_DOB"])
get_table_name(["CHILD", "SEX", "DOB", "ETHNIC", "UPN", "MOTHER", "MC_DOB"])[
"table_name"
]
== "Header"
)

for table_name, table_data in schema.table.items():
headers = list(table_data.keys())
assert get_table_name(headers) == table_name

assert get_table_name(["incorrect", "header", "values"]) is None
assert get_table_name(headers)["table_name"] == table_name

assert get_table_name(["incorrect", "header", "values"])["table_name"] is None
assert list(get_table_name(["incorrect", "header", "values"])["errors"]) == [
{
"message": "Failed to identify table based on headers",
"type": "UnidentifiedTable",
}
]

assert get_table_name([""]) is None
assert get_table_name([""])["table_name"] is None
assert list(get_table_name([""])["errors"]) == [
{
"message": "Could not identify headers as first row is blank",
"type": "BlankHeaders",
}
]

assert get_table_name([]) is None
assert get_table_name([])["table_name"] is None
assert list(get_table_name([])["errors"]) == [
{
"message": "Failed to identify table based on headers",
"type": "UnidentifiedTable",
}
]

assert get_table_name(None) is None
assert get_table_name(None)["table_name"] is None
assert list(get_table_name(None)["errors"]) == [
{
"message": "Failed to identify table based on headers",
"type": "UnidentifiedTable",
}
]

schema = annex_a_schema()

assert (
get_table_name(["Child Unique ID", "Gender", "Ethnicity", "Date of Birth",
"Age of Child (Years)", "Date of Contact", "Contact Source"])
get_table_name(
[
"Child Unique ID",
"Gender",
"Ethnicity",
"Date of Birth",
"Age of Child (Years)",
"Date of Contact",
"Contact Source",
]
)["table_name"]
== "List 1"
)

assert (
get_table_name(["Child ID", "Gender", "Ethnicity", "Date Birth",
"Age", "Age", "Date of Contact", "Contact Source"])
== "List 1"
get_table_name(
[
"Child ID",
"Gender",
"Ethnicity",
"Date Birth",
"Age",
"Age",
"Date of Contact",
"Contact Source",
]
)["table_name"]
== "List 1"
)


Expand Down Expand Up @@ -134,11 +181,18 @@ def match_cell(**cell_properties):

schema = annex_a_schema()

assert match_cell(table_name="List 1", header="Child Unique ID").string == "alphanumeric"
assert (
match_cell(table_name="List 1", header="Child Unique ID").string
== "alphanumeric"
)

assert match_cell(table_name="List 1", header="Child Unique ID").header_regex == ["/.*child.*id.*/i"]
assert match_cell(table_name="List 1", header="Child Unique ID").header_regex == [
"/.*child.*id.*/i"
]

assert match_cell(table_name="List 1", header="Gender").category[0].code == "b) Female"
assert (
match_cell(table_name="List 1", header="Gender").category[0].code == "b) Female"
)


def assert_errors(event, *types):
Expand Down Expand Up @@ -241,13 +295,9 @@ def test_clean_categories():
{
"code": "b) Female",
"name": "F",
"cell_regex": ["/.*fem.*/i", "/b\).*/i"]
"cell_regex": ["/.*fem.*/i", "/b\).*/i"],
},
{
"code": "a) Male",
"name": "M",
"cell_regex": ["/^mal.*/i", "/a\).*/i"]
}
{"code": "a) Male", "name": "M", "cell_regex": ["/^mal.*/i", "/a\).*/i"]},
],
)

Expand All @@ -267,7 +317,7 @@ def test_clean_categories():
assert_errors(cleaned_event, "ConversionError")


def test_clean_integers():
def test_clean_numeric():
integer_spec = Column(numeric={"type": "integer"})
event = events.Cell(cell=123, column_spec=integer_spec)
cleaned_event = list(stream_filters.conform_cell_types(event))[0]
Expand Down Expand Up @@ -299,8 +349,31 @@ def test_clean_integers():
assert cleaned_event.cell == ""
assert_errors(cleaned_event, "ConversionError")

float_spec = Column(
numeric={"type": "float", "min_value": 0, "max_value": 1, "decimal_places": 2}
)
event = events.Cell(cell=0.123, column_spec=float_spec)
cleaned_event = list(stream_filters.conform_cell_types(event))[0]
assert cleaned_event.cell == 0.12
assert_errors(cleaned_event)

event = events.Cell(cell="0.2", column_spec=float_spec)
cleaned_event = list(stream_filters.conform_cell_types(event))[0]
assert cleaned_event.cell == 0.2
assert_errors(cleaned_event)

event = events.Cell(cell="string", column_spec=float_spec)
cleaned_event = list(stream_filters.conform_cell_types(event))[0]
assert cleaned_event.cell == ""
assert_errors(cleaned_event, "ConversionError")

event = events.Cell(cell=-1, column_spec=float_spec)
cleaned_event = list(stream_filters.conform_cell_types(event))[0]
assert cleaned_event.cell == ""
assert_errors(cleaned_event, "ConversionError")


def test_conform_cell_types():
def test_clean_postcodes():
pc_spec = Column(string="postcode")
event = events.Cell(cell="G62 7PS", column_spec=pc_spec)
cleaned_event = list(stream_filters.conform_cell_types(event))[0]
Expand Down Expand Up @@ -331,3 +404,47 @@ def test_conform_cell_types():
cleaned_event = list(stream_filters.conform_cell_types(event))[0]
assert cleaned_event.cell == ""
assert_errors(cleaned_event)


def test_clean_regex():
    """Exercise ``conform_cell_types`` on cells whose column spec is a regex string.

    Each case feeds one raw cell value through the filter and checks the
    resulting ``cell`` value together with the error types handed to
    ``assert_errors`` (an empty tuple means it is called with no error types).
    """

    def conform(raw_value, spec):
        # Build a single Cell event and take the first event the filter emits.
        cell_event = events.Cell(cell=raw_value, column_spec=spec)
        return list(stream_filters.conform_cell_types(cell_event))[0]

    def check_cases(spec, cases):
        # cases: (raw cell value, expected cleaned cell, expected error types)
        for raw_value, expected_cell, expected_errors in cases:
            result = conform(raw_value, spec)
            assert result.cell == expected_cell
            assert_errors(result, *expected_errors)

    # Two letters followed by ten digits.
    check_cases(
        Column(string="regex", cell_regex=r"[A-Za-z]{2}\d{10}"),
        [
            ("AB1234567890", "AB1234567890", ()),
            # Surrounding whitespace is expected to be removed.
            (" AB1234567890 ", "AB1234567890", ()),
            # Trailing extra characters and too-short values are rejected.
            ("AB1234567890abcd", "", ("ConversionError",)),
            ("AB123", "", ("ConversionError",)),
            # Blank and missing values come back as "" with no error types.
            ("", "", ()),
            (None, "", ()),
        ],
    )

    # One letter, eleven digits, then a final digit or letter.
    check_cases(
        Column(string="regex", cell_regex=r"[A-Za-z]\d{11}(\d|[A-Za-z])"),
        [
            ("A123456789012", "A123456789012", ()),
            ("A12345678901B", "A12345678901B", ()),
        ],
    )

0 comments on commit d4a7cee

Please sign in to comment.