-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Runs cleanfile but no results output
- Loading branch information
1 parent
8199626
commit 9e51841
Showing
15 changed files
with
1,818 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import datetime | ||
import logging | ||
import os | ||
from pathlib import Path | ||
import yaml | ||
from string import Template | ||
|
||
from liiatools.spec import common as common_asset_dir | ||
|
||
log = logging.getLogger(__name__) | ||
|
||
COMMON_CONFIG_DIR = Path(common_asset_dir.__file__).parent | ||
|
||
|
||
class Config(dict):
    """
    Dictionary of configuration values loaded from one or more YAML files.

    After the files are loaded, ``config_date`` (ISO timestamp of construction) and,
    where available, ``username`` (the login name) are added as extra keys.
    """

    def __init__(self, *config_files):
        """
        Load every file in ``config_files`` in order; later files override earlier ones.

        The special name "DEFAULT_DATA_CODES" stands for the bundled ``LA-codes.yml``
        file, and is what gets loaded when no files are given at all.
        """
        super().__init__()

        if not config_files:
            config_files = [
                "DEFAULT_DATA_CODES",
            ]

        for file in config_files:
            if file == "DEFAULT_DATA_CODES":
                file = COMMON_CONFIG_DIR / "LA-codes.yml"
            self.load_config(file, conditional=False)

        self["config_date"] = datetime.datetime.now().isoformat()
        try:
            self["username"] = os.getlogin()
        except OSError:
            # This happens when tests are not run under a login shell, e.g. CI pipeline.
            # The "username" key is deliberately left unset in that case.
            pass

    def load_config(self, filename, conditional=False, warn=False):
        """
        Load configuration from a yaml file and merge it into this Config.

        Values loaded from the file overwrite any values already present under
        the same key.

        Files can contain ${} placeholders following the Python string.Template format.
        The context will include any keys already existing in the configuration, any keys
        from the current file - however, if these include placeholders, the placeholders
        will not be replaced. Finally, environment variables can be referenced with
        `os_environ_VARIABLE_NAME`.

        An empty file (or one containing only comments) contributes no values.

        Keyword arguments:
        filename -- Filename to load from
        conditional -- If True, ignore file if it doesn't exist. If False, fail. (default False)
        warn -- If True, log a warning when an optional file is skipped. (default False)

        Raises:
        KeyError -- a placeholder in the file names a key that is not in the context
        ValueError -- the file contains a malformed placeholder
        """
        if conditional and not os.path.isfile(filename):
            if warn:
                log.warning("Missing optional file {}".format(filename))

            return

        # Read the file once; the same text is parsed before and after substitution.
        with open(filename, "rt") as config_file:
            raw_config_string = config_file.read()

        # NOTE(security): yaml.load with FullLoader should only be used on trusted
        # configuration files. yaml.load returns None for an empty document, hence
        # the fallback to an empty dict.
        user_config = yaml.load(raw_config_string, Loader=yaml.FullLoader) or {}

        log.info(
            "Loading {} configuration values from '{}'.".format(
                len(user_config), filename
            )
        )

        environment_dict = {"os_environ_{}".format(k): v for k, v in os.environ.items()}

        # Later updates win: existing config < this file's raw values < environment.
        variables = dict(self)
        variables.update(user_config)
        variables.update(environment_dict)

        user_config_template = Template(raw_config_string)
        user_config_string = user_config_template.substitute(variables)

        # Second pass: parse the text with placeholders resolved.
        user_config = yaml.load(user_config_string, Loader=yaml.FullLoader) or {}

        self.update(user_config)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from sfdata_stream_parser.checks import type_check | ||
from sfdata_stream_parser import events | ||
from sfdata_stream_parser.filters.generic import streamfilter, pass_event | ||
|
||
|
||
@streamfilter(check=type_check(events.TextNode), fail_function=pass_event)
def convert_true_false(event):
    """
    Rewrite the text of events whose schema type is "yesnotype": "false" becomes "0" and
    "true" becomes "1" (matched case-insensitively). Any other event is returned untouched.

    :param event: The event object to inspect
    :return: Either the original event or a copy of it carrying the converted text
    """
    if not hasattr(event, "schema"):
        return event
    if event.schema.type.name != "yesnotype":
        return event

    # Only the two boolean spellings are converted; every other value is left as it is.
    replacement = {"false": "0", "true": "1"}.get(event.text.lower())
    if replacement is None:
        return event
    return event.from_event(event, text=replacement)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
from pathlib import Path | ||
import pandas as pd | ||
import logging | ||
|
||
from liiatools.datasets.shared_functions import converters, common | ||
|
||
log = logging.getLogger(__name__) | ||
|
||
|
||
def convert_to_dataframe(data):
    """
    Export the given dataset using its "df" export format.

    :param data: An object exposing an export(format) method
    :return: Whatever data.export("df") produces
    """
    return data.export("df")
|
||
|
||
def get_year(data, year):
    """
    Store the year of return under the "YEAR" key of the data, modifying it in place.

    :param data: The dataframe to be updated
    :param year: The year value to store
    :return: The same data object, now carrying "YEAR"
    """
    column_name = "YEAR"
    data[column_name] = year
    return data
|
||
|
||
def convert_to_datetime(data):
    """
    Parse the "PersonBirthDate" column into datetimes, modifying the dataframe in place.

    :param data: The dataframe holding a "PersonBirthDate" column
    :return: The same dataframe with "PersonBirthDate" converted by pd.to_datetime
    """
    birth_dates = data["PersonBirthDate"]
    data["PersonBirthDate"] = pd.to_datetime(birth_dates)
    return data
|
||
|
||
def _get_person_school_year(datevalue): | ||
if datevalue.month >= 9: | ||
school_year = datevalue.year | ||
elif datevalue.month <= 8: | ||
school_year = datevalue.year - 1 | ||
else: | ||
school_year = None | ||
return school_year | ||
|
||
|
||
def add_school_year(data):
    """
    Derive "PersonSchoolYear" from each value of "PersonBirthDate", modifying the dataframe in place.

    :param data: The dataframe holding a "PersonBirthDate" column
    :return: The same dataframe with a "PersonSchoolYear" column set
    """
    birth_dates = data["PersonBirthDate"]
    data["PersonSchoolYear"] = birth_dates.apply(_get_person_school_year)
    return data
|
||
|
||
def add_la_name(data, la_name):
    """
    Store the local authority name under the "LA" key of the data, modifying it in place.

    :param data: The dataframe to be updated
    :param la_name: LA name
    :return: The same data object, now carrying "LA"
    """
    column_name = "LA"
    data[column_name] = la_name
    return data
|
||
|
||
def la_prefix(data, la_code):
    """
    Append an underscore followed by the LA code to every "Surname" value, in place.

    :param data: The dataframe holding a "Surname" column
    :param la_code: LA code
    :return: The same data object with "Surname" updated
    """
    surname_with_separator = data["Surname"] + "_"
    data["Surname"] = surname_with_separator + la_code
    return data
|
||
|
||
def degrade_dob(data):
    """
    Pass every "PersonBirthDate" value through converters.to_month_only_dob, in place.

    :param data: The dataframe holding a "PersonBirthDate" column
    :return: The same data object with "PersonBirthDate" converted
    """
    birth_dates = data["PersonBirthDate"]
    # NOTE(review): for a dataframe column this check looks like it is always true - confirm intent.
    if birth_dates is None:
        return data

    data["PersonBirthDate"] = birth_dates.apply(converters.to_month_only_dob)
    return data
|
||
|
||
def add_fields(input_year, data, la_name, la_code):
    """
    Export the data to a dataframe, then enrich and degrade it.

    Steps, in order: export to dataframe, set YEAR, parse PersonBirthDate, derive
    PersonSchoolYear, set LA, append the LA code to Surname, degrade PersonBirthDate.

    :param input_year: A string of the year of return for the current file
    :param data: The dataset to be exported and cleaned
    :param la_name: LA name
    :param la_code: LA code
    :return: Cleaned and degraded dataframe
    """
    frame = convert_to_dataframe(data)
    frame = get_year(frame, input_year)
    frame = convert_to_datetime(frame)
    # The school year needs real datetimes, so it is derived after parsing and before degrading.
    frame = add_school_year(frame)
    frame = add_la_name(frame, la_name)
    frame = la_prefix(frame, la_code)
    return degrade_dob(frame)
|
||
|
||
def export_file(input, output, data):
    """
    Write the data as CSV into the output directory, named "<input file stem>_clean.csv".

    :param input: Path of the original input file; only its stem is used
    :param output: Directory in which the csv file is written
    :param data: The dataframe to write (row index is not included)
    :return: None
    """
    destination = Path(output) / (Path(input).stem + "_clean.csv")
    data.to_csv(destination, index=False)
Oops, something went wrong.