diff --git a/graphreduce/__main__.py b/graphreduce/__main__.py
new file mode 100644
index 0000000..7cae889
--- /dev/null
+++ b/graphreduce/__main__.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+
+import sys
+import traceback
+
+import typer
+
+from .cli.entry_point import entrypoint_cli
+
+
+
+
+def main():
+
+    # placeholder for any Python 3.8-specific handling
+    if sys.version_info[:2] == (3, 8):
+        pass
+
+
+    try:
+        entrypoint_cli()
+    except Exception:
+        # print the full traceback; exc.__cause__ may be None, so don't rely on it
+        traceback.print_exc()
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/graphreduce/cli/__init__.py b/graphreduce/cli/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/graphreduce/cli/auto_fe.py b/graphreduce/cli/auto_fe.py
new file mode 100644
index 0000000..ea233ff
--- /dev/null
+++ b/graphreduce/cli/auto_fe.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+
+import sqlite3
+import json
+import os
+import typing
+import datetime
+
+import typer
+from typer import Argument, Option
+import pandas as pd
+
+
+# examples for using SQL engines and dialects
+from graphreduce.node import SQLNode, DynamicNode
+from graphreduce.graph_reduce import GraphReduce
+from graphreduce.enum import SQLOpType, ComputeLayerEnum, PeriodUnit
+from graphreduce.models import sqlop
+from graphreduce.context import method_requires
+
+
+auto_fe_cli = typer.Typer(name="auto_fe", help="Perform automated feature engineering", no_args_is_help=True)
+
+
+
+
+@auto_fe_cli.command("autofefs")
+def autofe_filesystem(
+        # directory or sqlite db
+        data_path: str = Argument(help="Path to data"),
+        # 'csv', 'parquet', etc.
+        fmt: str = Argument(help="File format"),
+        # {fname: 'prefix'}
+        prefixes: str = Argument(help='JSON dict of filenames to prefixes (e.g., `{"test.csv": "test"}`)'),
+        # {fname: 'ts'}
+        date_keys: str = Argument(help='JSON dict of filenames to their date key (e.g., `{"test.csv": "ts"}`)'),
+        # [{'from_node': fname, 'from_key': key, 'to_node': fname, 'to_key': key, 'reduce': True}]
+        relationships: str = Argument(
+            help='JSON list of relationships (e.g., `[{"from_node": "fname", "from_key": "cust_id", "to_node": "tname", "to_key": "id", "reduce": true}]`)'),
+        parent_node: str = Argument(
+            help="parent/root node to which to aggregate all of the data"
+        ),
+        cut_date: str = Argument(str(datetime.datetime.today())),
+        # 'pandas', 'dask', 'sql'
+        compute_layer: str = Argument("pandas"),
+        hops_front: int = Argument(1),
+        hops_back: int = Argument(3),
+        output_path: typing.Optional[str] = Option(None, '-op', '--output-path', help='output path for the data')
+        ):
+    """
+    Main automated feature engineering function.
+    """
+
+    prefixes = json.loads(prefixes)
+    date_keys = json.loads(date_keys)
+    relationships = json.loads(relationships)
+
+    nodes = {}
+    if fmt in ['csv', 'parquet', 'delta', 'iceberg']:
+        for f in os.listdir(data_path):
+            print(f"adding file {f}")
+            nodes[f] = DynamicNode(
+                fpath=f"{data_path}/{f}",
+                fmt=f.split('.')[-1],
+                prefix=prefixes.get(f),
+                compute_layer=getattr(ComputeLayerEnum, compute_layer),
+                date_key=date_keys.get(f, None)
+            )
+
+    gr = GraphReduce(
+        name='autofe',
+        parent_node=nodes[parent_node],
+        fmt=fmt,
+        cut_date=pd.to_datetime(cut_date),
+        compute_layer=getattr(ComputeLayerEnum, compute_layer),
+        auto_features=True,
+        auto_feature_hops_front=hops_front,
+        auto_feature_hops_back=hops_back
+    )
+
+    for rel in relationships:
+        gr.add_entity_edge(
+            parent_node=nodes[rel['to_node']],
+            parent_key=rel['to_key'],
+            relation_node=nodes[rel['from_node']],
+            relation_key=rel['from_key'],
+            reduce=rel.get('reduce', True)
+        )
+
+    gr.do_transformations()
+    if not output_path:
+        output_path = os.path.join(
+            os.path.expanduser("~"),
+            "graphreduce_outputs/test.csv"
+        )
+
+    getattr(gr.parent_node.df, f"to_{fmt}")(output_path)
diff --git a/graphreduce/cli/entry_point.py b/graphreduce/cli/entry_point.py
new file mode 100644
index 0000000..b180a93
--- /dev/null
+++ b/graphreduce/cli/entry_point.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+
+import typing
+
+import typer
+
+
+from .auto_fe import auto_fe_cli
+
+
+entrypoint_cli_typer = typer.Typer(
+    no_args_is_help=True,
+    add_completion=False,
+    rich_markup_mode="markdown",
+    help="""
+    See examples at https://github.com/wesmadrigal/graphreduce
+    """
+)
+
+# Automated feature engineering
+entrypoint_cli_typer.add_typer(auto_fe_cli, rich_help_panel="autofe")
+
+
+entrypoint_cli = typer.main.get_command(entrypoint_cli_typer)
+entrypoint_cli.list_commands(None)
+
+
+if __name__ == '__main__':
+    entrypoint_cli()
diff --git a/setup.py b/setup.py
index 027d622..dc53ea7 100644
--- a/setup.py
+++ b/setup.py
@@ -38,11 +38,9 @@
     author="Wes Madrigal",
     author_email="wes@madconsulting.ai",
     license="MIT",
-
     description="Leveraging graph data structures for complex feature engineering pipelines.",
     long_description = pathlib.Path("README.md").read_text(),
     long_description_content_type = "text/markdown",
-
     keywords = ", ".join(KEYWORDS),
     classifiers = [
         "Programming Language :: Python :: 3",
@@ -60,6 +58,6 @@
         "Source" : "http://github.com/wesmadrigal/graphreduce",
         "Issue Tracker" : "https://github.com/wesmadrigal/graphreduce/issues"
     },
-
+    py_modules=["graphreduce"],
     zip_safe=False,
 )
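For reference, a minimal sketch of how the new `autofefs` command could be exercised in-process with typer's test runner. The data directory, file names, join keys, and output path below are hypothetical, and the snippet assumes a checkout where graphreduce and these new cli modules are importable:

import json

from typer.testing import CliRunner

from graphreduce.cli.entry_point import entrypoint_cli_typer

# drive the top-level Typer app registered in entry_point.py
runner = CliRunner()
result = runner.invoke(
    entrypoint_cli_typer,
    [
        "auto_fe", "autofefs",
        "/tmp/demo_data",                                        # data_path: directory of flat files
        "csv",                                                   # fmt
        json.dumps({"cust.csv": "cust", "orders.csv": "ord"}),   # prefixes
        json.dumps({"orders.csv": "ts"}),                        # date_keys
        json.dumps([{                                            # relationships
            "from_node": "orders.csv", "from_key": "cust_id",
            "to_node": "cust.csv", "to_key": "id", "reduce": True,
        }]),
        "cust.csv",                                              # parent_node
        "--output-path", "/tmp/cust_features.csv",
    ],
)
print(result.exit_code)
print(result.output)

The same argument list maps onto a shell invocation through the new module entry point (`python -m graphreduce auto_fe autofefs ...`) once the package is installed; cut_date, compute_layer, and the hop counts are omitted here and fall back to the defaults declared in the command signature.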