philiporlando · cnolanminich · Dec 11, 2024 · Dec 11, 2024 · Dec 11, 2024 · Dec 13, 2024
diff --git a/.Rprofile b/.Rprofile
@@ -0,0 +1 @@
+source("renv/activate.R")
diff --git a/README.md b/README.md
@@ -20,16 +20,27 @@ To begin exploring the integration of Dagster and R:
    ```bash
    cd dagster-and-r
    ```
-3. **Install Dependencies**
-   Using [poetry](https://python-poetry.org/), install the package and its dependencies:
-   ```bash
-   poetry install
+3. **Install Python Dependencies**
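+   You'll need a version of Python installed. Using [uv](https://docs.astral.sh/uv/), create a virtual environment and install the dependencies:
+   ```bash
+   # install uv (if you haven't already)
+   # curl -LsSf https://astral.sh/uv/install.sh | sh
+   uv venv
+   source .venv/bin/activate
+   uv sync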
    ```
+4. **Install R Dependencies**
+```
+   # from R
+   # if you haven't installed renv before
+   # install.packages("renv")
+   # renv::restore() 
+```
 
 4. **Set RETICULATE_PYTHON environment variable** 
 Determine the path to the python binary associated with this project's poetry environment.
    ```bash
-   poetry run
+   # from your virtual environment
    which python
    # /home/user/.cache/pypoetry/virtualenvs/dagster-and-r-kS5e8P_l-py3.10/bin/python
    ```
@@ -38,7 +49,7 @@ Create a new `.Renviron` file at the root of the project and set the `RETICULATE
 5. **Launch the Dagster UI**
    Start the Dagster web server:
    ```bash
-   poetry run dagster dev
+   dagster dev
    ```
    Access the UI at http://localhost:3000 in your browser.
 
@@ -61,7 +72,7 @@ Create a new `.Renviron` file at the root of the project and set the `RETICULATE
 Then, start the Dagster UI web server:
 
 ```bash
-poetry run dagster dev -m dagster_and_r
+dagster dev -m dagster_and_r
 ```
 
 Open http://localhost:3000 with your browser to see the project.
@@ -85,21 +96,21 @@ Open http://localhost:3000 with your browser to see the project.
 ### Adding Python Dependencies
 To add new Python packages to the project:
 ```bash
-poetry add <pkg-name>
+uv add <pkg-name>
 ```
 
 ### Unit Testing
 Unit tests are essential for ensuring code reliability and are currently being developed. Run existing tests using `pytest`:
 ```bash
-poetry run pytest dagster_and_r_tests
+pytest dagster_and_r_tests
 ```
 > [!NOTE]
 > Unit tests are a work in progress.
 
 ### Schedules and Sensors
 To enable [Schedules](https://docs.dagster.io/concepts/partitions-schedules-sensors/schedules) and [Sensors](https://docs.dagster.io/concepts/partitions-schedules-sensors/sensors), ensure the [Dagster Daemon](https://docs.dagster.io/deployment/dagster-daemon) is active:
 ```bash
-poetry run dagster dev
+dagster dev
 ```
 With the Daemon running, you can start using schedules and sensors for your jobs.
 

diff --git a/dagster_and_r/R/iris.R b/dagster_and_r/R/iris.R
@@ -1,3 +1,4 @@
+# make sure these packages are installed
 library(reticulate)
 library(readr)
 library(glue)
@@ -7,6 +8,36 @@ library(magrittr)
 reticulate::py_config()
 stopifnot(reticulate::py_module_available("dagster_pipes"))
 
+# Map each column of a data frame to the name of the matching Python type.
+#
+# @param df A data frame (or list of columns).
+# @return A reticulate Python object built from a named list of
+#   column name -> Python type name; unknown classes map to "object".
+convert_r_to_python_types <- function(df) {
+    # R class -> Python type name
+    type_mapping <- c(
+        "numeric" = "float",
+        "integer" = "int",
+        "character" = "str",
+        "factor" = "str",
+        "logical" = "bool",
+        "Date" = "datetime.date",
+        "POSIXct" = "datetime.datetime",
+        "POSIXlt" = "datetime.datetime"
+    )
+
+    # class() may return several classes (e.g. c("POSIXct", "POSIXt") or
+    # c("ordered", "factor")), so take the first one with a known mapping
+    # rather than testing the whole class vector inside `if`.
+    python_types <- vapply(df, function(column) {
+        known <- intersect(class(column), names(type_mapping))
+        if (length(known) > 0) type_mapping[[known[1]]] else "object"
+    }, character(1))
+
+    reticulate::r_to_py(as.list(python_types))
+}
+
+
 # Import Python modules
 # R doesn't support selective imports like Python, so you have to do this
 # to avoid typing the full namespace path repeatedly...
@@ -23,9 +54,20 @@ with(open_dagster_pipes() %as% pipes, {
     context$log$info(head(iris))
     context$log$info(os$environ["MY_ENV_VAR_IN_SUBPROCESS"])
     output_dir <- Sys.getenv("OUTPUT_DIR")
+    iris_head <- head(iris)
     context$log$info(glue::glue("output_dir: {output_dir}"))
-    context$report_asset_materialization() 
-
+    #python function to report back the materialization and metadata
+    context$report_custom_message(
+        payload = reticulate::r_to_py(list(
+        "dagster/row_count" = nrow(iris),
+         # if using report_asset_materialization 
+         #list( type = "md", "raw_value" = paste(knitr::kable(iris_head, format = "pipe"), collapse = "\n") ),
+        "preview" = paste(knitr::kable(iris_head, format = "pipe"), collapse = "\n"),
+        "iris_head_df" = reticulate::r_to_py(jsonlite::toJSON(x = iris_head, dataframe = "columns")),
+        "column_types" = convert_r_to_python_types(iris_head)
+    ))
+    )
+    context$log$info(glue::glue("got here!"))
     # Ensure that Sepal.Length field does not contain any NAs
     context$report_asset_check(
         asset_key="iris_r",

diff --git a/dagster_and_r/README.md b/dagster_and_r/README.md
diff --git a/dagster_and_r/__init__.py b/dagster_and_r/__init__.py
@@ -1,30 +0,0 @@
-from dagster import (
-    Definitions,
-    PipesSubprocessClient,
-    )
-from . jobs import docker_container_op_r
-from . asset_checks import (
-    # no_missing_sepal_length_check_r,
-    no_missing_sepal_length_check_py,
-    )
-
-# python_assets = load_assets_from_modules([assets])
-from . assets import (
-    hello_world_r,
-    iris_r,
-    iris_py,
-    )
-
-defs = Definitions(
-    assets=[
-        hello_world_r,
-        iris_r,
-        iris_py,
-        ],
-    asset_checks=[
-        # no_missing_sepal_length_check_r,
-        no_missing_sepal_length_check_py,
-        ],
-    jobs=[docker_container_op_r],
-    resources={"pipes_subprocess_client": PipesSubprocessClient()},
-)

diff --git a/dagster_and_r/assets.py b/dagster_and_r/assets.py
@@ -1,56 +1,90 @@
+import json
 import shutil
 import pandas as pd
+import dagster as dg 
+from dagster_pandas.data_frame import create_table_schema_metadata_from_dataframe
 
-from dagster import (
-    AssetExecutionContext,
-    asset,
-    AssetCheckSpec,
-    MaterializeResult,
-    PipesSubprocessClient,
-    file_relative_path,
-    Field, 
-    String,
+def create_table_schema_from_dict(type_dict):
+    # Map Python types to SQL types
+    type_mapping = {
+        'float': 'FLOAT',
+        'int': 'INTEGER',
+        'numeric': 'numeric',
+        'str': 'VARCHAR',
+        'bool': 'BOOLEAN',
+        'datetime.date': 'DATE',
+        'datetime.datetime': 'TIMESTAMP'
+    }
+
+    columns = []
+    for col_name, col_type in type_dict.items():
+        columns.append(
+            dg.TableColumn(
+                name=col_name,
+                type=type_mapping.get(col_type, 'VARCHAR'),
+                description=f"Column {col_name} of type {col_type}",
+            )
+        )
+
+    return dg.TableSchema(
+        columns=columns,
     )
 
-@asset
+
+# Example that runs an R script without modification: the script runs, but reports nothing to Dagster other than success.
+@dg.asset
 def hello_world_r(
-    context: AssetExecutionContext,
-    pipes_subprocess_client: PipesSubprocessClient,
-) -> MaterializeResult:
-    cmd = [shutil.which("Rscript"), file_relative_path(__file__, "./R/hello_world.R")]
+    context: dg.AssetExecutionContext,
+    pipes_subprocess_client: dg.PipesSubprocessClient,
+) -> dg.MaterializeResult:
+    cmd = [shutil.which("Rscript"), dg.file_relative_path(__file__, "./R/hello_world.R")]  # Rscript looked up on PATH; script path is relative to this file
     return pipes_subprocess_client.run(
         command=cmd,
         context=context,
     ).get_materialize_result()
 
 
-@asset(
-    config_schema={"output_dir": Field(String, default_value="./data")},
+
+
+@dg.asset(
+    config_schema={"output_dir": dg.Field(dg.String, default_value="./data")},  # op config key read below via context.op_config
     check_specs=[
-        AssetCheckSpec(name="no_missing_sepal_length_check_r", asset="iris_r"),
-        AssetCheckSpec(name="no_missing_sepal_width_check_r", asset="iris_r"),
-        AssetCheckSpec(name="no_missing_petal_length_check_r", asset="iris_r"),
-        AssetCheckSpec(name="no_missing_petal_width_check_r", asset="iris_r"),
-        AssetCheckSpec(name="species_name_check_r", asset="iris_r"),
+        dg.AssetCheckSpec(name="no_missing_sepal_length_check_r", asset="iris_r"),
+        dg.AssetCheckSpec(name="no_missing_sepal_width_check_r", asset="iris_r"),
+        dg.AssetCheckSpec(name="no_missing_petal_length_check_r", asset="iris_r"),
+        dg.AssetCheckSpec(name="no_missing_petal_width_check_r", asset="iris_r"),
+        dg.AssetCheckSpec(name="species_name_check_r", asset="iris_r"),
         ],
     )
 def iris_r(
-    context: AssetExecutionContext,
-    pipes_subprocess_client: PipesSubprocessClient,
-) -> MaterializeResult:
+    context: dg.AssetExecutionContext,
+    pipes_subprocess_client: dg.PipesSubprocessClient,
+) -> dg.MaterializeResult:
     output_dir = context.op_config["output_dir"]
-    cmd = [shutil.which("Rscript"), file_relative_path(__file__, "./R/iris.R")]
-    return pipes_subprocess_client.run(
+    cmd = [shutil.which("Rscript"), dg.file_relative_path(__file__, "./R/iris.R")]  # Rscript looked up on PATH; script path is relative to this file
+    result = pipes_subprocess_client.run(  # keep the invocation object so custom messages can be read after the run
         command=cmd,
         context=context,
         env={
             "MY_ENV_VAR_IN_SUBPROCESS": "This is an environment variable passed from Dagster to R!",
             "OUTPUT_DIR": output_dir,
         },
-    ).get_materialize_result()
+    )
+
+    result_message = result.get_custom_messages()[0]  # NOTE(review): assumes iris.R sent at least one custom message; [0] fails on an empty sequence
+    schema = create_table_schema_from_dict(result_message.get("column_types"))  # "column_types": column name -> Python type name, built in iris.R
+
+    context.add_output_metadata(output_name = "result", metadata={  # NOTE(review): "result" is presumably the asset's default output name -- confirm
+       "dagster/row_count": dg.MetadataValue.int(result_message.get("dagster/row_count")), 
+                "preview": dg.MetadataValue.md(result_message.get("preview")),  # markdown table text produced by knitr::kable in iris.R
+                "dagster/column_schema": schema,
+    })
+
+    return result.get_results()  # NOTE(review): annotation says dg.MaterializeResult -- confirm what get_results() returns
+
 
 
-@asset(deps=[iris_r])
+@dg.asset(deps=[iris_r])
 def iris_py(context):
     # TODO replace hardcoded output_dir with resource key
     iris = pd.read_csv(f"data/iris.csv")

diff --git a/dagster_and_r/definitions.py b/dagster_and_r/definitions.py
@@ -0,0 +1,30 @@
+from dagster import (
+    Definitions,
+    PipesSubprocessClient,
+    )
+from . jobs import docker_container_op_r
+from . asset_checks import (
+    # no_missing_sepal_length_check_r,
+    no_missing_sepal_length_check_py,
+    )
+
+# python_assets = load_assets_from_modules([assets])
+from . assets import (
+    hello_world_r,
+    iris_r,
+    iris_py,
+    )
+
+defs = Definitions(  # bundles this project's assets, asset checks, jobs and resources
+    assets=[
+        hello_world_r,
+        iris_r,
+        iris_py,
+        ],
+    asset_checks=[
+        # no_missing_sepal_length_check_r,
+        no_missing_sepal_length_check_py,
+        ],
+    jobs=[docker_container_op_r],
+    resources={"pipes_subprocess_client": PipesSubprocessClient()},  # key matches the pipes_subprocess_client parameter of the R assets
+)
diff --git a/dagster_and_r/hello.py b/dagster_and_r/hello.py
@@ -0,0 +1,6 @@
+def main():  # prints a fixed greeting to stdout
+    print("Hello from dagster-and-r!")
+
+
+if __name__ == "__main__":  # run main() only when executed as a script
+    main()
diff --git a/hello.py b/hello.py
@@ -0,0 +1,6 @@
+def main():  # prints a fixed greeting to stdout
+    print("Hello from dagster-and-r!")
+
+
+if __name__ == "__main__":  # run main() only when executed as a script
+    main()