From e01755cdcf2757be08d237b2cd4970301d6f927c Mon Sep 17 00:00:00 2001
From: Eivind Jahren
Date: Fri, 16 Aug 2024 15:52:59 +0200
Subject: [PATCH] Revert "Move csv_export2 into ert"

This reverts commit 00c73cbef1e4eed24d18cc08983615906945a935.
---
 pyproject.toml                                 |   3 +
 src/semeio/workflows/csv_export2/__init__.py   |   0
 .../workflows/csv_export2/csv_export2.py       | 154 +++++++++++
 tests/test_console_scripts.py                  |   1 +
 tests/workflows/csv_export2/__init__.py        |   0
 tests/workflows/csv_export2/conftest.py        |  69 +++++
 .../test_ert_integration_errors/csv_data.csv   |  17 ++
 .../workflows/csv_export2/test_integration.py  | 255 ++++++++++++++++++
 8 files changed, 499 insertions(+)
 create mode 100644 src/semeio/workflows/csv_export2/__init__.py
 create mode 100644 src/semeio/workflows/csv_export2/csv_export2.py
 create mode 100644 tests/workflows/csv_export2/__init__.py
 create mode 100644 tests/workflows/csv_export2/conftest.py
 create mode 100644 tests/workflows/csv_export2/snapshots/test_integration/test_ert_integration_errors/csv_data.csv
 create mode 100644 tests/workflows/csv_export2/test_integration.py

diff --git a/pyproject.toml b/pyproject.toml
index 7a2765314..7bca2fd7f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,6 +39,7 @@ dependencies = [
     "scipy",
     "xlrd",
     "pyscal>=0.4.0",
+    "fmu-ensemble>1.6.5",
     "segyio",
     "xtgeo>=2.15",
 ]
@@ -48,10 +49,12 @@ repository = "https://github.com/equinor/semeio"

 [project.entry-points."ert"]
 semeio_forward_models = "semeio.hook_implementations.forward_models"
+CsvExport2Job = "semeio.workflows.csv_export2.csv_export2"
 AhmAnalysisJob = "semeio.workflows.ahm_analysis.ahmanalysis"
 LocalisationConfigJob = "semeio.workflows.localisation.local_config_script"

 [project.entry-points."console_scripts"]
+csv_export2 = "semeio.workflows.csv_export2.csv_export2:cli"
 overburden_timeshift = "semeio.forward_models.scripts.overburden_timeshift:main_entry_point"
 design2params = "semeio.forward_models.scripts.design2params:main_entry_point"
 gendata_rft = "semeio.forward_models.scripts.gendata_rft:main_entry_point"
diff --git a/src/semeio/workflows/csv_export2/__init__.py b/src/semeio/workflows/csv_export2/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/semeio/workflows/csv_export2/csv_export2.py b/src/semeio/workflows/csv_export2/csv_export2.py
new file mode 100644
index 000000000..b916184e2
--- /dev/null
+++ b/src/semeio/workflows/csv_export2/csv_export2.py
@@ -0,0 +1,154 @@
import argparse
import sys

import pandas as pd
from ert import ErtScript, plugin
from fmu import ensemble

DESCRIPTION = """
CSV_EXPORT2 will export selected Eclipse summary vectors to a CSV file.
The vector selection is independent of the ``SUMMARY`` keywords in the
ERT config file.

The CSV file will look like:

======== ==== =========== ==== ======
ENSEMBLE REAL DATE        FOPR FOPT
======== ==== =========== ==== ======
iter-0   0    2020-01-01  800  0
iter-0   0    2020-02-01  1000 365000
iter-0   1    2020-01-01  700  0
iter-0   1    2020-02-01  1100 401500
======== ==== =========== ==== ======

A time frequency must be chosen. If ``raw``, the original timesteps from
Eclipse are used, and they will differ per realization. If ``daily``,
``weekly``, ``monthly`` or ``yearly`` is chosen, data is given at exactly
those dates for all realizations. Rate data (e.g. FOPR) is valid for the
given dates, but cannot be summed to cumulative data after time
interpolation. Cumulative columns (e.g. FOPT) are time-interpolated
linearly. See the fmu-ensemble documentation for more details on rate
handling.

Columns are selected by a list of strings, where the wildcard characters
``?`` (matches exactly one character) and ``*`` (matches zero or more
characters) can be used to select multiple columns.

A column count above 1000 increases the probability of problems downstream,
depending on which applications consume the file. The column count depends
on the combination of wildcards used in this workflow and the actual vectors
requested in the Eclipse DATA file. A wildcard like ``W*`` can in certain
cases (e.g. Eclipse simulations with 100+ wells) produce thousands of
vectors, and should then be replaced by something more explicit like
``WOPT* WGPT* WWPT*``.
"""  # noqa

EXAMPLES = """
Example
-------

Add a file named e.g. ``ert/bin/workflows/QC_CSVEXPORT2`` with the contents::

  MAKE_DIRECTORY <CASEDIR>/share/summary/
  EXPORT_RUNPATH * | *
  CSV_EXPORT2 <RUNPATH_FILE> <CASEDIR>/share/summary/<CASE>.csv monthly F* W* TCPU TIMESTEP

(where ``<CASEDIR>`` typically points to ``/scratch/..``). Adjust all three
lines to your needs.

``EXPORT_RUNPATH`` in the workflow file is added to ensure that all
realizations and all iterations are included in the RUNPATH file. If you
have rerun only a subset of your ensemble, the RUNPATH file will contain
only that subset unless this statement is included.

Add to your ERT config to have the workflow automatically executed on
successful runs::

  LOAD_WORKFLOW ../bin/workflows/QC_CSVEXPORT2
  HOOK_WORKFLOW QC_CSVEXPORT2 POST_SIMULATION
"""  # noqa


def csv_exporter(runpathfile, time_index, outputfile, column_keys=None):
    """Export CSV data (summary and parameters) from an EnsembleSet.

    The EnsembleSet is described by a runpathfile, which must exist
    and point to realizations."""
    ensemble_set = ensemble.EnsembleSet(
        name="ERT EnsembleSet for CSV_EXPORT2", runpathfile=runpathfile
    )
    try:
        summary = ensemble_set.load_smry(
            time_index=time_index, column_keys=column_keys
        )
        parameters = ensemble_set.parameters
    except KeyError as exc:
        raise UserWarning("No data found") from exc

    if not parameters.empty:
        pd.merge(summary, parameters).to_csv(outputfile, index=False)
    else:
        summary.to_csv(outputfile, index=False)


class CsvExport2Job(ErtScript):
    def run(self, *args, **_):
        main(args)


def main(args):
    parser = csv_export_parser()
    args = parser.parse_args(args)

    csv_exporter(
        runpathfile=args.runpathfile,
        time_index=args.time_index,
        outputfile=args.outputfile,
        column_keys=args.column_keys,
    )

    print(f"{args.time_index} csv-export written to {args.outputfile}")


def csv_export_parser():
    """Set up the argument parser"""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "runpathfile",
        type=str,
        help=(
            "Path to ERT RUNPATH-file, "
            "usually the ERT magic variable <RUNPATH_FILE> can be used"
        ),
    )
    parser.add_argument(
        "outputfile",
        type=str,
        help="Path to CSV file to be written. The directory pointed to must exist.",
    )
    parser.add_argument(
        "time_index",
        type=str,
        default="monthly",
        help=(
            "Time interval specifier for the output. "
            "This argument is passed on to fmu-ensemble, "
            "supported specifiers are 'raw', 'daily', 'weekly', 'monthly' and 'yearly'"
        ),
    )
    parser.add_argument(
        "column_keys", nargs="+", default=None, help="List of summary vector wildcards"
    )
    return parser


@plugin(name="semeio")
def legacy_ertscript_workflow(config):
    workflow = config.add_workflow(CsvExport2Job, "CSV_EXPORT2")
    workflow.parser = csv_export_parser
    workflow.description = DESCRIPTION
    workflow.examples = EXAMPLES
    workflow.category = "export"


def cli():
    main(sys.argv[1:])
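For orientation, the exporter can also be driven directly from Python. The
following minimal sketch assumes the runpathfile format used by the tests
below (``<realization> <runpath> <eclbase> <iteration>`` per line); the
paths and vector names are hypothetical::

  # Hedged sketch: hypothetical paths; the runpathfile format is inferred
  # from tests/workflows/csv_export2/conftest.py below.
  from semeio.workflows.csv_export2 import csv_export2

  with open("runpathfile", "w", encoding="utf-8") as file_h:
      file_h.write("000 /scratch/case/realization-0/iter-0 NORNE_0 000\n")
      file_h.write("001 /scratch/case/realization-1/iter-0 NORNE_1 000\n")

  csv_export2.csv_exporter(
      runpathfile="runpathfile",
      time_index="monthly",
      outputfile="unsmry--monthly.csv",
      column_keys=["FOPT", "FOPR"],
  )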
diff --git a/tests/test_console_scripts.py b/tests/test_console_scripts.py
index e8ef66e25..e527c8279 100644
--- a/tests/test_console_scripts.py
+++ b/tests/test_console_scripts.py
@@ -5,6 +5,7 @@
 @pytest.mark.parametrize(
     "entry_point",
     [
+        "csv_export2",
         "overburden_timeshift",
         "design2params",
         "gendata_rft",
diff --git a/tests/workflows/csv_export2/__init__.py b/tests/workflows/csv_export2/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/workflows/csv_export2/conftest.py b/tests/workflows/csv_export2/conftest.py
new file mode 100644
index 000000000..84046c0cc
--- /dev/null
+++ b/tests/workflows/csv_export2/conftest.py
@@ -0,0 +1,69 @@
import os

import pytest

NORNE_DIR = os.path.join(os.path.dirname(__file__), "../../test_data/norne")


def mock_norne_data(reals, iters, parameters=True):
    # pylint: disable=consider-using-f-string
    """From a single UNSMRY file, produce arbitrarily sized ensembles.

    Summary data will be identical across realizations, but parameters.txt
    is made unique per runpath.

    Writes a realization-*/iter-* file structure in cwd.

    Args:
        reals (list): integers with realization indices wanted
        iters (list): integers with iter indices wanted
        parameters (bool): whether to write parameters.txt in each runpath
    """
    for real in reals:
        for iteration in iters:
            runpath = os.path.join(f"realization-{real}", f"iter-{iteration}")

            os.makedirs(runpath, exist_ok=True)

            os.symlink(
                os.path.join(NORNE_DIR, "NORNE_ATW2013.UNSMRY"),
                os.path.join(runpath, f"NORNE_{real}.UNSMRY"),
            )
            os.symlink(
                os.path.join(NORNE_DIR, "NORNE_ATW2013.SMSPEC"),
                os.path.join(runpath, f"NORNE_{real}.SMSPEC"),
            )
            if parameters:
                with open(
                    os.path.join(runpath, "parameters.txt"), "w", encoding="utf-8"
                ) as p_fileh:
                    p_fileh.write(f"FOO 1{real}{iteration}")
            # Ensure fmu-ensemble does not complain on missing STATUS
            with open(os.path.join(runpath, "STATUS"), "w", encoding="utf-8") as file_h:
                file_h.write("a:b\na: 09:00:00 .... 09:00:01")

    with open("runpathfile", "w", encoding="utf-8") as file_h:
        for iteration in iters:
            for real in reals:
                runpath = os.path.join(f"realization-{real}", f"iter-{iteration}")
                file_h.write(f"{real:03d} {runpath} NORNE_{real} {iteration:03d}\n")


@pytest.fixture()
def norne_mocked_ensembleset(setup_tmpdir):
    # pylint: disable=unused-argument
    mock_norne_data(reals=[0, 1], iters=[0, 1], parameters=True)


@pytest.fixture()
def norne_mocked_ensembleset_noparams(setup_tmpdir):
    # pylint: disable=unused-argument
    mock_norne_data(reals=[0, 1], iters=[0, 1], parameters=False)


@pytest.fixture(name="setup_tmpdir")
def fixture_setup_tmpdir(tmpdir):
    cwd = os.getcwd()
    tmpdir.chdir()
    yield
    os.chdir(cwd)
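To make the mocked layout concrete: with ``reals=[0, 1]`` and
``iters=[0, 1]``, the loops above (iteration outermost when writing the
runpathfile) produce exactly these four lines::

  000 realization-0/iter-0 NORNE_0 000
  001 realization-1/iter-0 NORNE_1 000
  000 realization-0/iter-1 NORNE_0 001
  001 realization-1/iter-1 NORNE_1 001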
" + "This argument is passed on to fmu-ensemble, " + "supported specifiers are 'raw', 'daily', 'weekly', 'monthly' and 'yearly'" + ), + ) + parser.add_argument( + "column_keys", nargs="+", default=None, help="List of summary vector wildcards" + ) + return parser + + +@plugin(name="semeio") +def legacy_ertscript_workflow(config): + workflow = config.add_workflow(CsvExport2Job, "CSV_EXPORT2") + workflow.parser = csv_export_parser + workflow.description = DESCRIPTION + workflow.examples = EXAMPLES + workflow.category = "export" + + +def cli(): + main(sys.argv[1:]) diff --git a/tests/test_console_scripts.py b/tests/test_console_scripts.py index e8ef66e25..e527c8279 100644 --- a/tests/test_console_scripts.py +++ b/tests/test_console_scripts.py @@ -5,6 +5,7 @@ @pytest.mark.parametrize( "entry_point", [ + "csv_export2", "overburden_timeshift", "design2params", "gendata_rft", diff --git a/tests/workflows/csv_export2/__init__.py b/tests/workflows/csv_export2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/workflows/csv_export2/conftest.py b/tests/workflows/csv_export2/conftest.py new file mode 100644 index 000000000..84046c0cc --- /dev/null +++ b/tests/workflows/csv_export2/conftest.py @@ -0,0 +1,69 @@ +import os + +import pytest + +NORNE_DIR = os.path.join(os.path.dirname(__file__), "../../test_data/norne") + + +def mock_norne_data(reals, iters, parameters=True): + # pylint: disable=consider-using-f-string + """From a single UNSMRY file, produce arbitrary sized ensembles. + + Summary data will be equivalent over realizations, but the + parameters.txt is made unique. + + Writes realization-*/iter-* file structure in cwd. + + Args: + reals (list): integers with realization indices wanted + iters (list): integers with iter indices wanted + parameters (bool): Whether to write parameters.txt in each runpath + """ + for real in reals: + for iteration in iters: + runpath = os.path.join(f"realization-{real}", f"iter-{iteration}") + + os.makedirs(runpath, exist_ok=True) + + os.symlink( + os.path.join(NORNE_DIR, "NORNE_ATW2013.UNSMRY"), + os.path.join(runpath, f"NORNE_{real}.UNSMRY"), + ) + os.symlink( + os.path.join(NORNE_DIR, "NORNE_ATW2013.SMSPEC"), + os.path.join(runpath, f"NORNE_{real}.SMSPEC"), + ) + if parameters: + with open( + os.path.join(runpath, "parameters.txt"), "w", encoding="utf-8" + ) as p_fileh: + p_fileh.write(f"FOO 1{real}{iteration}") + # Ensure fmu-ensemble does not complain on missing STATUS + with open(os.path.join(runpath, "STATUS"), "w", encoding="utf-8") as file_h: + file_h.write("a:b\na: 09:00:00 .... 
diff --git a/tests/workflows/csv_export2/test_integration.py b/tests/workflows/csv_export2/test_integration.py
new file mode 100644
index 000000000..811592bd7
--- /dev/null
+++ b/tests/workflows/csv_export2/test_integration.py
@@ -0,0 +1,255 @@
import os
import shutil
import subprocess
from pathlib import Path

import pandas as pd
import pytest
import rstcheck_core.checker

from semeio.workflows.csv_export2 import csv_export2

NORNE_VECS = ["FGPT", "FLPT", "FOPT", "FVPT", "FWPT"]


@pytest.mark.usefixtures("norne_mocked_ensembleset")
def test_that_a_not_found_realization_is_skipped():
    shutil.rmtree("realization-1/iter-1")
    csv_export2.csv_exporter(
        runpathfile="runpathfile",
        time_index="yearly",
        outputfile="unsmry--yearly.csv",
        column_keys=["F?PT"],
    )
    verify_exported_file(
        "unsmry--yearly.csv",
        ["ENSEMBLE", "REAL", "DATE"] + NORNE_VECS + ["FOO"],
        {
            ("iter-0", 0),
            ("iter-0", 1),
            ("iter-1", 0),
        },
    )


@pytest.mark.usefixtures("norne_mocked_ensembleset")
def test_that_a_failed_realization_is_skipped():
    os.remove("realization-0/iter-1/NORNE_0.SMSPEC")
    csv_export2.csv_exporter(
        runpathfile="runpathfile",
        time_index="yearly",
        outputfile="unsmry--yearly.csv",
        column_keys=["F?PT"],
    )
    verify_exported_file(
        "unsmry--yearly.csv",
        ["ENSEMBLE", "REAL", "DATE"] + NORNE_VECS + ["FOO"],
        {
            ("iter-0", 0),
            ("iter-0", 1),
            ("iter-1", 1),
        },
    )


@pytest.mark.usefixtures("norne_mocked_ensembleset")
def test_that_a_missing_realization_index_is_ok():
    rp_lines = Path("runpathfile").read_text(encoding="utf-8").splitlines()
    Path("sliced_runpathfile").write_text(
        rp_lines[1] + "\n" + rp_lines[3], encoding="utf-8"
    )
    csv_export2.csv_exporter(
        runpathfile="sliced_runpathfile",
        time_index="yearly",
        outputfile="unsmry--yearly.csv",
        column_keys=["F?PT"],
    )
    verify_exported_file(
        "unsmry--yearly.csv",
        ["ENSEMBLE", "REAL", "DATE"] + NORNE_VECS + ["FOO"],
        {
            ("iter-0", 1),
            ("iter-1", 1),
        },
    )


@pytest.mark.usefixtures("norne_mocked_ensembleset")
def test_that_iterations_in_runpathfile_cannot_be_defaulted():
    shutil.move("realization-0/iter-0", "real0")
    shutil.move("realization-1/iter-0", "real1")
    shutil.rmtree("realization-0")
    shutil.rmtree("realization-1")
    Path("runpathfile").write_text(
        "000 real0 NORNE_0\n001 real1 NORNE_1\n", encoding="utf-8"
    )

    with pytest.raises(UserWarning):
        csv_export2.csv_exporter(
            runpathfile="runpathfile",
            time_index="yearly",
            outputfile="unsmry--yearly.csv",
            column_keys=["F?PT"],
        )
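For contrast with the four-column runpathfile written by the conftest, the
defaulted three-column line rejected above next to its accepted form::

  000 real0 NORNE_0        (three columns: iteration defaulted, rejected)
  000 real0 NORNE_0 000    (four columns: iteration explicit, accepted)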
def test_empty_file_yields_user_warning():
    with open("empty_file", "a", encoding="utf-8") as empty_file, pytest.raises(
        UserWarning, match="No data found"
    ):
        csv_export2.csv_exporter(
            runpathfile=empty_file.name,
            time_index="raw",
            outputfile="unsmry--yearly.csv",
            column_keys=["*"],
        )


@pytest.mark.parametrize("input_rst", [csv_export2.DESCRIPTION, csv_export2.EXAMPLES])
def test_valid_rst(input_rst):
    """
    Check that the documentation passed through the plugin system is
    valid rst
    """
    assert not list(rstcheck_core.checker.check_source(input_rst))


@pytest.mark.usefixtures("norne_mocked_ensembleset")
def test_norne_ensemble():
    csv_export2.csv_exporter(
        runpathfile="runpathfile",
        time_index="yearly",
        outputfile="unsmry--yearly.csv",
        column_keys=["F?PT"],
    )
    verify_exported_file(
        "unsmry--yearly.csv",
        ["ENSEMBLE", "REAL", "DATE"] + NORNE_VECS + ["FOO"],
        {
            ("iter-0", 0),
            ("iter-0", 1),
            ("iter-1", 0),
            ("iter-1", 1),
        },
    )


@pytest.mark.usefixtures("norne_mocked_ensembleset_noparams")
def test_norne_ensemble_noparams():
    csv_export2.csv_exporter(
        runpathfile="runpathfile",
        time_index="yearly",
        outputfile="unsmry--yearly.csv",
        column_keys=["FOPT"],
    )
    verify_exported_file(
        "unsmry--yearly.csv",
        ["ENSEMBLE", "REAL", "DATE", "FOPT"],
        {
            ("iter-0", 0),
            ("iter-0", 1),
            ("iter-1", 0),
            ("iter-1", 1),
        },
    )


def verify_exported_file(exported_file_name, result_header, result_iter_rel):
    """Verify an exported CSV file with respect to:

    * Exactly the set of requested headers is found
    * The realizations and iterations that exist must equal the
      given set of tuples.

    Args:
        exported_file_name (str): path to CSV file.
        result_header (list of str): the strings required in the header.
        result_iter_rel (set): set of 2-tuples: {(iterstring, realidx)}
    """
    dframe = pd.read_csv(exported_file_name)
    assert set(dframe.columns) == set(result_header)
    assert (
        set(dframe[["ENSEMBLE", "REAL"]].itertuples(index=False, name=None))
        == result_iter_rel
    )
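As a quick illustration of the set comparison in ``verify_exported_file``,
with toy data (assumed values)::

  import pandas as pd

  dframe = pd.DataFrame({"ENSEMBLE": ["iter-0", "iter-0"], "REAL": [0, 1]})
  # index=False, name=None yields plain tuples, one per row
  pairs = set(dframe[["ENSEMBLE", "REAL"]].itertuples(index=False, name=None))
  assert pairs == {("iter-0", 0), ("iter-0", 1)}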
@pytest.mark.ert_integration
@pytest.mark.usefixtures("norne_mocked_ensembleset")
def test_ert_integration():
    """Mock an ERT config and test the workflow"""
    with open("FOO.DATA", "w", encoding="utf-8") as file_h:
        file_h.write("--Empty")

    with open("wf_csvexport", "w", encoding="utf-8") as file_h:
        file_h.write(
            # This workflow is representing the example in csv_export2.py:
            "MAKE_DIRECTORY csv_output\n"
            "EXPORT_RUNPATH * | *\n"  # (not really relevant in mocked case)
            "CSV_EXPORT2 runpathfile csv_output/data.csv monthly FOPT\n"
            # The example in the documentation uses <RUNPATH_FILE>, which is
            # linked to the RUNPATH keyword that we don't use in this
            # test (mocking data gets more complex if that is to be used)
        )

    ert_config = [
        "ECLBASE FOO.DATA",
        "QUEUE_SYSTEM LOCAL",
        "NUM_REALIZATIONS 2",
        "LOAD_WORKFLOW wf_csvexport",
        "HOOK_WORKFLOW wf_csvexport PRE_SIMULATION",
    ]

    ert_config_fname = "test.ert"
    with open(ert_config_fname, "w", encoding="utf-8") as file_h:
        file_h.write("\n".join(ert_config))

    subprocess.run(["ert", "test_run", ert_config_fname], check=True)

    assert pd.read_csv("csv_output/data.csv").shape == (16, 5)


@pytest.mark.ert_integration
@pytest.mark.usefixtures("norne_mocked_ensembleset")
def test_ert_integration_errors(snapshot):
    """Test CSV_EXPORT2 when the runpathfile points to non-existing
    realizations.

    This test proves that CSV_EXPORT2 happily skips non-existing
    realizations, but emits a warning that there is no STATUS file.
    """
    with open("FOO.DATA", "w", encoding="utf-8") as file_h:
        file_h.write("--Empty")

    # Append a non-existing realization to the runpathfile:
    with open("runpathfile", "a", encoding="utf-8") as file_h:
        file_h.write("002 realization-2/iter-0 NORNE_1 000")

    with open("wf_csvexport", "w", encoding="utf-8") as file_h:
        file_h.write("CSV_EXPORT2 runpathfile data.csv monthly FOPT\n")

    ert_config = [
        "ECLBASE FOO.DATA",
        "QUEUE_SYSTEM LOCAL",
        "NUM_REALIZATIONS 2",
        "LOAD_WORKFLOW wf_csvexport",
        "HOOK_WORKFLOW wf_csvexport PRE_SIMULATION",
    ]

    ert_config_fname = "test.ert"
    with open(ert_config_fname, "w", encoding="utf-8") as file_h:
        file_h.write("\n".join(ert_config))

    subprocess.run(["ert", "test_run", ert_config_fname], check=True)

    log_file = next(Path("logs").glob("ert-log*txt"))
    ertlog = log_file.read_text(encoding="utf-8")

    assert "No STATUS file" in ertlog
    assert "realization-2/iter-0" in ertlog

    assert os.path.exists("data.csv")
    data = pd.read_csv("data.csv")
    snapshot.assert_match(
        data.to_csv(lineterminator="\n"),
        "csv_data.csv",
    )
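The restored ``csv_export2`` console script offers the same export outside
ERT. A hedged shell example (runpathfile and output path hypothetical),
following the argument order in ``csv_export_parser`` (runpathfile,
outputfile, time_index, then one or more column wildcards), with the
wildcard quoted to bypass shell globbing::

  csv_export2 runpathfile unsmry--monthly.csv monthly FOPT 'W*'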