From 81c48fce08bd5f919ff349266adcd404ab6297e5 Mon Sep 17 00:00:00 2001
From: maxymnaumchyk <70752300+maxymnaumchyk@users.noreply.github.com>
Date: Tue, 27 Aug 2024 18:26:06 +0300
Subject: [PATCH] feat: to TensorFlow RaggedTensor (#3210)

* feat: to TensorFlow RaggedTensor

* style: pre-commit fixes

* fix the tensorflow library import

* style: pre-commit fixes

* update exception

* added some tests for different data types conversions

* change end of line formats

* add tensorflow library to the test-full-requirements

* add tensorflow library to the test-full-requirements

* change tensorflow library version

* change tensorflow library version

* change tensorflow library version

* add a new github actions test for ml libraries

* delete tensorflow from full test requirements

* update requirements-test-ml.txt

* update requirements-test-full.txt

* update requirements-test-ml.txt

* update the docstring for the main function

* add a new function from_raggedtensor

* delete import of tensorflow library

* minor changes

* fix the tests names

Co-authored-by: Ianna Osborne <ianna.osborne@cern.ch>

* Apply suggestions from code review

Co-authored-by: Jim Pivarski <jpivarski@users.noreply.github.com>

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Ianna Osborne <ianna.osborne@cern.ch>
Co-authored-by: Jim Pivarski <jpivarski@users.noreply.github.com>
---
 .github/workflows/test.yml                    |   4 +
 requirements-test-ml.txt                      |   6 +
 src/awkward/operations/__init__.py            |   2 +
 .../operations/ak_from_raggedtensor.py        |  67 ++++++++++
 src/awkward/operations/ak_to_raggedtensor.py  |  84 ++++++++++++
 ..._3210_to_raggedtensor_from_raggedtensor.py | 122 ++++++++++++++++++
 6 files changed, 285 insertions(+)
 create mode 100644 requirements-test-ml.txt
 create mode 100644 src/awkward/operations/ak_from_raggedtensor.py
 create mode 100644 src/awkward/operations/ak_to_raggedtensor.py
 create mode 100644 tests/test_3210_to_raggedtensor_from_raggedtensor.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index ea52274645..5ef43543af 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -65,6 +65,10 @@ jobs:
             python-architecture: x64
             runs-on: ubuntu-latest
             dependencies-kind: pypy
+          - python-version: '3.11'
+            python-architecture: x64
+            runs-on: ubuntu-latest
+            dependencies-kind: ml
 
     runs-on: ${{ matrix.runs-on }}
 
diff --git a/requirements-test-ml.txt b/requirements-test-ml.txt
new file mode 100644
index 0000000000..d715854439
--- /dev/null
+++ b/requirements-test-ml.txt
@@ -0,0 +1,6 @@
+fsspec>=2022.11.0;sys_platform != "win32"
+pytest>=6
+pytest-cov
+pytest-xdist
+tensorflow >= 2.12
+torch >= 2.4.0
diff --git a/src/awkward/operations/__init__.py b/src/awkward/operations/__init__.py
index d0cee81508..6d4a84c565 100644
--- a/src/awkward/operations/__init__.py
+++ b/src/awkward/operations/__init__.py
@@ -44,6 +44,7 @@
 from awkward.operations.ak_from_json import *
 from awkward.operations.ak_from_numpy import *
 from awkward.operations.ak_from_parquet import *
+from awkward.operations.ak_from_raggedtensor import *
 from awkward.operations.ak_from_rdataframe import *
 from awkward.operations.ak_from_regular import *
 from awkward.operations.ak_full_like import *
@@ -97,6 +98,7 @@
 from awkward.operations.ak_to_parquet import *
 from awkward.operations.ak_to_parquet_dataset import *
 from awkward.operations.ak_to_parquet_row_groups import *
+from awkward.operations.ak_to_raggedtensor import *
 from awkward.operations.ak_to_rdataframe import *
 from awkward.operations.ak_to_regular import *
 from awkward.operations.ak_transform import *
diff --git a/src/awkward/operations/ak_from_raggedtensor.py b/src/awkward/operations/ak_from_raggedtensor.py
new file mode 100644
index 0000000000..ba20382caf
--- /dev/null
+++ b/src/awkward/operations/ak_from_raggedtensor.py
@@ -0,0 +1,67 @@
+# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE
+
+from __future__ import annotations
+
+import awkward as ak
+from awkward._dispatch import high_level_function
+
+__all__ = ("from_raggedtensor",)
+
+
+@high_level_function()
+def from_raggedtensor(array):
+    """
+    Args:
+        array (`tensorflow.RaggedTensor`): RaggedTensor to convert into an
+            Awkward Array.
+
+    Converts a TensorFlow RaggedTensor into an Awkward Array.
+
+    If `array` is anything other than a RaggedTensor, the function raises a TypeError.
+    """
+
+    # Dispatch
+    yield (array,)
+
+    # Implementation
+    return _impl(array)
+
+
+def _impl(array):
+    """Build an Awkward Array from the flat values and row splits of `array`.
+
+    Raises a TypeError if reading `array.flat_values.numpy()` fails with an
+    AttributeError, which is how a non-RaggedTensor argument is detected.
+    """
+    try:
+        flat_values = array.flat_values.numpy()
+    except AttributeError as err:
+        raise TypeError(
+            """only RaggedTensor can be converted to awkward array"""
+        ) from err
+    # the flat values are the innermost (leaf) content of the result
+    content = ak.contents.NumpyArray(flat_values)
+
+    # one Index64 of offsets per ragged dimension, in `nested_row_splits` order
+    offsets_arr = [
+        ak.index.Index64(splits.numpy()) for splits in array.nested_row_splits
+    ]
+
+    # a single ragged dimension needs only one list level
+    if len(offsets_arr) == 1:
+        return ak.Array(ak.contents.ListOffsetArray(offsets_arr[0], content))
+
+    # several ragged dimensions are nested by the recursive helper
+    return ak.Array(_recursive_call(content, offsets_arr, 0))
+
+
+def _recursive_call(content, offsets_arr, count):
+    """Wrap `content` in nested ListOffsetArrays built from `offsets_arr[count:]`.
+
+    `offsets_arr[count]` becomes the outermost list level and the last entry wraps
+    `content` directly; `count` grows by one per level so the recursion terminates.
+    """
+    if count >= len(offsets_arr) - 1:
+        return ak.contents.ListOffsetArray(offsets_arr[count], content)
+    inner = _recursive_call(content, offsets_arr, count + 1)
+    return ak.contents.ListOffsetArray(offsets_arr[count], inner)
diff --git a/src/awkward/operations/ak_to_raggedtensor.py b/src/awkward/operations/ak_to_raggedtensor.py
new file mode 100644
index 0000000000..16c15dcd47
--- /dev/null
+++ b/src/awkward/operations/ak_to_raggedtensor.py
@@ -0,0 +1,84 @@
+# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE
+
+from __future__ import annotations
+
+import awkward as ak
+from awkward._dispatch import high_level_function
+
+__all__ = ("to_raggedtensor",)
+
+
+@high_level_function()
+def to_raggedtensor(array):
+    """
+    Args:
+        array: Array-like data. May be a high level #ak.Array, or low-level
+            #ak.contents.ListOffsetArray, #ak.contents.ListArray,
+            #ak.contents.RegularArray, #ak.contents.NumpyArray
+
+    Converts `array` (only ListOffsetArray, ListArray, RegularArray and NumpyArray
+    data types supported) into a TensorFlow RaggedTensor, if possible.
+
+    Raises a TypeError for any other data types (RecordArray for example).
+    """
+
+    # Dispatch
+    yield (array,)
+
+    # Implementation
+    return _impl(array)
+
+
+def _impl(array):
+    """Convert `array` to a layout and build a `tf.RaggedTensor` from it."""
+    try:
+        import tensorflow as tf
+    except ImportError as err:
+        raise ImportError(
+            """to use ak.to_raggedtensor, you must install the 'tensorflow' package with:
+
+        pip install tensorflow
+or
+        conda install tensorflow"""
+        ) from err
+
+    # high-level arrays and plain Python lists become low-level layouts here
+    layout = ak.to_layout(array, allow_record=False)
+
+    if not isinstance(layout, ak.contents.numpyarray.NumpyArray):
+        # list-type layouts: one row_splits array per nesting level
+        values, row_splits = _recursive_call(layout, ())
+        return tf.RaggedTensor.from_nested_row_splits(values, row_splits)
+
+    # a bare NumpyArray is wrapped as a single row spanning all of its values
+    whole_row = [0, len(layout)]
+    return tf.RaggedTensor.from_row_splits(values=layout.data, row_splits=whole_row)
+
+
+def _recursive_call(layout, offsets_arr):
+    """Collect the offsets of every list level of `layout`, outermost first.
+
+    Returns `(data, offsets_arr)`: the `data` of the NumpyArray found at the
+    bottom of the nesting and a tuple with one offsets array per list level.
+    Raises a TypeError for any layout type other than ListArray, RegularArray,
+    ListOffsetArray or NumpyArray.
+    """
+    contents = ak.contents
+    # a NumpyArray ends the descent: hand back its data and what was gathered
+    if isinstance(layout, contents.numpyarray.NumpyArray):
+        return layout.data, offsets_arr
+
+    # ListArray and RegularArray are rewritten as ListOffsetArray
+    if isinstance(
+        layout, (contents.listarray.ListArray, contents.regulararray.RegularArray)
+    ):
+        layout = layout.to_ListOffsetArray64()
+    elif not isinstance(layout, contents.listoffsetarray.ListOffsetArray):
+        raise TypeError(
+            "Only arrays containing variable-length lists (var *) or"
+            " regular-length lists (# *) of numbers can be converted into a TensorFlow RaggedTensor"
+        )
+
+    # descend with this level's offsets appended to the accumulated tuple
+    gathered = offsets_arr + (layout.offsets.data,)
+    return _recursive_call(layout.content, gathered)
diff --git a/tests/test_3210_to_raggedtensor_from_raggedtensor.py b/tests/test_3210_to_raggedtensor_from_raggedtensor.py
new file mode 100644
index 0000000000..d250910c09
--- /dev/null
+++ b/tests/test_3210_to_raggedtensor_from_raggedtensor.py
@@ -0,0 +1,122 @@
+# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+import awkward as ak
+
+to_raggedtensor = ak.operations.to_raggedtensor
+from_raggedtensor = ak.operations.from_raggedtensor
+# skip every test in this module when TensorFlow cannot be imported
+tf = pytest.importorskip("tensorflow")
+# nine flat values and the starts/stops used to build the ListArray cases
+content = ak.contents.NumpyArray(
+    np.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9])
+)
+starts1 = ak.index.Index64(np.array([0, 3, 3, 5, 6]))
+stops1 = ak.index.Index64(np.array([3, 3, 5, 6, 9]))
+starts2 = ak.index.Index64(np.array([0, 3]))
+stops2 = ak.index.Index64(np.array([3, 5]))
+# the integers 0..29 with offsets that nest them as 2 lists of 3 lists of 5
+array = np.arange(2 * 3 * 5).reshape(2, 3, 5)
+content2 = ak.contents.NumpyArray(array.reshape(-1))
+inneroffsets = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30]))
+outeroffsets = ak.index.Index64(np.array([0, 3, 6]))
+
+
+def test_convert_to_raggedtensor():
+    """Check to_raggedtensor on each supported kind of input.
+
+    Covers ListArray, high-level Array, NumpyArray, RegularArray, nested
+    ListOffsetArray and ListArray layouts, and a plain Python list.
+    """
+    ragged_rows = [
+        [1.1, 2.2, 3.3],
+        [],
+        [4.4, 5.5],
+        [6.6],
+        [7.7, 8.8, 9.9],
+    ]
+    digits = [3, 1, 4, 1, 9, 2, 6]
+
+    # ListArray -> RaggedTensor
+    list_layout = ak.contents.ListArray(starts1, stops1, content)
+    assert to_raggedtensor(list_layout).to_list() == ragged_rows
+
+    # awkward.highlevel.Array around the same layout -> RaggedTensor
+    highlevel = ak.Array(list_layout)
+    assert to_raggedtensor(highlevel).to_list() == ragged_rows
+
+    # NumpyArray -> RaggedTensor with one row holding every value
+    assert to_raggedtensor(content).to_list() == [
+        [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9]
+    ]
+
+    # RegularArray -> RaggedTensor
+    regular_layout = ak.contents.RegularArray(content, size=2)
+    assert to_raggedtensor(regular_layout).to_list() == [
+        [1.1, 2.2],
+        [3.3, 4.4],
+        [5.5, 6.6],
+        [7.7, 8.8],
+    ]
+
+    # a one-dimensional high-level array
+    flat_array = ak.Array(digits)
+    assert to_raggedtensor(flat_array).to_list() == [digits]
+
+    # a plain Python list
+    assert to_raggedtensor(digits).to_list() == [digits]
+
+    # an array with two levels of ragged lists
+    nested = [[[1.1, 2.2], [3.3]], [], [[4.4, 5.5]]]
+    nested_array = ak.Array(nested)
+    assert to_raggedtensor(nested_array).to_list() == nested
+
+    # a ListOffsetArray inside a ListOffsetArray
+    offsets_in_offsets = ak.contents.ListOffsetArray(
+        outeroffsets, ak.contents.ListOffsetArray(inneroffsets, content2)
+    )
+    assert to_raggedtensor(offsets_in_offsets).to_list() == [
+        [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14]],
+        [[15, 16, 17, 18, 19], [20, 21, 22, 23, 24], [25, 26, 27, 28, 29]],
+    ]
+
+    # a ListArray inside a ListArray
+    lists_in_lists = ak.contents.ListArray(
+        starts2, stops2, ak.contents.ListArray(starts1, stops1, content)
+    )
+    assert to_raggedtensor(lists_in_lists).to_list() == [
+        [[1.1, 2.2, 3.3], [], [4.4, 5.5]],
+        [[6.6], [7.7, 8.8, 9.9]],
+    ]
+
+
+np_array1 = np.array([1.1, 2.2, 3.3, 4.4, 5.5], dtype=np.float32)
+# offsets and content of the layout expected in test_convert_from_raggedtensor
+offsets1 = ak.index.Index64(np.array([0, 2, 3, 3, 5]))
+content1 = ak.contents.NumpyArray(np_array1)
+
+
+def test_convert_from_raggedtensor():
+    """Check from_raggedtensor on one and on two ragged dimensions."""
+    row_splits = [0, 2, 3, 3, 5]
+    single = tf.RaggedTensor.from_row_splits(
+        values=[1.1, 2.2, 3.3, 4.4, 5.5], row_splits=row_splits
+    )
+    # compare the converted layout's buffers and its list form with the expectation
+    expected = ak.contents.ListOffsetArray(offsets1, content1)
+    converted = from_raggedtensor(single)
+    layout = ak.to_layout(converted, allow_record=False)
+    assert (layout.content.data == np_array1).all()
+    assert (layout.offsets.data == row_splits).all()
+    assert converted.to_list() == expected.to_list()
+
+    double = tf.RaggedTensor.from_nested_row_splits(
+        flat_values=[3, 1, 4, 1, 5, 9, 2, 6],
+        nested_row_splits=([0, 3, 3, 5], [0, 4, 4, 7, 8, 8]),
+    )
+    nested = [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]]
+    assert from_raggedtensor(double).to_list() == nested