-
Notifications
You must be signed in to change notification settings - Fork 86
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: use pyarrow for string functions (#2616)
* First function is working: is_alnum. * is_alpha * is_decimal * is_lower * is_digit * is_numeric * is_printable * is_space * is_upper * is_title * is_ascii; done with string predicates * capitalize * lower * upper * upper * title * T -> T operations on bytestrings should return bytestrings. * repeat (the first that needs a broadcastable argument) * reverse (because it's easy) * replace_slice * replace_substring * Also test 'max_replacements' in replace_substring. * replace_substring_regex: done with string transforms * center * lpad and rpad * trim * trim_whitespace * ltrim * rtrim * rtrim_whitespace * ltrim_whitespace * slice * feat: add `split_whitespace` * test: add test for `split_whitespace` * test: correct test * feat: add `split_pattern` * refactor: rename `_get_action` * feat: add `ak_split_pattern_regex` * test: update tests for new features * Fixed UnmaskedArray._drop_none. * fix: adjust for numexpr 2.8.5, which hid getContext's frame_depth argument (#2617) * extract_regex. * join (almost entirely from https://gist.github.com/agoose77/28e5bb0250678e454356a85861a16368) * use dispatch correctly * fix: drop unused arg * join_element_wise * Revert "use dispatch correctly" This reverts commit 559073b. * fix: broadcast `num_repeats` * feat: add `count_substring[_pattern]` * docs: fixup docstring * feat: add `ends_with` * feat: add `starts_with` * docs: fix link * feat: add `find_substring` * docs: fix typo * feat: add `find_substring_regex` * docs: fix link * feat: add `match_like` * test: improve test * feat: add `match_substring`, `match_substring_regex` * feat: add `is_in` and `index_in` * fix: operate at leaf depth * refactor: add internal `pyarrow.compute` helper * refactor: use pyarrow import helper * refactor: add `module` and `name` arguments to `high_level_function` * fix: pass `module` to str `high_level_function` * docs: homogenize docstrings * docs: add see also * docs: include `ak.str` in toctree * chore: update pre-commit hooks (#2619) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * refactor: cleanup error handling * Rename ak_*.py modules -> akstr_*.py. --------- Co-authored-by: Angus Hollands <goosey15@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
- Loading branch information
1 parent
85ca6d5
commit 1cfea2f
Showing
57 changed files
with
4,601 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE | ||
|
||
# https://arrow.apache.org/docs/python/api/compute.html#string-predicates | ||
|
||
# string predicates | ||
from awkward.operations.str.akstr_is_alnum import * | ||
from awkward.operations.str.akstr_is_alpha import * | ||
from awkward.operations.str.akstr_is_decimal import * | ||
from awkward.operations.str.akstr_is_digit import * | ||
from awkward.operations.str.akstr_is_lower import * | ||
from awkward.operations.str.akstr_is_numeric import * | ||
from awkward.operations.str.akstr_is_printable import * | ||
from awkward.operations.str.akstr_is_space import * | ||
from awkward.operations.str.akstr_is_upper import * | ||
from awkward.operations.str.akstr_is_title import * | ||
from awkward.operations.str.akstr_is_ascii import * | ||
|
||
# string transforms | ||
from awkward.operations.str.akstr_capitalize import * | ||
from awkward.operations.str.akstr_length import * | ||
from awkward.operations.str.akstr_lower import * | ||
from awkward.operations.str.akstr_swapcase import * | ||
from awkward.operations.str.akstr_title import * | ||
from awkward.operations.str.akstr_upper import * | ||
from awkward.operations.str.akstr_repeat import * | ||
from awkward.operations.str.akstr_replace_slice import * | ||
from awkward.operations.str.akstr_reverse import * | ||
from awkward.operations.str.akstr_replace_substring import * | ||
from awkward.operations.str.akstr_replace_substring_regex import * | ||
|
||
# string padding | ||
from awkward.operations.str.akstr_center import * | ||
from awkward.operations.str.akstr_lpad import * | ||
from awkward.operations.str.akstr_rpad import * | ||
|
||
# string trimming | ||
from awkward.operations.str.akstr_ltrim import * | ||
from awkward.operations.str.akstr_ltrim_whitespace import * | ||
from awkward.operations.str.akstr_rtrim import * | ||
from awkward.operations.str.akstr_rtrim_whitespace import * | ||
from awkward.operations.str.akstr_trim import * | ||
from awkward.operations.str.akstr_trim_whitespace import * | ||
|
||
# string splitting | ||
from awkward.operations.str.akstr_split_whitespace import * | ||
from awkward.operations.str.akstr_split_pattern import * | ||
from awkward.operations.str.akstr_split_pattern_regex import * | ||
|
||
# string component extraction | ||
|
||
from awkward.operations.str.akstr_extract_regex import * | ||
|
||
# string joining | ||
|
||
from awkward.operations.str.akstr_join import * | ||
from awkward.operations.str.akstr_join_element_wise import * | ||
|
||
# string slicing | ||
|
||
from awkward.operations.str.akstr_slice import * | ||
|
||
# containment tests | ||
|
||
from awkward.operations.str.akstr_count_substring import * | ||
from awkward.operations.str.akstr_count_substring_regex import * | ||
from awkward.operations.str.akstr_ends_with import * | ||
from awkward.operations.str.akstr_find_substring import * | ||
from awkward.operations.str.akstr_find_substring_regex import * | ||
from awkward.operations.str.akstr_index_in import * | ||
from awkward.operations.str.akstr_is_in import * | ||
from awkward.operations.str.akstr_match_like import * | ||
from awkward.operations.str.akstr_match_substring import * | ||
from awkward.operations.str.akstr_match_substring_regex import * | ||
from awkward.operations.str.akstr_starts_with import * | ||
|
||
|
||
def _get_ufunc_action( | ||
utf8_function, | ||
ascii_function, | ||
*args, | ||
bytestring_to_string=False, | ||
**kwargs, | ||
): | ||
from awkward.operations.ak_from_arrow import from_arrow | ||
from awkward.operations.ak_to_arrow import to_arrow | ||
|
||
def action(layout, **absorb): | ||
if layout.is_list and layout.parameter("__array__") == "string": | ||
return from_arrow( | ||
utf8_function(to_arrow(layout, extensionarray=False), *args, **kwargs), | ||
highlevel=False, | ||
) | ||
|
||
elif layout.is_list and layout.parameter("__array__") == "bytestring": | ||
if bytestring_to_string: | ||
out = from_arrow( | ||
ascii_function( | ||
to_arrow( | ||
layout.copy( | ||
content=layout.content.copy( | ||
parameters={"__array__": "char"} | ||
), | ||
parameters={"__array__": "string"}, | ||
), | ||
extensionarray=False, | ||
), | ||
*args, | ||
**kwargs, | ||
), | ||
highlevel=False, | ||
) | ||
if out.is_list and out.parameter("__array__") == "string": | ||
out = out.copy( | ||
content=out.content.copy(parameters={"__array__": "byte"}), | ||
parameters={"__array__": "bytestring"}, | ||
) | ||
return out | ||
|
||
else: | ||
return from_arrow( | ||
ascii_function( | ||
to_arrow(layout, extensionarray=False), *args, **kwargs | ||
), | ||
highlevel=False, | ||
) | ||
|
||
return action | ||
|
||
|
||
def _erase_list_option(layout): | ||
from awkward.contents.unmaskedarray import UnmaskedArray | ||
|
||
assert layout.is_list | ||
if layout.content.is_option: | ||
assert isinstance(layout.content, UnmaskedArray) | ||
return layout.copy(content=layout.content.content) | ||
else: | ||
return layout | ||
|
||
|
||
def _get_split_action( | ||
utf8_function, ascii_function, *args, bytestring_to_string=False, **kwargs | ||
): | ||
from awkward.operations.ak_from_arrow import from_arrow | ||
from awkward.operations.ak_to_arrow import to_arrow | ||
|
||
def action(layout, **absorb): | ||
if layout.is_list and layout.parameter("__array__") == "string": | ||
return _erase_list_option( | ||
from_arrow( | ||
utf8_function( | ||
to_arrow(layout, extensionarray=False), | ||
*args, | ||
**kwargs, | ||
), | ||
highlevel=False, | ||
) | ||
) | ||
|
||
elif layout.is_list and layout.parameter("__array__") == "bytestring": | ||
if bytestring_to_string: | ||
out = _erase_list_option( | ||
from_arrow( | ||
ascii_function( | ||
to_arrow( | ||
layout.copy( | ||
content=layout.content.copy( | ||
parameters={"__array__": "char"} | ||
), | ||
parameters={"__array__": "string"}, | ||
), | ||
extensionarray=False, | ||
), | ||
*args, | ||
**kwargs, | ||
), | ||
highlevel=False, | ||
) | ||
) | ||
assert out.is_list | ||
|
||
assert ( | ||
out.content.is_list | ||
and out.content.parameter("__array__") == "string" | ||
) | ||
return out.copy( | ||
content=out.content.copy( | ||
content=out.content.content.copy( | ||
parameters={"__array__": "byte"} | ||
), | ||
parameters={"__array__": "bytestring"}, | ||
), | ||
) | ||
|
||
else: | ||
return _erase_list_option( | ||
from_arrow( | ||
ascii_function( | ||
to_arrow(layout, extensionarray=False), *args, **kwargs | ||
), | ||
highlevel=False, | ||
) | ||
) | ||
|
||
return action |
Oops, something went wrong.