Skip to content

Commit

Permalink
eclcompress: deprecate providing a keyword regex
Browse files Browse the repository at this point in the history
  • Loading branch information
mferrera committed Nov 20, 2023
1 parent e92301f commit 585242e
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 89 deletions.
4 changes: 1 addition & 3 deletions docs/scripts/eclcompress.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,7 @@ Notes
compressed file into account.
- Eclipse loading time of the compressed file is probably reduced by the
same factor as the compression factor.
- Only known compressible keywords are compressed by default. If you wish
to specify particular keywords or a regex instead, this can be provided
directly through the command line.
- Only known compressible keywords are compressed.


Possible improvements
Expand Down
59 changes: 12 additions & 47 deletions src/subscript/eclcompress/eclcompress.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,10 @@
import itertools
import logging
import os
import re
import shutil
import textwrap
import warnings
from pathlib import Path
from typing import List, Optional, Pattern, Tuple, Union
from typing import List, Tuple, Union

import subscript

Expand Down Expand Up @@ -79,7 +77,6 @@ def eclcompress(
files: Union[str, List[str]],
keeporiginal: bool = False,
dryrun: bool = False,
eclkw_regexp: Optional[str] = None,
) -> int:
"""Run-length encode a set of grdecl files.
Expand All @@ -89,21 +86,14 @@ def eclcompress(
files: Filenames to be compressed
keeporiginal: Whether to copy the original to a backup file
dryrun: If true, only print compression efficiency
eclkw_regexp: Regular expression for locating Eclipse keywords.
Default is [A-Z]{2-8}$
Returns:
Number of bytes saved by compression.
"""

if not isinstance(files, list):
files = [files] # List with one element

if eclkw_regexp:
warnings.warn(
"Providing a keyword regex will be removed in Komodo 2023.11.",
DeprecationWarning,
)

totalsavings = 0

for filename in files:
Expand Down Expand Up @@ -137,7 +127,7 @@ def eclcompress(
# Index the list of strings (the file contents) by the line numbers
# where Eclipse keywords start, and where the first data record of the keyword
# ends (compression is not attempted in record 2 and onwards for any keyword)
keywordsets = find_keyword_sets(filelines, eclkw_regexp=eclkw_regexp)
keywordsets = find_keyword_sets(filelines)

if not keywordsets:
logger.info(
Expand Down Expand Up @@ -299,9 +289,7 @@ def compress_multiple_keywordsets(
return compressedlines


def find_keyword_sets(
filelines: List[str], eclkw_regexp: Optional[Union[str, Pattern[str]]] = None
) -> List[Tuple[int, int]]:
def find_keyword_sets(filelines: List[str]) -> List[Tuple[int, int]]:
"""Parse list of strings, looking for Eclipse data sets that we want.
Example:
Expand Down Expand Up @@ -335,9 +323,6 @@ def find_keyword_sets(
Args:
filelines: Eclipse deck lines (not necessarily complete decks)
eclkw_regexp: Regular expression for locating Eclipse keywords.
Default is None, in which it uses a predefined allowlist of
keywords
Return:
2-tuples, with start and end line indices for datasets to
Expand All @@ -346,26 +331,19 @@ def find_keyword_sets(
"""
keywordsets = []
kwstart = None
if eclkw_regexp:
eclkw_regexp = re.compile(eclkw_regexp)

for lineidx, line in enumerate(filelines):
line = line.strip()
if not line:
continue
if eclkw_regexp:
if re.match(eclkw_regexp, line) and line not in DENYLIST_KEYWORDS:
kwstart = lineidx
continue
else:
# Remove embracing quotes if in a multi-keyword
keyword = line.split(" ")[0].strip("'")
if keyword in ALLOWLIST_KEYWORDS:
kwstart = lineidx
if "/" in line:
keywordsets.append((kwstart, lineidx))
kwstart = None
continue
# Remove embracing quotes if in a multi-keyword
keyword = line.split(" ")[0].strip("'")
if keyword in ALLOWLIST_KEYWORDS:
kwstart = lineidx
if "/" in line:
keywordsets.append((kwstart, lineidx))
kwstart = None
continue
if kwstart is not None and line[0:2] == "--":
# This means we found a comment section within a data set
# In that case it is vital to preserve the current line
Expand Down Expand Up @@ -453,14 +431,6 @@ def get_parser() -> argparse.ArgumentParser:
"no files are specified on the command line."
),
)
parser.add_argument(
"--eclkw_regexp",
help=(
"Regular expression to determine which Eclipse keyword "
"to recognize. Default is None, using instead a list of known "
"compressable keywords."
),
)
parser.add_argument(
"--version",
action="version",
Expand Down Expand Up @@ -506,7 +476,6 @@ def main():
args.files,
keeporiginal=args.keeporiginal,
dryrun=args.dryrun,
eclkw_regexp=args.eclkw_regexp,
)


Expand All @@ -515,7 +484,6 @@ def main_eclcompress(
wildcardfile: str,
keeporiginal: bool = False,
dryrun: bool = False,
eclkw_regexp: Optional[str] = None,
) -> None:
"""Implements the command line functionality
Expand All @@ -525,8 +493,6 @@ def main_eclcompress(
keeporiginal: Whether a backup file should be left behind
dryrun: Nothing written to disk, only statistics for
compression printed to terminal.
eclkw_regexp: Regular expression for locating Eclipse keywords.
Default is None
"""
# A list of wildcards on the command line should always be compressed:
if grdeclfiles:
Expand Down Expand Up @@ -558,7 +524,6 @@ def main_eclcompress(
globbedfiles,
keeporiginal=keeporiginal,
dryrun=dryrun,
eclkw_regexp=eclkw_regexp,
)
savings_mb = savings / 1024.0 / 1024.0
print(f"eclcompress finished. Saved {savings_mb:.1f} Mb from compression")
Expand Down
39 changes: 0 additions & 39 deletions tests/test_eclcompress.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,45 +622,6 @@ def test_glob_patterns(tmp_path):
parse_wildcardfile("notthere")


def test_eclkw_regexp(tmp_path, mocker):
"""Test that custom regular expressions can be supplied to compress
otherwise unknown (which implies no compression) keywords"""
# Work inside the temporary directory so that g1.grdecl below is written there.
os.chdir(tmp_path)

# A keyword name (G1) followed by thirteen zeros and a terminating slash.
uncompressed_str = "G1\n0 0 0 0 0 0 0 0 0 0 0 0 0\n/"

# Nothing is found by default here.
assert not find_keyword_sets(uncompressed_str.split())

# Only if we specify a regexp catching this odd keyword name:

# Note: .split() breaks on every whitespace, so each token (keyword, each
# zero, and the slash) is its own list element; the last index is the "/".
kw_sets = find_keyword_sets(uncompressed_str.split(), eclkw_regexp="G1")
kwend_idx = len(uncompressed_str.split()) - 1
assert kw_sets == [(0, kwend_idx)]
assert compress_multiple_keywordsets(kw_sets, uncompressed_str.split()) == [
"G1",
" 13*0",
"/",
]

# Write the uncompressed data to disk for the command-line run further down.
Path("g1.grdecl").write_text(uncompressed_str, encoding="utf8")

# Alternative regexes that should also work with this G1:
# NOTE(review): the two statements below assign the result of the `==`
# comparison to kw_sets instead of asserting it, so they can never fail the
# test. Also, `{1-8}` and `{2-8}` are not `{m,n}` repetition quantifiers in
# Python's re syntax -- presumably `{1,8}` / `{2,8}` was intended; confirm
# the intended patterns before turning these into real assertions.
kw_sets = find_keyword_sets(
uncompressed_str.split(), eclkw_regexp="[A-Z]{1-8}$"
) == [(0, kwend_idx)]

kw_sets = find_keyword_sets(
uncompressed_str.split(), eclkw_regexp="[A-Z0-9]{2-8}$"
) == [(0, kwend_idx)]

# Run the command-line entry point with sys.argv patched to pass the regex
# option, then check the file on disk was rewritten in compressed form.
mocker.patch("sys.argv", ["eclcompress", "g1.grdecl", "--eclkw_regexp", "G1"])
main()
compressed = Path("g1.grdecl").read_text(encoding="utf8")
assert "File compressed with eclcompress" in compressed
assert "13*0" in compressed


def test_binary_example_file(tmp_path, mocker):
"""Test that a particular binary file is not touched by eclcompress
Expand Down

0 comments on commit 585242e

Please sign in to comment.