From 3190f8ed91d16d8ba6582477cf8c2e8b930fe9ba Mon Sep 17 00:00:00 2001 From: dhondta Date: Fri, 28 Apr 2023 00:19:21 +0200 Subject: [PATCH] Dropped support for Python2 + Applied minor changes --- .coveragerc | 51 +- .github/workflows/python-package.yml | 158 +- docs/pages/cli.md | 366 ++-- docs/pages/enc/base.md | 346 ++- docs/pages/enc/binary.md | 334 ++- docs/pages/enc/common.md | 140 +- docs/pages/enc/compressions.md | 2 - docs/pages/enc/crypto.md | 410 ++-- docs/pages/enc/hashing.md | 2 - docs/pages/enc/languages.md | 396 ++-- docs/pages/enc/stegano.md | 244 +-- docs/pages/enc/web.md | 78 +- docs/pages/features.md | 674 +++--- docs/pages/guessing.md | 342 ++- docs/pages/howto.md | 482 ++-- docs/pages/index.md | 20 +- docs/pages/manipulations.md | 149 +- pyproject.toml | 10 +- pytest.ini | 2 + src/codext/VERSION.txt | 2 +- src/codext/__common__.py | 3037 +++++++++++++------------- src/codext/__init__.py | 512 ++--- src/codext/base/_base.py | 581 +++-- src/codext/base/base100.py | 103 +- src/codext/base/base122.py | 204 +- src/codext/base/base85.py | 371 ++-- src/codext/binary/baudot.py | 576 +++-- src/codext/binary/rotate.py | 103 +- src/codext/common/cases.py | 5 +- src/codext/compressions/pkzip.py | 111 +- src/codext/crypto/railfence.py | 192 +- src/codext/hashing/blake.py | 22 +- src/codext/hashing/crypt.py | 4 +- src/codext/hashing/md.py | 4 +- src/codext/hashing/sha.py | 23 +- src/codext/hashing/shake.py | 22 +- src/codext/languages/braille.py | 67 +- src/codext/languages/galactic.py | 5 +- src/codext/languages/tap.py | 77 +- src/codext/others/uuencode.py | 2 +- src/codext/stegano/hexagram.py | 76 +- src/codext/web/html.py | 580 +++-- tests/test_base.py | 471 ++-- tests/test_common.py | 493 ++--- tests/test_generated.py | 297 +-- tests/test_manual.py | 340 ++- 46 files changed, 6200 insertions(+), 6286 deletions(-) create mode 100644 pytest.ini diff --git a/.coveragerc b/.coveragerc index 4ccc970..b677975 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,27 +1,24 @@ -[run] -source = codext -omit = - codext/__info__.py - codext/**/__init__.py - -[report] -exclude_lines = - pragma: no cover - if.*?__name__.*?==.*?.__main__.: - def main\(\)\: - def __stdin_pipe\(\)\: - for line in __stdin_pipe\(\)\: - def __format_list\(items, include\=True\)\: - def __print_tabular\(lst, space\=4\)\: - except ImportError: - except NameError: - raise NotImplementedError - if not PY3 - if PY3 - def encode\(self, input, final\=False\)\: - def decode\(self, input, final\=False\)\: - def _detect\(text\)\: - def _lang\(lang\)\: - if stopfunc\.LANG_BACKEND\: - def _validate\(stop_function, lang_backend\=\"none\"\)\: - except KeyboardInterrupt\: +[run] +source = codext +omit = + src/codext/__info__.py + src/codext/**/__init__.py + +[report] +exclude_lines = + pragma: no cover + if.*?__name__.*?==.*?.__main__.: + def main\(\)\: + def __stdin_pipe\(\)\: + for line in __stdin_pipe\(\)\: + def __format_list\(items, include\=True\)\: + def __print_tabular\(lst, space\=4\)\: + except ImportError: + except NameError: + raise NotImplementedError + def _detect\(text\)\: + def _lang\(lang\)\: + if stopfunc\.LANG_BACKEND\: + def _validate\(stop_function, lang_backend\=\"none\"\)\: + except KeyboardInterrupt\: + if alt and len\(t\) \% 2 \=\= 1\: diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 9010fab..62476a7 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,79 +1,79 @@ -# This workflow will install Python dependencies, run 
tests and lint with a variety of Python versions -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python - -name: build - -env: - package: codext - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - name: Install ${{ env.package }} - run: | - python -m pip install --upgrade pip - python -m pip install flake8 pytest pytest-cov coverage - pip install -r requirements.txt - pip install . - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test ${{ env.package }} with pytest - run: | - pytest --cov=$package - coverage: - needs: build - runs-on: ubuntu-latest - env: - cov_badge_path: docs/coverage.svg - steps: - - uses: actions/checkout@v3 - - name: Install ${{ env.package }} - run: | - python -m pip install --upgrade pip - python -m pip install pytest pytest-cov - pip install -r requirements.txt - pip install . - - name: Make coverage badge for ${{ env.package }} - run: | - pip install genbadge[coverage] - pytest --cov=$package --cov-report=xml - genbadge coverage -i coverage.xml -o $cov_badge_path - - name: Verify Changed files - uses: tj-actions/verify-changed-files@v12 - id: changed_files - with: - files: ${{ env.cov_badge_path }} - - name: Commit files - if: steps.changed_files.outputs.files_changed == 'true' - run: | - git config --local user.email "github-actions[bot]@users.noreply.github.com" - git config --local user.name "github-actions[bot]" - git add $cov_badge_path - git commit -m "Updated coverage.svg" - - name: Push changes - if: steps.changed_files.outputs.files_changed == 'true' - uses: ad-m/github-push-action@master - with: - github_token: ${{ secrets.github_token }} - branch: ${{ github.ref }} +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: build + +env: + package: codext + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.8", "3.9", "3.10", "3.11"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install ${{ env.package }} + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest pytest-cov pytest-pythonpath coverage + pip install -r requirements.txt + pip install . + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test ${{ env.package }} with pytest + run: | + pytest --cov=$package + coverage: + needs: build + runs-on: ubuntu-latest + env: + cov_badge_path: docs/coverage.svg + steps: + - uses: actions/checkout@v3 + - name: Install ${{ env.package }} + run: | + python -m pip install --upgrade pip + python -m pip install pytest pytest-cov pytest-pythonpath + pip install -r requirements.txt + pip install . + - name: Make coverage badge for ${{ env.package }} + run: | + pip install genbadge[coverage] + pytest --cov=$package --cov-report=xml + genbadge coverage -i coverage.xml -o $cov_badge_path + - name: Verify Changed files + uses: tj-actions/verify-changed-files@v12 + id: changed_files + with: + files: ${{ env.cov_badge_path }} + - name: Commit files + if: steps.changed_files.outputs.files_changed == 'true' + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git add $cov_badge_path + git commit -m "Updated coverage.svg" + - name: Push changes + if: steps.changed_files.outputs.files_changed == 'true' + uses: ad-m/github-push-action@master + with: + github_token: ${{ secrets.github_token }} + branch: ${{ github.ref }} diff --git a/docs/pages/cli.md b/docs/pages/cli.md index 111913c..4b22cd4 100644 --- a/docs/pages/cli.md +++ b/docs/pages/cli.md @@ -1,184 +1,182 @@ -## CLI Tool - -`codext` has a Command-Line Interface tool. - ------ - -### Using Codext from the terminal - -The help message describes everything to know: - -```sh -usage: codext [-h] [-i INFILE] [-o OUTFILE] [-s] {encode,decode,guess,search} ... - -Codecs Extension (CodExt) 1.8.1 - -Author : Alexandre D'Hondt (alexandre.dhondt@gmail.com) -Copyright: © 2019-2021 A. D'Hondt -License : GPLv3 (https://www.gnu.org/licenses/gpl-3.0.fr.html) -Source : https://github.com/dhondta/python-codext - -This tool allows to encode/decode input strings/files with an extended set of codecs. - -positional arguments: - {encode,decode,guess,search} - command to be executed - encode encode input using the specified codecs - decode decode input using the specified codecs - guess try guessing the decoding codecs - search search for codecs - -optional arguments: - -h, --help show this help message and exit - -i INFILE, --input-file INFILE - input file (if none, take stdin as input) - -o OUTFILE, --output-file OUTFILE - output file (if none, display result to stdout) - -s, --strip-newlines strip newlines from input - -usage examples: -- codext search bitcoin -- codext decode base32 -i file.b32 -- codext encode morse < to_be_encoded.txt -- echo "test" | codext encode base100 -- echo -en "test" | codext encode braille -o test.braille -- codext encode base64 < to_be_encoded.txt > text.b64 -- echo -en "test" | codext encode base64 | codext encode base32 -- echo -en "mrdvm6teie6t2cq=" | codext encode upper | codext decode base32 | codext decode base64 -- echo -en "test" | codext encode upper reverse base32 | codext decode base32 reverse lower -- echo -en "test" | codext encode upper reverse base32 base64 morse -- echo -en "test" | codext encode base64 gzip | codext guess -- echo -en "test" | codext encode base64 gzip | codext guess gzip -c base -``` - -!!! note "Input/output" - - STDIN can be used as shown in an example from the help message, like when using the common Linux tool `base64`. - - Unless an output file is specified, the result is displayed in STDOUT. - -!!! 
note "Encodings chaining" - - Encodings can be chained as shown in the last examples of the help message. This can be practical for quickly manipulating data. - -### Execution examples - -**Scenario 1**: 2-stages encoded flag - -Creating the payload: - -```session -$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 -pwTDSWRUbXTuMQs5EDgKpjgW8MiJVw1 -``` - -From this point, the only thing we know is that we are searching for "*flag*" (with eventually other characters, i.e. leetspeak). - -```session -$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag -Codecs: base58, rotate-3 -A somewhat weird F1@9 ! -``` - -Executing the previous command will take a few tens of seconds. With few stages to be guessed, using the scoring heuristic can be far quicker to get to the right output. The following takes less than a second. - -```session -$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag --heuristic -Codecs: base58, rotate-3 -A somewhat weird F1@9 ! -``` - -**Scenario 2**: Multi-stage-encoded flag - -Creating the payload: - -```session -$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse -.... -.-- --.- --. -- ....- - -.- -- . ..... -..- --. ..--- .-.. .. . .- ..... .-- -.-. ..... -.. --- -. --.- --.- . --. -- .-. --... ..-. ..- --.- -.-. -- -...- -...- -...- -``` - -When looking at the string, it is easy to figure out it is morse. The problem, at this point, is that this codec is case-insensitive and always returns lowercase characters, as shown hereafter. - -```session -$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse -hyqgm4tkme5xg2liea5wc5donqqegmr7fuqcm=== -``` - -In order to get it guessed as Base32, it is necessary to put it back to uppercase (in other words, decode from lowercase). - -```session -$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse lowercase -HYQGM4TKME5XG2LIEA5WC5DONQQEGMR7FUQCM=== -``` - -Now that we know we are searching for something with "*flag*" (with eventually other characters), we can use the predefined "`flag`" stop function. - -```session -$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse lowercase | codext guess -f flag -Codecs: base32, barbie -A somewhat weird F1@9 ! 
-``` - -**Scenario 3**: Base-encoded rotated shifted secret (English) message - -Creating the payload: - -```session -$ echo "My super secret string" | codext encode shift-1 rotate-2 base58 base64 -NDNxaFdieXh0Z29XOVZpWWpjRGNpRWgyZE44Z2FNU0g= -``` - -First, we shall simplify as much as possible ; we can easily guess that Base64 was used as the first encoding scheme: - -```session -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext rank -[+] 1.00002: base62 -[+] 0.99401: base64 -[+] 0.70806: rotate-1 -[+] 0.70806: rotate-2 -[+] 0.70806: rotate-3 -[+] 0.70806: rotate-4 -[+] 0.70806: rotate-5 -[+] 0.70806: rotate-6 -[+] 0.70806: rotate-7 -[+] 0.70806: rotate-left-1 - -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base62 -%¤q ´!.[æ&[fÿhbð^ - -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 -h4nRqFifSnRjFfQxRHuVpxjxpP8cCR -``` - -Afterwards, we can still try to simplify ; - -```session -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext rank -[+] 1.00185: base58 -[+] 0.99091: base62 -[+] 0.67001: rotate-1 -[+] 0.67001: rotate-2 -[+] 0.67001: rotate-3 -[+] 0.67001: rotate-4 -[+] 0.67001: rotate-5 -[+] 0.67001: rotate-6 -[+] 0.67001: rotate-7 -[+] 0.67001: rotate-left-1 -``` - -From here, let us assume that `base58` is effectively the right second-stage encoding. Guessing the two remaining encodings with no more information will now take a few seconds. As multiple outputs can be recognized as normal text, we will use the "`-s`" option not to stop on the first output successfully decoded as text. Moreover, if we have the intuition that the output shall be English text, we can use a more refined stop function like "`lang_en`" with the "`-f`" option. - -```session -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext decode base58 | codext guess -s -f lang_en -[...] -[+] rotate-2, rot-1: My!super!secret!string -[+] rotate-2, rot-23: Qc!wytiv!wigvix!wxvmrk -[+] rotate-2, shift-1: My super secret string -[+] rotate-2, shift-20: :f\r`b]R_\r`RP_Ra\r`a_V[T -[...] -[+] rotate-left-6, shift-1: My super secret string -^C^C^C -``` - -We can then stop the research with Ctrl+C. The right output has been found ! - +`codext` has a Command-Line Interface tool. + +----- + +### Using Codext from the terminal + +The help message describes everything to know: + +```sh +usage: codext [-h] [-i INFILE] [-o OUTFILE] [-s] {encode,decode,guess,search} ... + +Codecs Extension (CodExt) 1.8.1 + +Author : Alexandre D'Hondt (alexandre.dhondt@gmail.com) +Copyright: © 2019-2021 A. D'Hondt +License : GPLv3 (https://www.gnu.org/licenses/gpl-3.0.fr.html) +Source : https://github.com/dhondta/python-codext + +This tool allows to encode/decode input strings/files with an extended set of codecs. 
+ +positional arguments: + {encode,decode,guess,search} + command to be executed + encode encode input using the specified codecs + decode decode input using the specified codecs + guess try guessing the decoding codecs + search search for codecs + +optional arguments: + -h, --help show this help message and exit + -i INFILE, --input-file INFILE + input file (if none, take stdin as input) + -o OUTFILE, --output-file OUTFILE + output file (if none, display result to stdout) + -s, --strip-newlines strip newlines from input + +usage examples: +- codext search bitcoin +- codext decode base32 -i file.b32 +- codext encode morse < to_be_encoded.txt +- echo "test" | codext encode base100 +- echo -en "test" | codext encode braille -o test.braille +- codext encode base64 < to_be_encoded.txt > text.b64 +- echo -en "test" | codext encode base64 | codext encode base32 +- echo -en "mrdvm6teie6t2cq=" | codext encode upper | codext decode base32 | codext decode base64 +- echo -en "test" | codext encode upper reverse base32 | codext decode base32 reverse lower +- echo -en "test" | codext encode upper reverse base32 base64 morse +- echo -en "test" | codext encode base64 gzip | codext guess +- echo -en "test" | codext encode base64 gzip | codext guess gzip -c base +``` + +!!! note "Input/output" + + STDIN can be used as shown in an example from the help message, like when using the common Linux tool `base64`. + + Unless an output file is specified, the result is displayed in STDOUT. + +!!! note "Encodings chaining" + + Encodings can be chained as shown in the last examples of the help message. This can be practical for quickly manipulating data. + +### Execution examples + +**Scenario 1**: 2-stages encoded flag + +Creating the payload: + +```session +$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 +pwTDSWRUbXTuMQs5EDgKpjgW8MiJVw1 +``` + +From this point, the only thing we know is that we are searching for "*flag*" (with eventually other characters, i.e. leetspeak). + +```session +$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag +Codecs: base58, rotate-3 +A somewhat weird F1@9 ! +``` + +Executing the previous command will take a few tens of seconds. With few stages to be guessed, using the scoring heuristic can be far quicker to get to the right output. The following takes less than a second. + +```session +$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag --heuristic +Codecs: base58, rotate-3 +A somewhat weird F1@9 ! +``` + +**Scenario 2**: Multi-stage-encoded flag + +Creating the payload: + +```session +$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse +.... -.-- --.- --. -- ....- - -.- -- . ..... -..- --. ..--- .-.. .. . .- ..... .-- -.-. ..... -.. --- -. --.- --.- . --. -- .-. --... ..-. ..- --.- -.-. -- -...- -...- -...- +``` + +When looking at the string, it is easy to figure out it is morse. The problem, at this point, is that this codec is case-insensitive and always returns lowercase characters, as shown hereafter. + +```session +$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse +hyqgm4tkme5xg2liea5wc5donqqegmr7fuqcm=== +``` + +In order to get it guessed as Base32, it is necessary to put it back to uppercase (in other words, decode from lowercase). + +```session +$ echo "A somewhat weird F1@9 !" 
| codext encode barbie-1 base32 morse | codext decode morse lowercase +HYQGM4TKME5XG2LIEA5WC5DONQQEGMR7FUQCM=== +``` + +Now that we know we are searching for something with "*flag*" (with eventually other characters), we can use the predefined "`flag`" stop function. + +```session +$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse lowercase | codext guess -f flag +Codecs: base32, barbie +A somewhat weird F1@9 ! +``` + +**Scenario 3**: Base-encoded rotated shifted secret (English) message + +Creating the payload: + +```session +$ echo "My super secret string" | codext encode shift-1 rotate-2 base58 base64 +NDNxaFdieXh0Z29XOVZpWWpjRGNpRWgyZE44Z2FNU0g= +``` + +First, we shall simplify as much as possible ; we can easily guess that Base64 was used as the first encoding scheme: + +```session +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext rank +[+] 1.00002: base62 +[+] 0.99401: base64 +[+] 0.70806: rotate-1 +[+] 0.70806: rotate-2 +[+] 0.70806: rotate-3 +[+] 0.70806: rotate-4 +[+] 0.70806: rotate-5 +[+] 0.70806: rotate-6 +[+] 0.70806: rotate-7 +[+] 0.70806: rotate-left-1 + +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base62 +%¤q ´!.[æ&[fÿhbð^ + +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 +h4nRqFifSnRjFfQxRHuVpxjxpP8cCR +``` + +Afterwards, we can still try to simplify ; + +```session +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext rank +[+] 1.00185: base58 +[+] 0.99091: base62 +[+] 0.67001: rotate-1 +[+] 0.67001: rotate-2 +[+] 0.67001: rotate-3 +[+] 0.67001: rotate-4 +[+] 0.67001: rotate-5 +[+] 0.67001: rotate-6 +[+] 0.67001: rotate-7 +[+] 0.67001: rotate-left-1 +``` + +From here, let us assume that `base58` is effectively the right second-stage encoding. Guessing the two remaining encodings with no more information will now take a few seconds. As multiple outputs can be recognized as normal text, we will use the "`-s`" option not to stop on the first output successfully decoded as text. Moreover, if we have the intuition that the output shall be English text, we can use a more refined stop function like "`lang_en`" with the "`-f`" option. + +```session +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext decode base58 | codext guess -s -f lang_en +[...] +[+] rotate-2, rot-1: My!super!secret!string +[+] rotate-2, rot-23: Qc!wytiv!wigvix!wxvmrk +[+] rotate-2, shift-1: My super secret string +[+] rotate-2, shift-20: :f\r`b]R_\r`RP_Ra\r`a_V[T +[...] +[+] rotate-left-6, shift-1: My super secret string +^C^C^C +``` + +We can then stop the research with Ctrl+C. The right output has been found ! + diff --git a/docs/pages/enc/base.md b/docs/pages/enc/base.md index 757965e..dc7b26c 100644 --- a/docs/pages/enc/base.md +++ b/docs/pages/enc/base.md @@ -1,174 +1,172 @@ -## Base - -`codext` defines a far broader set of Base-encodings than in the original library. - ------ - -### Classical base 2^N encodings - -This namely adds the classical BaseXX encodings like 16 (hexadecimal) and 32 (RFC 3548), which are not available in the native codecs. - -Common base encodings with N a power of 2: - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base2` | text <-> Base2 encoded text | `(base[-_]?2|bin)-inv(erted)?` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. 
`_AB`) -`base4` | text <-> Base4 encoded text | `base[-_]?4-inv(erted)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_6VC9`) -`base8` | text <-> Base8 encoded text | `base[-_]?8-inv(erted)` | Charset: `abcdefgh` ; Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_A5c96T7x`) -`base16` | text <-> Base16 encoded text | `base[-_]?16-inv(erted)` | -`base32` | text <-> Base32 encoded text | `base[-_]?32-inv(erted)`, `base32-crockford`, `base32_geohash`, ... | Also supports Base32 Crockford, Geohash and Hex -`zbase32` | text <-> ZBase32 encoded text | `z[-_]?base[-_]?32` | Human-oriented Base32 -`base64` | text <-> Base64 encoded text | `base[-_]?64-inv(erted)` | - -!!! note "Aliases" - - All the aliases are case insensitive for base encodings. - -```python ->>> codext.encode("test", "base2") -'01110100011001010111001101110100' ->>> codext.encode("test", "base2-inv") -'10001011100110101000110010001011' -``` - -```python ->>> codecs.encode("this is a test", "base16") -'7468697320697320612074657374' ->>> codecs.decode("7468697320697320612074657374", "base16") -'this is a test' ->>> codecs.encode("this is a test", "base16-inv") -'1E02031DCA031DCA0BCA1E0F1D1E' -``` - -```python ->>> codext.encode("this is a test", "base32") -'ORUGS4ZANFZSAYJAORSXG5A=' ->>> codext.decode("ORUGS4ZANFZSAYJAORSXG5A=", "base32") -'this is a test' -``` - -Note that for `base64`, it overwrites the native `base64_codec` to also support en/decoding from str. - -```python ->>> codecs.encode("this is a test", "base64") -'dGhpcyBpcyBhIHRlc3Q=' ->>> codecs.decode("dGhpcyBpcyBhIHRlc3Q=", "base64") -'this is a test' -``` - ------ - -### Generic base encodings - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base3` | text <-> Base3 encoded text | `base[-_]?3(|[-_]inv(erted)?)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_C2Z`) -`base10` | text <-> Base10 encoded text | `base[-_]?10|int(?:eger)?|dec(?:imal)?` | -`base11` | text <-> Base11 encoded text | `base[-_]?11(|[-_]inv(erted)?)` | -`base36` | text <-> Base36 encoded text | `base[-_]?36(|[-_]inv(erted)?)` | -`base45` | text <-> Base45 encoded text | `base[-_]?45(|[-_]inv(erted)?)` | -`base58` | text <-> Base58 encoded text | `base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))` | Supports Bitcoin, Ripple and short URL -`base62` | text <-> Base62 encoded text | `base[-_]?62(|[-_]inv(erted)?)` | -`base63` | text <-> Base63 encoded text | `base[-_]?63(|[-_]inv(erted)?)` | -`base91` | text <-> Base91 encoded text | `base[-_]?91(|[-_]inv(erted)?)` | -`base91-alt` | text <-> Alternate Base91 encoded text | `base[-_]?91[-_]alt(?:ernate)?(|[-_]inv(erted)?)` | Another version of Base91 - -```python ->>> codext.encode("test", "base3") -'23112113223321323322' -``` - -```python ->>> codecs.encode("test", "base36") -'WANEK4' ->>> codecs.decode("4WMHTK6UZL044O91NKCEB8", "base36") -'this is a test' -``` - -```python ->>> codext.encode("this is a test!", "base45") -'AWE+EDH44.OEOCC7WE QEX0' ->>> codext.decode('AWE+EDH44.OEOCC7WE QEX0', "base45") -'this is a test!' 
-``` - -```python ->>> codext.encode("this is a test", "base58") -'jo91waLQA1NNeBmZKUF' ->>> codext.encode("this is a test", "base58-ripple") -'jo9rA2LQwr44eBmZK7E' ->>> codext.encode("this is a test", "base58-url") -'JN91Wzkpa1nnDbLyjtf' -``` - -```python ->>> codecs.encode("test", "base62") -'289lyu' ->>> codecs.encode("this is a test", "base62") -'CsoB4HQ5gmgMyCenF7E' -``` - -```python ->>> codecs.encode("This is a test !", "base91") -'nX,<:WRT%yxth90oZB^C' ->>> codext.encode("This is a test !", "base91-alt") -'?a&[jv4S3Wg>,71@Jo#K' -``` - -!!! note "Generic encodings" - - Base encodings are available for any N other than the ones explicitely specified using the "`-generic`" suffix. Their charsets consist of printable characters from the `string` module for N up to 100 and for characters composed from the 256 possible ordinals for a greater N. - - :::python - >>> codext.encode("test", "base3-generic") - '12001002112210212211' - >>> codext.encode("test", "base17-generic") - '4cf60456' - ------ - -### Base85 - -This encoding implements various different versions of Base85. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base85` | text <-> ascii85 | `(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)` | - -```python ->>> codext.encode("this is a test", "ascii85") -"FD,B0+DGm>@3BZ'F*%" ->>> codext.decode("FD,B0+DGm>@3BZ'F*%", "ascii85") -'this is a test' ->>> with open("ascii85.txt", 'w', encoding="ascii85") as f: - f.write("this is a test") -14 ->>> with open("ascii85.txt", encoding="ascii85") as f: - f.read() -'this is a test' -``` - ------ - -### Other base encodings - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base100` | text <-> Base100 encoded text | `base[-_]?100|emoji` | Python 3 only -`base122` | text <-> Base122 encoded text | `base[-_]?122` | Python 3 only -`base128` | text <-> Base128 encoded text | `base[-_]?128` | Relies on the ASCII charset - -```python ->>> codecs.encode("this is a test", "base100") -'👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫' ->>> codecs.decode("👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫", "base100") -'this is a test' -``` - -```python ->>> codecs.encode("this is a test", "base122") -':\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft' ->>> codecs.decode(":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", "base122") -'this is a test' -``` - +`codext` defines a far broader set of Base-encodings than in the original library. + +----- + +### Classical base 2^N encodings + +This namely adds the classical BaseXX encodings like 16 (hexadecimal) and 32 (RFC 3548), which are not available in the native codecs. + +Common base encodings with N a power of 2: + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base2` | text <-> Base2 encoded text | `(base[-_]?2|bin)-inv(erted)?` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_AB`) +`base4` | text <-> Base4 encoded text | `base[-_]?4-inv(erted)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_6VC9`) +`base8` | text <-> Base8 encoded text | `base[-_]?8-inv(erted)` | Charset: `abcdefgh` ; Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_A5c96T7x`) +`base16` | text <-> Base16 encoded text | `base[-_]?16-inv(erted)` | +`base32` | text <-> Base32 encoded text | `base[-_]?32-inv(erted)`, `base32-crockford`, `base32_geohash`, ... 
| Also supports Base32 Crockford, Geohash and Hex +`zbase32` | text <-> ZBase32 encoded text | `z[-_]?base[-_]?32` | Human-oriented Base32 +`base64` | text <-> Base64 encoded text | `base[-_]?64-inv(erted)` | + +!!! note "Aliases" + + All the aliases are case insensitive for base encodings. + +```python +>>> codext.encode("test", "base2") +'01110100011001010111001101110100' +>>> codext.encode("test", "base2-inv") +'10001011100110101000110010001011' +``` + +```python +>>> codecs.encode("this is a test", "base16") +'7468697320697320612074657374' +>>> codecs.decode("7468697320697320612074657374", "base16") +'this is a test' +>>> codecs.encode("this is a test", "base16-inv") +'1E02031DCA031DCA0BCA1E0F1D1E' +``` + +```python +>>> codext.encode("this is a test", "base32") +'ORUGS4ZANFZSAYJAORSXG5A=' +>>> codext.decode("ORUGS4ZANFZSAYJAORSXG5A=", "base32") +'this is a test' +``` + +Note that for `base64`, it overwrites the native `base64_codec` to also support en/decoding from str. + +```python +>>> codecs.encode("this is a test", "base64") +'dGhpcyBpcyBhIHRlc3Q=' +>>> codecs.decode("dGhpcyBpcyBhIHRlc3Q=", "base64") +'this is a test' +``` + +----- + +### Generic base encodings + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base3` | text <-> Base3 encoded text | `base[-_]?3(|[-_]inv(erted)?)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_C2Z`) +`base10` | text <-> Base10 encoded text | `base[-_]?10|int(?:eger)?|dec(?:imal)?` | +`base11` | text <-> Base11 encoded text | `base[-_]?11(|[-_]inv(erted)?)` | +`base36` | text <-> Base36 encoded text | `base[-_]?36(|[-_]inv(erted)?)` | +`base45` | text <-> Base45 encoded text | `base[-_]?45(|[-_]inv(erted)?)` | +`base58` | text <-> Base58 encoded text | `base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))` | Supports Bitcoin, Ripple and short URL +`base62` | text <-> Base62 encoded text | `base[-_]?62(|[-_]inv(erted)?)` | +`base63` | text <-> Base63 encoded text | `base[-_]?63(|[-_]inv(erted)?)` | +`base91` | text <-> Base91 encoded text | `base[-_]?91(|[-_]inv(erted)?)` | +`base91-alt` | text <-> Alternate Base91 encoded text | `base[-_]?91[-_]alt(?:ernate)?(|[-_]inv(erted)?)` | Another version of Base91 + +```python +>>> codext.encode("test", "base3") +'23112113223321323322' +``` + +```python +>>> codecs.encode("test", "base36") +'WANEK4' +>>> codecs.decode("4WMHTK6UZL044O91NKCEB8", "base36") +'this is a test' +``` + +```python +>>> codext.encode("this is a test!", "base45") +'AWE+EDH44.OEOCC7WE QEX0' +>>> codext.decode('AWE+EDH44.OEOCC7WE QEX0', "base45") +'this is a test!' +``` + +```python +>>> codext.encode("this is a test", "base58") +'jo91waLQA1NNeBmZKUF' +>>> codext.encode("this is a test", "base58-ripple") +'jo9rA2LQwr44eBmZK7E' +>>> codext.encode("this is a test", "base58-url") +'JN91Wzkpa1nnDbLyjtf' +``` + +```python +>>> codecs.encode("test", "base62") +'289lyu' +>>> codecs.encode("this is a test", "base62") +'CsoB4HQ5gmgMyCenF7E' +``` + +```python +>>> codecs.encode("This is a test !", "base91") +'nX,<:WRT%yxth90oZB^C' +>>> codext.encode("This is a test !", "base91-alt") +'?a&[jv4S3Wg>,71@Jo#K' +``` + +!!! note "Generic encodings" + + Base encodings are available for any N other than the ones explicitely specified using the "`-generic`" suffix. Their charsets consist of printable characters from the `string` module for N up to 100 and for characters composed from the 256 possible ordinals for a greater N. 
+ + :::python + >>> codext.encode("test", "base3-generic") + '12001002112210212211' + >>> codext.encode("test", "base17-generic") + '4cf60456' + +----- + +### Base85 + +This encoding implements various different versions of Base85. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base85` | text <-> ascii85 | `(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)` | + +```python +>>> codext.encode("this is a test", "ascii85") +"FD,B0+DGm>@3BZ'F*%" +>>> codext.decode("FD,B0+DGm>@3BZ'F*%", "ascii85") +'this is a test' +>>> with open("ascii85.txt", 'w', encoding="ascii85") as f: + f.write("this is a test") +14 +>>> with open("ascii85.txt", encoding="ascii85") as f: + f.read() +'this is a test' +``` + +----- + +### Other base encodings + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base100` | text <-> Base100 encoded text | `base[-_]?100|emoji` | Python 3 only +`base122` | text <-> Base122 encoded text | `base[-_]?122` | Python 3 only +`base128` | text <-> Base128 encoded text | `base[-_]?128` | Relies on the ASCII charset + +```python +>>> codecs.encode("this is a test", "base100") +'👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫' +>>> codecs.decode("👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫", "base100") +'this is a test' +``` + +```python +>>> codecs.encode("this is a test", "base122") +':\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft' +>>> codecs.decode(":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", "base122") +'this is a test' +``` + diff --git a/docs/pages/enc/binary.md b/docs/pages/enc/binary.md index 745ef82..0ed7fb0 100644 --- a/docs/pages/enc/binary.md +++ b/docs/pages/enc/binary.md @@ -1,168 +1,166 @@ -## Binary - -`codext` also adds common binary encodings. For instance, the Manchester code, that encodes digits, is applied to the ordinals of the input text and the resulting binary stream is converted back to characters. - ------ - -### Baudot - -It supports various formats such as CCITT-1 and CCITT-2, ITA1 and ITA2, and some others. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`baudot` | text <-> text | Baudot code bits | `baudot-ccitt1`, `baudot_ccitt2_lsb`, ... | supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... -`baudot-spaced` | text <-> Baudot code groups of bits | `baudot-spaced-ita1_lsb`, `baudot_spaced_ita2_msb`, ... | groups of 5 bits are whitespace-separated -`baudot-tape` | text <-> Baudot code tape | `baudot-tape-mtk2`, `baudot_tape_murray`, ... | outputs a string that looks like a perforated tape - -!!! note "LSB / MSB" - - "`_lsb`" or "`_msb`" can be specified in the codec name to set the bits order. If not specified, it defaults to MSB. - - -```python ->>> codext.encode("12345", "baudot-fr") -'010000000100010001000010100111' ->>> codext.decode("010000000100010001000010100111", "baudot-fr") -'12345' -``` - -```python ->>> codext.encode("TEST", "baudot-spaced_uk") -'10101 00010 10100 10101' ->>> codext.decode("10101 00010 10100 10101", "baudot-spaced_uk") -'TEST' -``` - -```python ->>> s = codext.encode("HELLO WORLD!", "baudot-tape_ita2") ->>> print(s) -***.** -* *. - . * -* .* -* .* -** . - *. -* .** -** . - * .* -* .* - * . * -** .** - **. * ->>> codext.decode(s, "baudot-tape_ita2") -'HELLO WORLD!' -``` - ------ - -### Binary Coded Decimal (BCD) - -It converts characters to their odrinals, left-pads with zeros, converts digits to 4-bits groups and then make characters with the assembled groups. It can also use a 4-bits prefix for making new characters. 
It then allows to define extended versions of BCD. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`bcd` | text <-> BCD encoded text | `binary_coded_decimals` | -`bcd-extended0` | text <-> BCD encoded text using prefix `0000` | `bcd_ext0`, `bcd-extended-zeros`, `binary_coded_decimals_extended_0` | -`bcd-extended1` | text <-> BCD encoded text using prefix `1111` | `bcd_ext1`, `bcd-extended-ones`, `binary_coded_decimals_extended_1` | - -```python ->>> codext.encode("Test", "bcd") -'\x08A\x01\x11Q\x16' ->>> codext.decode("\x08A\x01\x11Q\x16", "binary_coded_decimal") -'Test' ->>> codext.encode("Test", "bcd_ext_zero") -'\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00' ->>> codext.decode("\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00", "bcd-ext0") -'Test' ->>> codext.encode("Test", "bcd_extended_ones") -'\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0' ->>> codext.decode("\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0", "bcd_ext1") -'Test' -``` - ------ - -### Excess-3 - -Also called *Stibitz code*, it converts characters to ordinals, left-pads with zeros and then applies Excess-3 (Stibitz) code to get groups of 4 bits that are finally reassembled into bytes. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`excess3` | text <-> XS3 encoded text | `excess-3`, `xs3`, `stibitz` | - -```python ->>> codext.encode("This is a test!", "excess-3") -';t7C\x84H6T8D\x83e<£eD\x944D\x84I6`' ->>> codext.decode(";t7C\x84H6T8D\x83e<£eD\x944D\x84I6`", "stibitz") -'This is a test!' -``` - ------ - -### Gray - -Also called *reflected binary code*, it implements the Gray code applied to characters while converted to bytes. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`gray` | text <-> gray encoded text | `reflected-bin`, `reflected_binary` | - -```python ->>> codext.encode("this is a test", "gray") -'N\\]J0]J0Q0NWJN' ->>> codext.decode("N\\]J0]J0Q0NWJN", "gray") -'this is a test' ->>> codext.encode("THIS IS A TEST", "gray") -'~lmz0mz0a0~gz~' ->>> codext.decode("~lmz0mz0a0~gz~", "gray") -'THIS IS A TEST' -``` - ------ - -### Manchester - -This codec XORes each group of 4 bits of the input text with a 1-byte clock signal, e.g. `0x55` giving in binary `01010101`. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`manchester` | text <-> manchester encoded text | | clock signal is `0x55` (`01010101`) -`manchester-inverted` | text <-> manchester encoded text | `ethernet`, `ieee802.4` | clock signal is `0xaa` (`10101010`) - -```python ->>> codext.encode("This is a test!", "manchester") -'fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV' ->>> codext.decode("fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV", "manchester") -'This is a test!' ->>> codext.encode("This is a test!", "manchester-inverted") -'\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©' ->>> codext.decode("\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©", "ethernet") -'This is a test!' -``` - ------ - -### Rotate N bits - -This codec rotates of N bits each byte of an input string. - -!!! note "Lossless" - - This codec does not use the "`<<`" and "`>>`" operators as it is lossy in some cases. Instead, it rotates per group of 8 bits. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rotate` | text <-> N-bits-rotated text | `rotate-N`, `rotate_bits-N`, `rotate-right-N`, `rotate_left_N` | N belongs to [1,7] ; when nothing specified, it rotates to the right - -```python ->>> codext.encode("test", "rotate-1") -':29:' ->>> codext.encode("test", "rotatebits-1") -':29:' ->>> codext.encode("test", "rotate_right-1") -':29:' ->>> codext.encode("test", "rotate_left_1") -'èÊæè' -``` - +`codext` also adds common binary encodings. For instance, the Manchester code, that encodes digits, is applied to the ordinals of the input text and the resulting binary stream is converted back to characters. + +----- + +### Baudot + +It supports various formats such as CCITT-1 and CCITT-2, ITA1 and ITA2, and some others. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`baudot` | text <-> text | Baudot code bits | `baudot-ccitt1`, `baudot_ccitt2_lsb`, ... | supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... +`baudot-spaced` | text <-> Baudot code groups of bits | `baudot-spaced-ita1_lsb`, `baudot_spaced_ita2_msb`, ... | groups of 5 bits are whitespace-separated +`baudot-tape` | text <-> Baudot code tape | `baudot-tape-mtk2`, `baudot_tape_murray`, ... | outputs a string that looks like a perforated tape + +!!! note "LSB / MSB" + + "`_lsb`" or "`_msb`" can be specified in the codec name to set the bits order. If not specified, it defaults to MSB. + + +```python +>>> codext.encode("12345", "baudot-fr") +'010000000100010001000010100111' +>>> codext.decode("010000000100010001000010100111", "baudot-fr") +'12345' +``` + +```python +>>> codext.encode("TEST", "baudot-spaced_uk") +'10101 00010 10100 10101' +>>> codext.decode("10101 00010 10100 10101", "baudot-spaced_uk") +'TEST' +``` + +```python +>>> s = codext.encode("HELLO WORLD!", "baudot-tape_ita2") +>>> print(s) +***.** +* *. + . * +* .* +* .* +** . + *. +* .** +** . + * .* +* .* + * . * +** .** + **. * +>>> codext.decode(s, "baudot-tape_ita2") +'HELLO WORLD!' +``` + +----- + +### Binary Coded Decimal (BCD) + +It converts characters to their odrinals, left-pads with zeros, converts digits to 4-bits groups and then make characters with the assembled groups. It can also use a 4-bits prefix for making new characters. It then allows to define extended versions of BCD. 
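+
+As a rough illustration of these steps (a minimal sketch of the process described above, not the codec's actual implementation):
+
+```python
+# Sketch: ordinals -> zero-padded digits -> one 4-bit group per digit -> bytes.
+def bcd_sketch(text):
+    digits = "".join(str(ord(c)).zfill(3) for c in text)  # e.g. "T" -> "084"
+    digits += "0" * (len(digits) % 2)                     # pad to a whole number of bytes
+    nibbles = [int(d) for d in digits]
+    return bytes((nibbles[i] << 4) | nibbles[i + 1] for i in range(0, len(nibbles), 2))
+
+# bcd_sketch("Test") gives b'\x08A\x01\x11Q\x16', matching the `bcd` example below.
+```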
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`bcd` | text <-> BCD encoded text | `binary_coded_decimals` | +`bcd-extended0` | text <-> BCD encoded text using prefix `0000` | `bcd_ext0`, `bcd-extended-zeros`, `binary_coded_decimals_extended_0` | +`bcd-extended1` | text <-> BCD encoded text using prefix `1111` | `bcd_ext1`, `bcd-extended-ones`, `binary_coded_decimals_extended_1` | + +```python +>>> codext.encode("Test", "bcd") +'\x08A\x01\x11Q\x16' +>>> codext.decode("\x08A\x01\x11Q\x16", "binary_coded_decimal") +'Test' +>>> codext.encode("Test", "bcd_ext_zero") +'\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00' +>>> codext.decode("\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00", "bcd-ext0") +'Test' +>>> codext.encode("Test", "bcd_extended_ones") +'\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0' +>>> codext.decode("\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0", "bcd_ext1") +'Test' +``` + +----- + +### Excess-3 + +Also called *Stibitz code*, it converts characters to ordinals, left-pads with zeros and then applies Excess-3 (Stibitz) code to get groups of 4 bits that are finally reassembled into bytes. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`excess3` | text <-> XS3 encoded text | `excess-3`, `xs3`, `stibitz` | + +```python +>>> codext.encode("This is a test!", "excess-3") +';t7C\x84H6T8D\x83e<£eD\x944D\x84I6`' +>>> codext.decode(";t7C\x84H6T8D\x83e<£eD\x944D\x84I6`", "stibitz") +'This is a test!' +``` + +----- + +### Gray + +Also called *reflected binary code*, it implements the Gray code applied to characters while converted to bytes. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`gray` | text <-> gray encoded text | `reflected-bin`, `reflected_binary` | + +```python +>>> codext.encode("this is a test", "gray") +'N\\]J0]J0Q0NWJN' +>>> codext.decode("N\\]J0]J0Q0NWJN", "gray") +'this is a test' +>>> codext.encode("THIS IS A TEST", "gray") +'~lmz0mz0a0~gz~' +>>> codext.decode("~lmz0mz0a0~gz~", "gray") +'THIS IS A TEST' +``` + +----- + +### Manchester + +This codec XORes each group of 4 bits of the input text with a 1-byte clock signal, e.g. `0x55` giving in binary `01010101`. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`manchester` | text <-> manchester encoded text | | clock signal is `0x55` (`01010101`) +`manchester-inverted` | text <-> manchester encoded text | `ethernet`, `ieee802.4` | clock signal is `0xaa` (`10101010`) + +```python +>>> codext.encode("This is a test!", "manchester") +'fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV' +>>> codext.decode("fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV", "manchester") +'This is a test!' +>>> codext.encode("This is a test!", "manchester-inverted") +'\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©' +>>> codext.decode("\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©", "ethernet") +'This is a test!' +``` + +----- + +### Rotate N bits + +This codec rotates of N bits each byte of an input string. + +!!! note "Lossless" + + This codec does not use the "`<<`" and "`>>`" operators as it is lossy in some cases. Instead, it rotates per group of 8 bits. 
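+
+A minimal sketch of such a per-byte rotation (assuming each byte is rotated independently, which the examples below are consistent with):
+
+```python
+# Sketch: rotate every byte left by n bits, wrapping within 8 bits.
+def rotate_left(data, n):
+    n %= 8
+    return bytes(((b << n) | (b >> (8 - n))) & 0xFF for b in data)
+
+# rotate_left(b"test", 1) gives b'\xe8\xca\xe6\xe8' ('èÊæè'), like `rotate_left_1` below.
+```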
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`rotate` | text <-> N-bits-rotated text | `rotate-N`, `rotate_bits-N`, `rotate-right-N`, `rotate_left_N` | N belongs to [1,7] ; when nothing specified, it rotates to the right + +```python +>>> codext.encode("test", "rotate-1") +':29:' +>>> codext.encode("test", "rotatebits-1") +':29:' +>>> codext.encode("test", "rotate_right-1") +':29:' +>>> codext.encode("test", "rotate_left_1") +'èÊæè' +``` + diff --git a/docs/pages/enc/common.md b/docs/pages/enc/common.md index 34a566c..1739ca8 100644 --- a/docs/pages/enc/common.md +++ b/docs/pages/enc/common.md @@ -1,71 +1,69 @@ -## Common - -`codext` also provides some very common encodings, for the sake of simplicity (e.g. while chaining codecs with [the CLI tool](../cli.html)). - ------ - -### A1Z26 - -This simple codec converts letters to their order number in the alphabet using a separator between characters and keeping words separated by a whitespace. It is similar to the [`consonant-vowel-indices`](others.html#letter-indices) encoding. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`a1z26` | text <-> alphabet order numbers | `a1z26`, `a1z26-/`, `a1z26-,`, ... | this codec does not preserve the case and is dynamic (separator of characters in each word can be customized among these: "`-_/|,;:*`") - -```python ->>> codext.encode("This is a test", "a1z26") -'20-8-9-19 9-19 1 20-5-19-20' ->>> codext.decode("20-8-9-19 9-19 1 20-5-19-20", "a1z26") -'this is a test' -``` - ------ - -### Octal - -This simple codec converts characters into their octal values. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`octal` | text <-> octal digits | `octals` | groups of 3-chars octal values when encoded -`octal-spaced` | text <-> spaced octal digits | `octals-spaced` | whitespace-separated suite of variable-length groups of octal digits when encoded - -```python ->>> codext.encode("this is a test", "octal") -'164150151163040151163040141040164145163164' ->>> codext.decode("164150151163040151163040141040164145163164", "octals") -'this is a test' -``` - -```python ->>> codext.encode("this is a test", "octal-spaced") -'164 150 151 163 40 151 163 40 141 40 164 145 163 164' ->>> codext.decode("164 150 151 163 40 151 163 40 141 40 164 145 163 164", "octals-spaced") -'this is a test' -``` - ------ - -### Ordinal - -This simple codec converts characters into their ordinals. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`ordinal` | text <-> ordinal digits | `ordinals` | groups of 3-chars ordinal values when encoded -`ordinal-spaced` | text <-> spaced ordinal digits | `ordinals-spaced` | whitespace-separated suite of variable-length groups of ordinal digits when encoded - -```python ->>> codext.encode("this is a test", "ordinal") -'116104105115032105115032097032116101115116' ->>> codext.decode("116104105115032105115032097032116101115116", "ordinals") -'this is a test' -``` - -```python ->>> codext.encode("this is a test", "ordinal-spaced") -'116 104 105 115 32 105 115 32 97 32 116 101 115 116' ->>> codext.decode("116 104 105 115 32 105 115 32 97 32 116 101 115 116", "ordinals-spaced") -'this is a test' -``` - +`codext` also provides some very common encodings, for the sake of simplicity (e.g. while chaining codecs with [the CLI tool](../cli.html)). 
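+
+For instance, these codecs chain trivially with the library's `encode`/`decode` functions (a small sketch based on the `ordinal` codec documented below):
+
+```python
+>>> import codext
+>>> codext.encode("test", "ordinal")
+'116101115116'
+>>> codext.decode(codext.encode("test", "ordinal"), "ordinal")
+'test'
+```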
+ +----- + +### A1Z26 + +This simple codec converts letters to their order number in the alphabet using a separator between characters and keeping words separated by a whitespace. It is similar to the [`consonant-vowel-indices`](others.html#letter-indices) encoding. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`a1z26` | text <-> alphabet order numbers | `a1z26`, `a1z26-/`, `a1z26-,`, ... | this codec does not preserve the case and is dynamic (separator of characters in each word can be customized among these: "`-_/|,;:*`") + +```python +>>> codext.encode("This is a test", "a1z26") +'20-8-9-19 9-19 1 20-5-19-20' +>>> codext.decode("20-8-9-19 9-19 1 20-5-19-20", "a1z26") +'this is a test' +``` + +----- + +### Octal + +This simple codec converts characters into their octal values. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`octal` | text <-> octal digits | `octals` | groups of 3-chars octal values when encoded +`octal-spaced` | text <-> spaced octal digits | `octals-spaced` | whitespace-separated suite of variable-length groups of octal digits when encoded + +```python +>>> codext.encode("this is a test", "octal") +'164150151163040151163040141040164145163164' +>>> codext.decode("164150151163040151163040141040164145163164", "octals") +'this is a test' +``` + +```python +>>> codext.encode("this is a test", "octal-spaced") +'164 150 151 163 40 151 163 40 141 40 164 145 163 164' +>>> codext.decode("164 150 151 163 40 151 163 40 141 40 164 145 163 164", "octals-spaced") +'this is a test' +``` + +----- + +### Ordinal + +This simple codec converts characters into their ordinals. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`ordinal` | text <-> ordinal digits | `ordinals` | groups of 3-chars ordinal values when encoded +`ordinal-spaced` | text <-> spaced ordinal digits | `ordinals-spaced` | whitespace-separated suite of variable-length groups of ordinal digits when encoded + +```python +>>> codext.encode("this is a test", "ordinal") +'116104105115032105115032097032116101115116' +>>> codext.decode("116104105115032105115032097032116101115116", "ordinals") +'this is a test' +``` + +```python +>>> codext.encode("this is a test", "ordinal-spaced") +'116 104 105 115 32 105 115 32 97 32 116 101 115 116' +>>> codext.decode("116 104 105 115 32 105 115 32 97 32 116 101 115 116", "ordinals-spaced") +'this is a test' +``` + diff --git a/docs/pages/enc/compressions.md b/docs/pages/enc/compressions.md index a5437cf..5c4fd2e 100644 --- a/docs/pages/enc/compressions.md +++ b/docs/pages/enc/compressions.md @@ -1,5 +1,3 @@ -## Compressions - `codext` provides a few common compression codecs. ----- diff --git a/docs/pages/enc/crypto.md b/docs/pages/enc/crypto.md index e59ab0f..b189c0e 100644 --- a/docs/pages/enc/crypto.md +++ b/docs/pages/enc/crypto.md @@ -1,206 +1,204 @@ -## Cryptography - -`codext` also implements several simple cryptographic ciphers. But how does it relate to encoding while a key is required ? `codext` focuses on ciphers that have a weak key. With dynamically named encodings, it is then possible to define a bunch of encodings, one for each value of the key. For instance, Barbie Typewriter has a key with only 4 possible values. The `barbie` codec can then be `barbie-1`, ..., `barbie-4`. - -!!! note "Available masks" - - Some cipher codecs use character masks to generate their alphabets. Groups of characters are indicated using a headin "`?`". 
- - `a`: printable characters - `b`: all 8-bits chars - `d`: digits - `h`: lowercase hexadecimal - `H`: uppercase hexadecimal - `l`: lowercase letters - `p`: punctuation characters - `s`: whitespace - `u`: uppercase letters - - When combining masks, only one occurrence of each character is taken in the final alphabet. - - So, for instance, the following masks yield the following alphabets: - - - `?l?u?d?s`: "`abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 `" - - `?s.,?!?u?d`: "` .,?!ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789`" - ------ - -### Affine Cipher - -This codec implements the Affine monoalphabetic substitution cipher. It is parametrizable with a mask for generating the alphabet and the parameters `a` and `b`. By default, it uses mask "`lus`" and parameters `a=1` and `b=2` but it can be set as in the examples hereafter. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`affine` | text <-> affine ciphertext | `affine`, `affine_cipher-?l?u?d?s-5,8`, `affine-?s.,?!?u?d-23,6`, ... | Mask-generated alphabet ; uses default mask "`?l?u?s`" with `a=1` and `b=2` - -```python ->>> codext.encode("this is a test", "affine") -'vjkubkubcbvguv' ->>> codext.decode("vjkubkubcbvguv", "affine") -'this is a test' ->>> codext.encode("this is a test", "affine-?l?u?d?s-5,8") -'ORWJdWJdidOCJO' ->>> codext.decode("ORWJdWJdidOCJO", "affine-?l?u?d?s-5,8") -'this is a test' ->>> codext.encode("THIS IS A TEST", "affine-?s.,?!?u?d-5,8") -'AW1 D1 D2DAH A' ->>> codext.decode("AW1 D1 D2DAH A", "affine-?s.,?!?u?d-5,8") -'THIS IS A TEST' -``` - -!!! warning "Parameters `a` and `b`" - - Not all values are suitable for `a` and `b`. If a generated encoding map has mapping collisions, an exception is raised telling that `a` and `b` are bad. - ------ - -### Atbash Cipher - -It implements the monoalphabetic substitution cipher used for the Hebrew alphabet. By default, it considers the lowercase and uppercase letters, inverted per group, as the alphabet. It can also use a mask to extend it. Note that it does not generate any error for characters that are not part of the alphabet. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`atbash` | text <-> Atbash ciphertext | `atbash`, `atbash_cipher-?l?d?s`, ... | Mask-generated alphabet ; uses default mask "`?u?l`" - -```python ->>> codext.encode("this is a test", "atbash") -'gsrh rh z gvhg' ->>> codext.encode("this is a test", "atbash-[?l?u?p?s]") -'.^]/a]/a a.{/.' ->>> codext.decode(".^]/a]/a a.{/.", "atbash_cipher_[?l?u?p?s]") -'this is a test' -``` - ------ - -### Baconian Cipher - -It support only letters. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`bacon` | text <-> Bacon ciphertext | `bacon-cipher`, `baconian_cipher`, `bacon-01`, `bacon-10` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `ab`) - -```python ->>> codext.encode("this is a test", "bacon") -'baaba aabbb abaaa baaab abaaa baaab aaaaa baaba aabaa baaab baaba' ->>> codext.encode("this is a test", "bacon_01") -'10010 00111 01000 10001 01000 10001 00000 10010 00100 10001 10010' ->>> codext.decode("-..-. ..--- .-... -...- .-... -...- ..... -..-. ..-.. -...- -..-.", "bacon_.-") -'THIS IS A TEST' -``` - ------ - -### Barbie Typewriter - -It implements the cipher for its 4 different keys. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`barbie` | text <-> Barbie ciphertext | `barbie-1`, `barbie-2`, `barbie-3`, `barbie-4` - -```python ->>> codext.encode("this is a test", "barbie-1") -'hstf tf i hafh' ->>> codext.encode("this is a test", "barbie_3") -'fpsu su h ftuf' ->>> codext.decode("fpsu su h ftuf", "barbie-3") -'this is a test' -``` - ------ - -### Citrix CTX1 - -This implements the Citrix CTX1 password encoding algorithm. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`citrix` | text <-> Citrix CTX1 ciphertext | `citrix`, `citrix-1`, `citrix_ctx1` | - -```python ->>> codext.encode("this is a test", "citrix-ctx1") -'NBBMNAAGIDEPJJBMNIFNIMEMJKEL' ->>> codext.decode("NBBMNAAGIDEPJJBMNIFNIMEMJKEL", "citrix-ctx1") -'this is a test' -``` - ------ - -### Rail Fence Cipher - -This implements the Rail Fence encoding algorithm, using 3 rails and offset 0 as the default parameters. The encoding fence is built from the top ; the `up` flag can be used to build the fence from the bottom. Note that trying parameters that do not fit the input length will trigger a `ValueError` mentioning the bad value. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rail` | text <-> rail fence ciphertext, X rails and Y offset | `rail-X-Y`, `rail_X_Y`, `rail-X-Y-up`, `zigzag`, ... | - -```python ->>> codext.encode("this is a test", "zigzag") -'t ashsi etist' ->>> codext.encode("this is a test", "rail-5-3") -'it sss etiath ' ->>> codext.decode("it sss etiath ", "zigzag_5-3") -'this is a test' -``` - ------ -### ROT N - -This is a dynamic encoding, that is, it can be called with an integer to define the ROT offset. Encoding will apply a positive offset, decoding will apply a negative one. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rot` | text <-> rot(1) ciphertext | `rot1`, `rot-1`, `rot_25`, `caesar13` | Dynamic ROT parameter ; belongs to [1, 26[ -`rot47` | text <-> rot47 ciphertext | | - -```python ->>> codext.encode("this is a test", "rot-15") -'iwxh xh p ithi' ->>> codext.encode("iwxh xh p ithi", "rot20") -'cqrb rb j cnbc' ->>> codext.decode("cqrb rb j cnbc", "rot_9") -'this is a test' -``` - ------ - -### Shift - -This is a dynamic encoding, that is, it can be called with an integer to define the shift offset. Encoding will apply a positive offset, decoding will apply a negative one. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`shift` | text <-> shift(1) ciphertext | `shift1`, `shift-158`, `shift_255` | Dynamic shift parameter ; belongs to [1, 256[ - -```python ->>> codext.encode("this is a test", "shift-3") -'wklv#lv#d#whvw' ->>> codext.decode("wklv#lv#d#whvw", "shift10") -'mabl\x19bl\x19Z\x19m^lm' ->>> codext.encode("mabl\x19bl\x19Z\x19m^lm", "ordshift_7") -'this is a test' -``` - ------ - -### XOR with 1 byte - -This is a dynamic encoding, that is, it can be called with an integer to define the ordinal of the byte to XOR with the input text. 
-
-**Codec** | **Conversions** | **Aliases** | **Comment**
-:---: | :---: | --- | ---
-`xor` | text <-> XOR(1) ciphertext | `XOR1`, `xor22`, `xor-158`, `xor_255` | Dynamic XOR parameter ; belongs to [1, 256[
-
-```python
->>> codext.encode("this is a test", "xor-10")
-'~bcy*cy*k*~oy~'
->>> codext.encode("this is a test", "xor-30")
-'jvwm>wm>\x7f>j{mj'
->>> codext.decode("this is a test", "xor-30")
-'jvwm>wm>\x7f>j{mj'
->>> codext.encode("~bcy*cy*k*~oy~", "xor-10")
-'this is a test'
-```
-
+`codext` also implements several simple cryptographic ciphers. But how does this relate to encoding when a key is required? `codext` focuses on ciphers that have a weak key. With dynamically named encodings, it is then possible to define a bunch of encodings, one for each value of the key. For instance, Barbie Typewriter has a key with only 4 possible values. The `barbie` codec can then be `barbie-1`, ..., `barbie-4`.
+
+!!! note "Available masks"
+
+    Some cipher codecs use character masks to generate their alphabets. Groups of characters are indicated using a leading "`?`".
+
+    `a`: printable characters
+    `b`: all 8-bits chars
+    `d`: digits
+    `h`: lowercase hexadecimal
+    `H`: uppercase hexadecimal
+    `l`: lowercase letters
+    `p`: punctuation characters
+    `s`: whitespace
+    `u`: uppercase letters
+
+    When combining masks, only one occurrence of each character is kept in the final alphabet.
+
+    So, for instance, the following masks yield the following alphabets:
+
+    - `?l?u?d?s`: "`abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 `"
+    - `?s.,?!?u?d`: "` .,?!ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789`"
+
+-----
+
+### Affine Cipher
+
+This codec implements the Affine monoalphabetic substitution cipher. It is parametrizable with a mask for generating the alphabet and the parameters `a` and `b`. By default, it uses mask "`?l?u?s`" and parameters `a=1` and `b=2`, but these can be set as in the examples hereafter.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`affine` | text <-> affine ciphertext | `affine`, `affine_cipher-?l?u?d?s-5,8`, `affine-?s.,?!?u?d-23,6`, ... | Mask-generated alphabet ; uses default mask "`?l?u?s`" with `a=1` and `b=2`
+
+```python
+>>> codext.encode("this is a test", "affine")
+'vjkubkubcbvguv'
+>>> codext.decode("vjkubkubcbvguv", "affine")
+'this is a test'
+>>> codext.encode("this is a test", "affine-?l?u?d?s-5,8")
+'ORWJdWJdidOCJO'
+>>> codext.decode("ORWJdWJdidOCJO", "affine-?l?u?d?s-5,8")
+'this is a test'
+>>> codext.encode("THIS IS A TEST", "affine-?s.,?!?u?d-5,8")
+'AW1 D1 D2DAH A'
+>>> codext.decode("AW1 D1 D2DAH A", "affine-?s.,?!?u?d-5,8")
+'THIS IS A TEST'
+```
+
+!!! warning "Parameters `a` and `b`"
+
+    Not all values are suitable for `a` and `b`. If a generated encoding map has mapping collisions, an exception is raised indicating that `a` and `b` are bad.
+
+-----
+
+### Atbash Cipher
+
+It implements the monoalphabetic substitution cipher used for the Hebrew alphabet. By default, it considers the lowercase and uppercase letters, inverted per group, as the alphabet. It can also use a mask to extend it. Note that it does not generate any error for characters that are not part of the alphabet.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`atbash` | text <-> Atbash ciphertext | `atbash`, `atbash_cipher-?l?d?s`, ... | Mask-generated alphabet ; uses default mask "`?u?l`"
+
+```python
+>>> codext.encode("this is a test", "atbash")
+'gsrh rh z gvhg'
+>>> codext.encode("this is a test", "atbash-[?l?u?p?s]")
+'.^]/a]/a a.{/.'
+>>> codext.decode(".^]/a]/a a.{/.", "atbash_cipher_[?l?u?p?s]")
+'this is a test'
+```
+
+-----
+
+### Baconian Cipher
+
+It supports only letters.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`bacon` | text <-> Bacon ciphertext | `bacon-cipher`, `baconian_cipher`, `bacon-01`, `bacon-10` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `ab`)
+
+```python
+>>> codext.encode("this is a test", "bacon")
+'baaba aabbb abaaa baaab abaaa baaab aaaaa baaba aabaa baaab baaba'
+>>> codext.encode("this is a test", "bacon_01")
+'10010 00111 01000 10001 01000 10001 00000 10010 00100 10001 10010'
+>>> codext.decode("-..-. ..--- .-... -...- .-... -...- ..... -..-. ..-.. -...- -..-.", "bacon_.-")
+'THIS IS A TEST'
+```
+
+-----
+
+### Barbie Typewriter
+
+It implements the cipher for its 4 different keys.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`barbie` | text <-> Barbie ciphertext | `barbie-1`, `barbie-2`, `barbie-3`, `barbie-4`
+
+```python
+>>> codext.encode("this is a test", "barbie-1")
+'hstf tf i hafh'
+>>> codext.encode("this is a test", "barbie_3")
+'fpsu su h ftuf'
+>>> codext.decode("fpsu su h ftuf", "barbie-3")
+'this is a test'
+```
+
+-----
+
+### Citrix CTX1
+
+This implements the Citrix CTX1 password encoding algorithm.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`citrix` | text <-> Citrix CTX1 ciphertext | `citrix`, `citrix-1`, `citrix_ctx1` |
+
+```python
+>>> codext.encode("this is a test", "citrix-ctx1")
+'NBBMNAAGIDEPJJBMNIFNIMEMJKEL'
+>>> codext.decode("NBBMNAAGIDEPJJBMNIFNIMEMJKEL", "citrix-ctx1")
+'this is a test'
+```
+
+-----
+
+### Rail Fence Cipher
+
+This implements the Rail Fence encoding algorithm, using 3 rails and offset 0 as the default parameters. The encoding fence is built from the top ; the `up` flag can be used to build the fence from the bottom. Note that trying parameters that do not fit the input length will trigger a `ValueError` mentioning the bad value.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`rail` | text <-> rail fence ciphertext, X rails and Y offset | `rail-X-Y`, `rail_X_Y`, `rail-X-Y-up`, `zigzag`, ... |
+
+```python
+>>> codext.encode("this is a test", "zigzag")
+'t ashsi etist'
+>>> codext.encode("this is a test", "rail-5-3")
+'it sss etiath '
+>>> codext.decode("it sss etiath ", "zigzag_5-3")
+'this is a test'
+```
+
+-----
+
+### ROT N
+
+This is a dynamic encoding, that is, it can be called with an integer to define the ROT offset. Encoding will apply a positive offset, decoding will apply a negative one.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`rot` | text <-> rot(1) ciphertext | `rot1`, `rot-1`, `rot_25`, `caesar13` | Dynamic ROT parameter ; belongs to [1, 26[
+`rot47` | text <-> rot47 ciphertext | |
+
+```python
+>>> codext.encode("this is a test", "rot-15")
+'iwxh xh p ithi'
+>>> codext.encode("iwxh xh p ithi", "rot20")
+'cqrb rb j cnbc'
+>>> codext.decode("cqrb rb j cnbc", "rot_9")
+'this is a test'
+```
+
+-----
+
+### Shift
+
+This is a dynamic encoding, that is, it can be called with an integer to define the shift offset. 
Encoding will apply a positive offset, decoding will apply a negative one. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`shift` | text <-> shift(1) ciphertext | `shift1`, `shift-158`, `shift_255` | Dynamic shift parameter ; belongs to [1, 256[ + +```python +>>> codext.encode("this is a test", "shift-3") +'wklv#lv#d#whvw' +>>> codext.decode("wklv#lv#d#whvw", "shift10") +'mabl\x19bl\x19Z\x19m^lm' +>>> codext.encode("mabl\x19bl\x19Z\x19m^lm", "ordshift_7") +'this is a test' +``` + +----- + +### XOR with 1 byte + +This is a dynamic encoding, that is, it can be called with an integer to define the ordinal of the byte to XOR with the input text. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`xor` | text <-> XOR(1) ciphertext | `XOR1`, `xor22`, `xor-158`, `xor_255` | Dynamic XOR parameter ; belongs to [1, 256[ + +```python +>>> codext.encode("this is a test", "xor-10") +'~bcy*cy*k*~oy~' +>>> codext.encode("this is a test", "xor-30") +'jvwm>wm>\x7f>j{mj' +>>> codext.decode("this is a test", "xor-30") +'jvwm>wm>\x7f>j{mj' +>>> codext.encode("~bcy*cy*k*~oy~", "xor-10") +'this is a test' +``` + diff --git a/docs/pages/enc/hashing.md b/docs/pages/enc/hashing.md index d1b0298..0f6f151 100644 --- a/docs/pages/enc/hashing.md +++ b/docs/pages/enc/hashing.md @@ -1,5 +1,3 @@ -## Hashing - `codext` provides hash functions through the `.encode(...)` API for convenience (e.g. while chaining codecs with [the CLI tool](../cli.html)). ----- diff --git a/docs/pages/enc/languages.md b/docs/pages/enc/languages.md index 3735d15..9aa805c 100644 --- a/docs/pages/enc/languages.md +++ b/docs/pages/enc/languages.md @@ -1,199 +1,197 @@ -## Languages - -`codext` also adds some common languages for encoding. - ------ - -### Braille - -It supports letters, digits and some special characters. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`braille` | text <-> braille symbols | | Python 3 only - -```python ->>> codext.encode("this is a test", "braille") -'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' ->>> codext.encode("THIS IS A TEST", "braille") -'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' ->>> codext.decode("⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞", "braille") -'this is a test' -``` - ------ - -### Galactic - -This implements the [Minecraft's enchanting table](https://www.thegamer.com/minecraft-enchantment-table-language-guide/) using resembling Unicode characters. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`galactic` | text <-> Minecraft enchantment symbols | `galactic-alphabet`, `minecraft_enchantment`, `minecraft-enchanting-language` | Python 3 only - -```python ->>> codext.encode("this is a test", "galactic") -'ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ' ->>> codext.decode("ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ", "galactic") -'this is a test' -``` - ------ - -### Ipsum - -This implements a codec that uses lorem ipsum words. It selects random words per letter and keeps the following punctuations: "`.,:;+=-*/\\`". - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`ipsum` | text <-> latin words | `loremipsum`, `lorem-ipsum` | words from the classical lorem ipsum - -```python ->>> codext.encode("This is a test.", "ipsum") -'Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.' ->>> codext.decode("Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.", "lorem-ipsum") -'This is a test.' -``` - ------ - -### Leetspeak - -This implements a very basic ruleset of elite speaking. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`leetspeak` | text <-> leetspeak encoded text | `leet`, `1337`, `leetspeak` | based on minimalistic elite speaking rules - -```python ->>> codext.encode("this is a test", "leetspeak") -'7h15 15 4 7357' ->>> codext.decode("7h15 15 4 7357", "leetspeak") -'ThIS IS A TEST' -``` - ------ - -### Morse - -It supports of course letters and digits, but also a few special characters: `.,;:?!/\\@&=-_'" $()`. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`morse` | text <-> morse encoded text | none | uses whitespace as a separator, dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `/-.`) - -```python ->>> codext.encode("this is a test", "morse") -'- .... .. ... / .. ... / .- / - . ... -' ->>> codext.encode("this is a test", "morse/-.") -'- .... .. ... / .. ... / .- / - . ... -' ->>> codext.encode("this is a test", "morse_ABC") -'B CCCC CC CCC A CC CCC A CB A B C CCC B' ->>> codext.decode("- .... .. ... / .. ... / .- / - . ... -", "morse") -'this is a test' ->>> with codext.open("morse.txt", 'w', encoding="morse") as f: - f.write("this is a test") -14 ->>> with codext.open("morse.txt", encoding="morse") as f: - f.read() -'this is a test' -``` - ------ - -### Navajo - -It implements the letters from the [Navajo Code Talkers' Dictionary](https://www.history.navy.mil/research/library/online-reading-room/title-list-alphabetically/n/navajo-code-talker-dictionary.html). It conserves digits and newlines. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`navajo` | text <-> Navajo | | - -```python ->>> import codext ->>> codext.encode("this is a test 123", "navajo") -'a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3' ->>> codext.decode("a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3", "navajo") -'this is a test 123' -``` - ------ - -### Radio Alphabet - -This is also known as the [NATO phonetic alphabet](https://en.wikipedia.org/wiki/NATO_phonetic_alphabet). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`radio` | text <-> radio alphabet words | `military_alphabet`, `nato-phonetic-alphabet`, `radio-alphabet` | - -```python ->>> codext.encode("foobar", "nato_phonetic_alphabet") -'Foxtrot Oscar Oscar Bravo Alpha Romeo' ->>> codext.decode("Foxtrot Oscar Oscar Bravo Alpha Romeo", "radio-alphabet") -'FOOBAR' -``` - ------ - -### Southpark - -This encodes text according to Kenny's language in Southpark. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`southpark` | text <-> Kenny's language | `kenny` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `fFMmpP`) -`southpark-icase` | text <-> Kenny's language | `kenny_icase` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `FMP`) - -```python ->>> codext.encode("This is a Test", "southpark") -'FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp' ->>> codext.decode('FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp', "kenny") -'This is a Test' ->>> codext.encode("This is a test", "kenny_123456") -'245415411144111411144211444111145455144145' ->>> codext.decode("245415411144111411144211444111145455144145", "kenny-123456") -'This is a test' ->>> codext.encode("this is a test", "kenny_icase") -'FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP' ->>> codext.decode("FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP", "southpark-icase") -'this is a test' ->>> codext.encode("this is a test", "southpark-icase_123") -'123213211122111211122111222111123233122123' ->>> codext.decode('123213211122111211122111222111123233122123', "kenny_icase-123") -'this is a test' -``` - ------ - -### Tap - -This codec implements the [tap/knock code](https://en.wikipedia.org/wiki/Tap_code) commonly used by prisoners. It uses 25 letters, "*k*" is encoded to the same token than "*c*". - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`tap` | text <-> tap/knock encoded text | `knock`, `tap-code` | uses a large Unicode whitespace as a token separator ; Python 3 only - -```python ->>> codext.encode("this is a test", "tap") -'.... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....' ->>> codext.decode(".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....", "knock") -'this is a test' -``` - ------ - -### Tom-Tom - -This codec is similar to morse. It converts text into slashes and backslashes. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`tomtom` | text <-> tom-tom encoded text | `tom-tom` | uses "`|`" as a separator - -```python ->>> codext.encode("this is a test", "tom-tom") -'\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\' ->>> codext.decode("\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\", "tomtom") -'THIS IS A TEST' -``` +`codext` also adds some common languages for encoding. + +----- + +### Braille + +It supports letters, digits and some special characters. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`braille` | text <-> braille symbols | | Python 3 only + +```python +>>> codext.encode("this is a test", "braille") +'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' +>>> codext.encode("THIS IS A TEST", "braille") +'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' +>>> codext.decode("⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞", "braille") +'this is a test' +``` + +----- + +### Galactic + +This implements the [Minecraft's enchanting table](https://www.thegamer.com/minecraft-enchantment-table-language-guide/) using resembling Unicode characters. 
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`galactic` | text <-> Minecraft enchantment symbols | `galactic-alphabet`, `minecraft_enchantment`, `minecraft-enchanting-language` | Python 3 only
+
+```python
+>>> codext.encode("this is a test", "galactic")
+'ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ'
+>>> codext.decode("ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ", "galactic")
+'this is a test'
+```
+
+-----
+
+### Ipsum
+
+This implements a codec that uses lorem ipsum words. It selects random words per letter and keeps the following punctuation characters: "`.,:;+=-*/\\`".
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`ipsum` | text <-> latin words | `loremipsum`, `lorem-ipsum` | words from the classical lorem ipsum
+
+```python
+>>> codext.encode("This is a test.", "ipsum")
+'Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.'
+>>> codext.decode("Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.", "lorem-ipsum")
+'This is a test.'
+```
+
+-----
+
+### Leetspeak
+
+This implements a very basic ruleset of elite speaking.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`leetspeak` | text <-> leetspeak encoded text | `leet`, `1337`, `leetspeak` | based on minimalistic elite speaking rules
+
+```python
+>>> codext.encode("this is a test", "leetspeak")
+'7h15 15 4 7357'
+>>> codext.decode("7h15 15 4 7357", "leetspeak")
+'ThIS IS A TEST'
+```
+
+-----
+
+### Morse
+
+It supports letters and digits, of course, but also a few special characters: `.,;:?!/\\@&=-_'" $()`.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`morse` | text <-> morse encoded text | none | uses whitespace as a separator, dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `/-.`)
+
+```python
+>>> codext.encode("this is a test", "morse")
+'- .... .. ... / .. ... / .- / - . ... -'
+>>> codext.encode("this is a test", "morse/-.")
+'- .... .. ... / .. ... / .- / - . ... -'
+>>> codext.encode("this is a test", "morse_ABC")
+'B CCCC CC CCC A CC CCC A CB A B C CCC B'
+>>> codext.decode("- .... .. ... / .. ... / .- / - . ... -", "morse")
+'this is a test'
+>>> with codext.open("morse.txt", 'w', encoding="morse") as f:
+    f.write("this is a test")
+14
+>>> with codext.open("morse.txt", encoding="morse") as f:
+    f.read()
+'this is a test'
+```
+
+-----
+
+### Navajo
+
+It implements the letters from the [Navajo Code Talkers' Dictionary](https://www.history.navy.mil/research/library/online-reading-room/title-list-alphabetically/n/navajo-code-talker-dictionary.html). It preserves digits and newlines.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`navajo` | text <-> Navajo | |
+
+```python
+>>> import codext
+>>> codext.encode("this is a test 123", "navajo")
+'a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3'
+>>> codext.decode("a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3", "navajo")
+'this is a test 123'
+```
+
+-----
+
+### Radio Alphabet
+
+This is also known as the [NATO phonetic alphabet](https://en.wikipedia.org/wiki/NATO_phonetic_alphabet). 
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`radio` | text <-> radio alphabet words | `military_alphabet`, `nato-phonetic-alphabet`, `radio-alphabet` |
+
+```python
+>>> codext.encode("foobar", "nato_phonetic_alphabet")
+'Foxtrot Oscar Oscar Bravo Alpha Romeo'
+>>> codext.decode("Foxtrot Oscar Oscar Bravo Alpha Romeo", "radio-alphabet")
+'FOOBAR'
+```
+
+-----
+
+### Southpark
+
+This encodes text according to Kenny's language in Southpark.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`southpark` | text <-> Kenny's language | `kenny` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `fFMmpP`)
+`southpark-icase` | text <-> Kenny's language | `kenny_icase` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `FMP`)
+
+```python
+>>> codext.encode("This is a Test", "southpark")
+'FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp'
+>>> codext.decode('FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp', "kenny")
+'This is a Test'
+>>> codext.encode("This is a test", "kenny_123456")
+'245415411144111411144211444111145455144145'
+>>> codext.decode("245415411144111411144211444111145455144145", "kenny-123456")
+'This is a test'
+>>> codext.encode("this is a test", "kenny_icase")
+'FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP'
+>>> codext.decode("FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP", "southpark-icase")
+'this is a test'
+>>> codext.encode("this is a test", "southpark-icase_123")
+'123213211122111211122111222111123233122123'
+>>> codext.decode('123213211122111211122111222111123233122123', "kenny_icase-123")
+'this is a test'
+```
+
+-----
+
+### Tap
+
+This codec implements the [tap/knock code](https://en.wikipedia.org/wiki/Tap_code) commonly used by prisoners. It uses 25 letters ; "*k*" is encoded to the same token as "*c*".
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`tap` | text <-> tap/knock encoded text | `knock`, `tap-code` | uses a large Unicode whitespace as a token separator ; Python 3 only
+
+```python
+>>> codext.encode("this is a test", "tap")
+'.... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....'
+>>> codext.decode(".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....", "knock")
+'this is a test'
+```
+
+-----
+
+### Tom-Tom
+
+This codec is similar to morse. It converts text into slashes and backslashes.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`tomtom` | text <-> tom-tom encoded text | `tom-tom` | uses "`|`" as a separator
+
+```python
+>>> codext.encode("this is a test", "tom-tom")
+'\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\'
+>>> codext.decode("\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\", "tomtom")
+'THIS IS A TEST'
+```
diff --git a/docs/pages/enc/stegano.md b/docs/pages/enc/stegano.md
index 57dfb18..1a3a5fa 100644
--- a/docs/pages/enc/stegano.md
+++ b/docs/pages/enc/stegano.md
@@ -1,123 +1,121 @@
-## Steganography
-
-`codext` defines a few steganography-related encodings. While encoding is not really steganography (that is, concealing data within data), the following codecs are worth creating this category as they relate to converting data into something that could mislead the unaware reader. 
- ------ - -### Hexagrams (I Ching) - -This uses Base64 and then encodes output characters to [I Ching Hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) such that implemented [here](https://github.com/qntm/hexagram-encode). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`hexagram` | text <-> hexagrams-encoded Base64 | `hexagrams`, `i-ching-hexagrams`, `iching` | Python3 only - -```python ->>> codext.encode("this is a test", "hexagram") -'䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯' ->>> codext.decode("䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯", "iching") -'this is a test' -``` - ------ - -### Klopf Code - -This is a Polybius code with the trivial alphabetical distribution ("A" -> (1,1), "B" -> (2,1), ...). This can be tested [here](https://gc.de/gc/klopfcode/). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`klopf` | text <-> klopf encoded text | `klopfcode` | - -```python ->>> codext.encode("this is a test", "klopf") -'44324234 4234 11 44513444' ->>> codext.decode("44324234 4234 11 44513444", "klopf") -'THIS IS A TEST' -``` - ------ - -### Resistor Color Codes - -This uses the [electronic color code](https://en.wikipedia.org/wiki/Electronic_color_code#Resistor_color-coding) to encode digits, displaying colors in the terminal with ANSI color codes. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`resistor` | text <-> resistor colors | `condensator`, `resistors-color`, `resistor_color_code` | visually, it only works in a terminal supporting ANSI color codes - -```python ->>> codext.encode("1234", "resistor") -'\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m' ->>> codext.decode("\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m", "condensators_color") -'1234' -``` - ------ - -### Rick Cipher - -This converts letters to words from Rick Astley's famous song "*Never gonna give you up*". - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rick` | text <-> words from Risk's song | `rick-astley`, `rick_cipher`, `rick-astley-cipher` | case-insensitive while encoding - -```python ->>> codext.encode("Test String", "rick") -'TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna' ->>> codext.decode("TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna", "rick") -'TEST STRING' -``` - ------ - -### SMS (T9) - -This codec implements the SMS encoding, also caled T9, that is the conversion from characters to their corresponding phone keystrokes. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`sms` | text <-> phone keystrokes | `nokia`, `nokia_3310`, `t9` | uses "`-`" as a separator for encoding, "`-`" or "`_`" or whitespace for decoding - -```python ->>> codext.encode("this is a test", "sms") -'8-44-444-7777-0-444-7777-0-2-0-8-33-7777-8' ->>> codext.decode("8_44_444_7777_0_444_7777_0_2_0_8_33_7777_8", "nokia") -'this is a test' ->>> codext.decode("8_44_444_7777_0-444-7777_0-2_0_8_33-7777-8", "t9") -'this is a test' -``` - ------ - -### Whitespaces - -This simple encoding replaces zeros and ones of the binary version of the input text with spaces and tabs. It is supported either with its original mapping or with the inverted mapping. - -!!! warning "Encoding, not programming !" - - This should not be confused with the [whitespace esoteric language](https://en.wikipedia.org/wiki/Whitespace_(programming_language)). 
-
-**Codec** | **Conversions** | **Aliases** | **Comment**
-:---: | :---: | --- | ---
-`whitespace` | text <-> whitespaces and tabs | `whitespaces?-inv(erted)?` | The default encoding uses tabs for zeros and spaces for ones
-`whitespace_after_before` | text <-> whitespaces[letter]whitespaces | | This codec encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. "`whitespace+2*after-3*before`")
-
-```python
->>> codext.encode("test", "whitespace")
-'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t'
->>> codext.encode("test", "whitespaces")
-'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t'
->>> codext.encode("test", "whitespaces-inv")
-' \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t '
->>> codext.decode(" \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t ", "whitespaces_inverted")
-'test'
-```
-
-```python
->>> codext.encode("test", "whitespace+after-before")
-' m \n l \n u \n m '
->>> codext.decode(" m \n l \n u \n m ", "whitespace+after-before")
-'test'
-```
+`codext` defines a few steganography-related encodings. While encoding is not really steganography (that is, concealing data within data), the following codecs deserve this category as they convert data into something that could mislead an unaware reader.
+
+-----
+
+### Hexagrams (I Ching)
+
+This uses Base64 and then encodes output characters to [I Ching Hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) such as implemented [here](https://github.com/qntm/hexagram-encode).
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`hexagram` | text <-> hexagrams-encoded Base64 | `hexagrams`, `i-ching-hexagrams`, `iching` | Python3 only
+
+```python
+>>> codext.encode("this is a test", "hexagram")
+'䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯'
+>>> codext.decode("䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯", "iching")
+'this is a test'
+```
+
+-----
+
+### Klopf Code
+
+This is a Polybius code with the trivial alphabetical distribution ("A" -> (1,1), "B" -> (2,1), ...). This can be tested [here](https://gc.de/gc/klopfcode/).
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`klopf` | text <-> klopf encoded text | `klopfcode` |
+
+```python
+>>> codext.encode("this is a test", "klopf")
+'44324234 4234 11 44513444'
+>>> codext.decode("44324234 4234 11 44513444", "klopf")
+'THIS IS A TEST'
+```
+
+-----
+
+### Resistor Color Codes
+
+This uses the [electronic color code](https://en.wikipedia.org/wiki/Electronic_color_code#Resistor_color-coding) to encode digits, displaying colors in the terminal with ANSI color codes.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`resistor` | text <-> resistor colors | `condensator`, `resistors-color`, `resistor_color_code` | visually, it only works in a terminal supporting ANSI color codes
+
+```python
+>>> codext.encode("1234", "resistor")
+'\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m'
+>>> codext.decode("\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m", "condensators_color")
+'1234'
+```
+
+-----
+
+### Rick Cipher
+
+This converts letters to words from Rick Astley's famous song "*Never gonna give you up*". 
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`rick` | text <-> words from Rick's song | `rick-astley`, `rick_cipher`, `rick-astley-cipher` | case-insensitive while encoding
+
+```python
+>>> codext.encode("Test String", "rick")
+'TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna'
+>>> codext.decode("TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna", "rick")
+'TEST STRING'
+```
+
+-----
+
+### SMS (T9)
+
+This codec implements the SMS encoding, also called T9, that is the conversion from characters to their corresponding phone keystrokes.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`sms` | text <-> phone keystrokes | `nokia`, `nokia_3310`, `t9` | uses "`-`" as a separator for encoding, "`-`" or "`_`" or whitespace for decoding
+
+```python
+>>> codext.encode("this is a test", "sms")
+'8-44-444-7777-0-444-7777-0-2-0-8-33-7777-8'
+>>> codext.decode("8_44_444_7777_0_444_7777_0_2_0_8_33_7777_8", "nokia")
+'this is a test'
+>>> codext.decode("8_44_444_7777_0-444-7777_0-2_0_8_33-7777-8", "t9")
+'this is a test'
+```
+
+-----
+
+### Whitespaces
+
+This simple encoding replaces zeros and ones of the binary version of the input text with spaces and tabs. It is supported either with its original mapping or with the inverted mapping.
+
+!!! warning "Encoding, not programming!"
+
+    This should not be confused with the [whitespace esoteric language](https://en.wikipedia.org/wiki/Whitespace_(programming_language)).
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`whitespace` | text <-> whitespaces and tabs | `whitespaces?-inv(erted)?` | The default encoding uses tabs for zeros and spaces for ones
+`whitespace_after_before` | text <-> whitespaces[letter]whitespaces | | This codec encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. "`whitespace+2*after-3*before`")
+
+```python
+>>> codext.encode("test", "whitespace")
+'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t'
+>>> codext.encode("test", "whitespaces")
+'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t'
+>>> codext.encode("test", "whitespaces-inv")
+' \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t '
+>>> codext.decode(" \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t ", "whitespaces_inverted")
+'test'
+```
+
+```python
+>>> codext.encode("test", "whitespace+after-before")
+' m \n l \n u \n m '
+>>> codext.decode(" m \n l \n u \n m ", "whitespace+after-before")
+'test'
+```
diff --git a/docs/pages/enc/web.md b/docs/pages/enc/web.md
index 80c6a20..4477a1f 100644
--- a/docs/pages/enc/web.md
+++ b/docs/pages/enc/web.md
@@ -1,40 +1,38 @@
-## Web
-
-`codext` implements some common Web-related encodings.
-
------
-
-### HTML Entities
-
-This implements the full list of characters available at [this reference](https://dev.w3.org/html5/html-author/charref).
-
-**Codec** | **Conversions** | **Aliases** | **Comment**
-:---: | :---: | --- | ---
-`html` | text <-> HTML entities | `html-entity`, `html_entities` | implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref)
-
-```python
->>> codext.encode("Тħĩş Їś ą Ţêšŧ", "html")
-'Тħĩş Їś ą Ţêšŧ'
->>> codext.decode("Тħĩş Їś ą Ţêšŧ", "html-entities")
-'Тħĩş Їś ą Ţêšŧ'
-```
-
------
-
-### URL
-
-This handles URL encoding, regardless of the case when decoding and with no error. 
-
-**Codec** | **Conversions** | **Aliases** | **Comment**
-:---: | :---: | --- | ---
-`url` | text <-> URL encoded text | `url`, `urlencode` |
-
-```python
->>> codecs.encode("?=this/is-a_test/../", "url")
-'%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F'
->>> codext.decode("%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F", "urlencode")
-'?=this/is-a_test/../'
->>> codext.decode("%3f%3dthis%2fis-a_test%2f%2e%2e%2f", "urlencode")
-'?=this/is-a_test/../'
-```
-
+`codext` implements some common Web-related encodings.
+
+-----
+
+### HTML Entities
+
+This implements the full list of characters available at [this reference](https://dev.w3.org/html5/html-author/charref).
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`html` | text <-> HTML entities | `html-entity`, `html_entities` | implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref)
+
+```python
+>>> codext.encode("Тħĩş Їś ą Ţêšŧ", "html")
+'Тħĩş Їś ą Ţêšŧ'
+>>> codext.decode("Тħĩş Їś ą Ţêšŧ", "html-entities")
+'Тħĩş Їś ą Ţêšŧ'
+```
+
+-----
+
+### URL
+
+This handles URL encoding ; decoding is case-insensitive and triggers no error.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`url` | text <-> URL encoded text | `url`, `urlencode` |
+
+```python
+>>> codext.encode("?=this/is-a_test/../", "url")
+'%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F'
+>>> codext.decode("%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F", "urlencode")
+'?=this/is-a_test/../'
+>>> codext.decode("%3f%3dthis%2fis-a_test%2f%2e%2e%2f", "urlencode")
+'?=this/is-a_test/../'
+```
+
diff --git a/docs/pages/features.md b/docs/pages/features.md
index 11316f0..02b375b 100644
--- a/docs/pages/features.md
+++ b/docs/pages/features.md
@@ -1,338 +1,336 @@
-## Features
-
-Basically, the `codecs` library provides a series of functions from the built-in `_codecs` library which maintains a registry of search functions (a simple list) that maps ancodings to the right de/encode functions by returning a `CodecInfo` object once first matched.
-
-`codext` hooks `codecs`'s functions to insert its own proxy registry between the function calls and the native registry so that new encodings can be added or replace existing ones while using `code[cs|xt].open`. Indeed, as the proxy registry is called first, the first possible match occurs in a custom codec, while if not existing, the native registry is used.
-
-!!! note "The `open` built-in function"
-
-    Two behaviors are to be considered when using `codext`:
-
-    1. Encodings added from `codext` are only added to the proxy codecs registry of `codext` and are NOT available using `open(...)` (but well using `code[cs|xt].open(...)`.
-    2. Encodings added from `codecs` are added to the proxy registry AND ALSO to the native registry and are therefore available using `open(...)`.
-
-    This difference allows to keep encodings added from `codext` removable while these added from `codecs` are not. This is the consequence from the fact that there is no unregister function in the native `_codecs` library.
-
-!!! warning "Lossy conversion"
-
-    Some encodings are lossy, meaning that it is not always possible to decode back to the exact start string. This should be considered especially when chaining codecs.
-
------
-
-### Add a custom encoding
-
-New codecs can be added easily using the new function `add`. 
- -```python ->>> import codext ->>> help(codext.add) -Help on function add in module codext.__common__: - -add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False) - This adds a new codec to the codecs module setting its encode and/or decode - functions, eventually dynamically naming the encoding with a pattern and - with file handling (if text is True). - - :param ename: encoding name - :param encode: encoding function or None - :param decode: decoding function or None - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the - built-in open(...) but will make it impossible - to remove the codec later - -``` - -Here is a simple example of how to add a basic codec: - -```python -import codext - -def mycodec_encode(text, errors="strict"): - # do some encoding stuff - return encoded, len(text) - -def mycodec_decode(text, errors="strict"): - # do some decoding stuff - return decoded, len(text) - -codext.add("mycodec", mycodec_encode, mycodec_decode) -``` - -In this first example, we can see that: - -- The `decode`/`encode` functions have a signature holding a keyword-argument "`errors`" for error handling. This comes from the syntax for making a codec for the `codecs` native library. This argument can have multiple values, namely "`strict`" for raising an exception when an de/encoding error occurs, while "`replace`" allows to replace the character at the position of the error with a generic character and also "`ignore`" that simply ignores the error and continues without adding anything to the resulting string. -- These functions always return a pair with the resulting string and the length of consumed input text. - -Another example for a more complex and dynamic codec: - -```python -import codext - -def mydyncodec_encode(i): - def encode(text, error="strict"): - # do somthing depending on i - return result, len(text) - return encode - -codext.add("mydyncodec", mydyncodec_encode, pattern=r"mydyn-(\d+)$") -``` - -In this second example, we can see that: - -- Only the encoding function is defined. -- A pattern is defined to match the prefix "`mydyn-`" and then an integer which is captured and used with `mydyncodec_encode(i)`. - -!!! warning "Pattern capture group" - - A capture group means that the parameter will be used with a dynamic (decorated) encoding function. In order to avoid this, i.e. for matching multiple names leading to the same encoding while calling a static encoding function, we can simply define a non-capturing group, e.g. "`(?:my|special_)codec`". - ------ - -### Add a custom map encoding - -New codecs using encoding maps can be added easily using the new function `add_map`. - -```python ->>> import codext ->>> help(codext.add) -Help on function add_map in module codext.__common__: - -add_map(ename, encmap, repl_char='?', sep='', ignore_case=None, no_error=False, intype=None, outype=None, **kwargs) - This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs module - dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with a pattern - and with file handling (if text is True). 
- - :param ename: encoding name - :param encmap: characters encoding map ; can be a dictionary of encoding maps (for use with the first capture - group of the regex pattern) or a function building the encoding map - :param repl_char: replacement char (used when errors handling is set to "replace") - :param sep: string of possible character separators (hence, only single-char separators are considered) ; - - while encoding, the first separator is used - - while decoding, separators can be mixed in the input text - :param ignore_case: ignore text case while encoding and/or decoding - :param no_error: this encoding triggers no error (hence, always in "leave" errors handling) - :param intype: specify the input type for pre-transforming the input text - :param outype: specify the output type for post-transforming the output text - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) but will make it impossible - to remove the codec later - -``` - -This relies on the [`add`](#add-a-custom-encoding) function and simplifies creating new encodings when they can be described as a mapping dictionary. - -Here is a simple example of how to add a map codec: - -```python -import codext - -ENCMAP = {'a': "A", 'b': "B", 'c': "C"} - -codext.add_map("mycodec", ENCMAP) -``` - -In this first example, we can see that: - -- The `decode`/`encode` functions do not have to be declared anymore. -- `ENCMAP` is the mapping between characters, it is also used to compute the decoding function. - -Another example for a more complex and dynamic codec: - -```python -import codext - -ENCMAP = [ - {'00': "A", '01': "B", '10': "C", '11': "D"}, - {'00': "D", '01': "C", '10': "B", '11': "A"}, -] - -codext.add("mydyncodec", ENCMAP, "#", ignore_case=True, intype="bin", pattern=r"mydyn-(\d+)$") -``` - -In this second example, we can see that: - -- `ENCMAP` is now a list of mappings. The capture group in the pattern is used to select the right encoding map. Consequently, using encoding "`mydyn-8`" will fail with a `LookupError` as the only possibility are "`mydyn-1`" and "`mydyn-2`". Note that the index begins at 1 in the encoding name. -- Instead of using the default character "`?`" for replacements, we use "`#`". -- The case is ignored ; decoding either "`abcd`" or "`ABCD`" will succeed. -- The binary mode is enabled, meaning that the input text is converted to a binary string for encoding, while it is converted from binary to text when decoding. - -!!! warning "Input/Output types" - - By default, when `intype` is defined, `outype` takes the same value. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`). - ------ - -### Add a macro - -**Macros** are chains of encodings. It is possible to define own macros with this feature. It works by giving the precedence to user's macros saved in `~/.codext-macros.json` then using embedded macros from the `codext` package. - -Here is an example of adding a macro (and verifying it was indeed added): - -```python ->>> codext.list_macros() -['example-macro'] ->>> codext.add_macro("test-macro", "gzip", "base64") ->>> codext.list_macros() -['example-macro', 'test-macro'] -``` - -!!! 
note "Removing a macro" - - As macros are resolved like codecs (with the precedence for codecs), they can be removed the same way as a codec. - - :::python - >>> codext.remove("test-macro") - - If this is a built-in macro, it will removed from the runtime list within the `codext` package. Next time this will be loaded, it will reset the builtin list of macros. Otherwise, if this is a custom macro, it will removed from the list of custom macros AND removed from `~/.codext-macros.json`. - ------ - -### List codecs - -Codecs can be listed with the `list` function, either the whole codecs or only some categories. - -```python ->>> codext.list() -['affine', 'ascii', 'ascii85', 'atbash', 'bacon', ..., 'base36', 'base58', 'base62', 'base64', 'base64_codec', ..., 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'big5', 'big5hkscs', 'braille', 'bz2_codec', 'capitalize', 'cp037', ...] -``` - -!!! note "Codecs categories" - - - `native`: the built-in codecs from the original `codecs` package - - `non-native`: this special category regroups all the categories mentioned hereafter - - `base`: baseX codecs (e.g. `base`, `base100`) - - `binary`: codecs working on strings but applying their algorithms on their binary forms (e.g. `baudot`, `manchester`) - - `common`: common codecs not included in the native ones or simly added for the purpose of standardization (e.g. `octal`, `ordinal`) - - `crypto`: codecs related to cryptography algorithms (e.g. `barbie`, `rot`, `xor`) - - `language`: language-related codecs (e.g. `morse`, `navajo`) - - `other`: uncategorized codecs (e.g. `letters`, `url`) - - `stegano`: steganography-related codecs (e.g. `sms`, `resistor`) - - Except the `native` and `non-native` categories, the other ones are simply the name of the subdirectories (with "`s`" right-stripped) of the `codext` package. - -```python ->>> codext.list("binary") -['baudot', 'baudot-spaced', 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'excess3', 'gray', 'manchester', 'manchester-inverted'] ->>> codext.list("language") -['braille', 'leet', 'morse', 'navajo', 'radio', 'southpark', 'southpark-icase', 'tom-tom'] ->>> codext.list("native") -['ascii', 'base64_codec', 'big5', 'big5hkscs', 'bz2_codec', 'cp037', 'cp273', 'cp424', 'cp437', 'cp500', 'cp775', 'cp850', 'cp852', 'cp855', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', ...] -``` - -!!! warning "Codecs listed, not encodings" - - Beware that this function only lists the codecs, not the encodings. This means that, for instance, it only lists `base` (codecs' name) instead of `base17`, `base61`, `base97`, ... (the valid encoding names related to the `base` codec). - ------ - -### Search for encodings - -Natively, `codecs` provides a `lookup` function that allows to get the `CodecInfo` object for the desired encoding. This performs a lookup in the registry based on an exact match. Sometimes, it can be useful to search for available encodings based on a regular expression. Therefore, a `search` function is added by `codext` to allow to get a list of encoding names matching the input regex. - -```python ->>> codext.search("baudot") -['baudot', 'baudot_spaced', 'baudot_tape'] ->>> codext.search("al") -['capitalize', 'octal', 'octal_spaced', 'ordinal', 'ordinal_spaced', 'radio'] ->>> codext.search("white") -['whitespace', 'whitespace_after_before'] -``` - -Also, `codext` provides an `examples` function to get some examples of valid encoding names. This is especially useful when it concerns dynamicly named encodings (e.g. 
`rot`, `shift` or `dna`).
-
-```python
->>> codext.examples("rot")
-['rot-14', 'rot-24', 'rot-7', 'rot18', 'rot3', 'rot4', 'rot6', 'rot_1', 'rot_12', 'rot_2']
->>> codext.examples("dna")
-['dna-1', 'dna-2', 'dna-5', 'dna1', 'dna4', 'dna5', 'dna6', 'dna8', 'dna_3', 'dna_5']
->>> codext.examples("barbie", 5)
-['barbie-1', 'barbie1', 'barbie4', 'barbie_2', 'barbie_4']
-```
-
------
-
-### Remove a custom encoding or macro
-
-New codecs can be removed easily using the new function `remove`, which will only remove every codec matching the given encoding name in the proxy codecs registry and NOT in the native one.
-
-```python
->>> codext.encode("test", "bin")
-'01110100011001010111001101110100'
->>> codext.remove("bin")
->>> codext.encode("test", "bin")
-
-Traceback (most recent call last):
-  [...]
-LookupError: unknown encoding: bin
-```
-
-Trying to remove a codec that is in the native registry won't raise a `LookupError`.
-
-```python
->>> codext.remove("utf-8")
->>> codext.encode("test", "utf-8")
-b'test'
-```
-
-Removing a macro works exactly the same way as for a codec.
-
-```python
->>> codext.remove("test-macro")
-```
-
------
-
-### Remove or restore `codext` encodings and macros
-
-It can be useful while playing with encodings and/or macros e.g. from Idle to be able to remove or restore `codext`'s encodings and macros. This can be achieved using respectively the new `clear` and `reset` functions.
-
-```python
->>> codext.clear()
->>> codext.encode("test", "bin")
-
-Traceback (most recent call last):
-  [...]
-LookupError: unknown encoding: bin
-```
-
-```python
->>> codext.reset()
->>> codext.encode("test", "bin")
-'01110100011001010111001101110100'
-```
-
------
-
-### Multi-rounds encoding
-
-It is possible to use multiple times the same encoding through the following convention: `encoding[X]`
-
-A simple example for a 1-round and a 2-rounds morse-encoded string:
-
-```python
->>> codext.encode("This is a test", "morse")
-'- .... .. ... / .. ... / .- / - . ... -'
->>> codext.encode("This is a test", "morse[2]")
-'-....- / .-.-.- .-.-.- .-.-.- .-.-.- / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- -....- / -..-. / -....- / .-.-.- / .-.-.- .-.-.- .-.-.- / -....-'
-```
-
-Another example using 5-rounds base58:
-
-```python
->>> codext.encode("Sup3rS3cr3t", "base58[5]")
-'3YrjaeeJE1qfUVkpUbMymEMLJenvRrtcZ4vaDQ3httdiqWV8wGYFpqw'
-```
-
------
-
-### Hooked `codecs` functions
-
-In order to select the right de/encoding function and avoid any conflict, the native `codecs` library registers search functions (using the `register(search_function)` function), called in order of registration while searching for a codec.
-
-While being imported, `codext` hooks the following base functions of `codecs` dealing with the codecs registry: `encode`, `decode`, `lookup` and `register`. This way, `codext` holds a private registry that is called before reaching out to the native one, causing the codecs defined in `codext` to override native codecs with a matching registry search function.
-
+Basically, the `codecs` library provides a series of functions from the built-in `_codecs` library which maintains a registry of search functions (a simple list) that maps encodings to the right de/encode functions by returning a `CodecInfo` object once first matched. 
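+
+As a quick illustration of this mechanism, using only the standard `codecs` API, the registry can be queried directly with `codecs.lookup`, which walks the registered search functions in order and returns the matching `CodecInfo` whose stateless functions de/encode and report the number of consumed characters:
+
+```python
+>>> import codecs
+>>> ci = codecs.lookup("utf-8")  # queries the registered search functions in order
+>>> ci.name
+'utf-8'
+>>> ci.encode("test")  # returns the encoded result and the consumed input length
+(b'test', 4)
+```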
+
+`codext` hooks `codecs`'s functions to insert its own proxy registry between the function calls and the native registry so that new encodings can be added or replace existing ones while using `code[cs|xt].open`. Indeed, as the proxy registry is called first, the first possible match occurs in a custom codec and, if there is none, the native registry is used.
+
+!!! note "The `open` built-in function"
+
+    Two behaviors are to be considered when using `codext`:
+
+    1. Encodings added from `codext` are only added to the proxy codecs registry of `codext` and are NOT available using `open(...)` (but are when using `code[cs|xt].open(...)`).
+    2. Encodings added from `codecs` are added to the proxy registry AND ALSO to the native registry and are therefore available using `open(...)`.
+
+    This difference keeps encodings added from `codext` removable while those added from `codecs` are not. This is a consequence of the fact that there is no unregister function in the native `_codecs` library.
+
+!!! warning "Lossy conversion"
+
+    Some encodings are lossy, meaning that it is not always possible to decode back to the exact start string. This should be considered especially when chaining codecs.
+
+-----
+
+### Add a custom encoding
+
+New codecs can be added easily using the new function `add`.
+
+```python
+>>> import codext
+>>> help(codext.add)
+Help on function add in module codext.__common__:
+
+add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False)
+    This adds a new codec to the codecs module setting its encode and/or decode
+    functions, eventually dynamically naming the encoding with a pattern and
+    with file handling (if text is True).
+
+    :param ename:         encoding name
+    :param encode:        encoding function or None
+    :param decode:        decoding function or None
+    :param pattern:       pattern for dynamically naming the encoding
+    :param text:          specify whether the codec is a text encoding
+    :param add_to_codecs: also add the search function to the native registry
+                          NB: this will make the codec available in the
+                              built-in open(...) but will make it impossible
+                              to remove the codec later
+
+```
+
+Here is a simple example of how to add a basic codec:
+
+```python
+import codext
+
+def mycodec_encode(text, errors="strict"):
+    # do some encoding stuff
+    return encoded, len(text)
+
+def mycodec_decode(text, errors="strict"):
+    # do some decoding stuff
+    return decoded, len(text)
+
+codext.add("mycodec", mycodec_encode, mycodec_decode)
+```
+
+In this first example, we can see that:
+
+- The `decode`/`encode` functions have a signature holding a keyword-argument "`errors`" for error handling. This comes from the syntax for making a codec for the `codecs` native library. This argument can have multiple values, namely "`strict`" for raising an exception when a de/encoding error occurs, while "`replace`" substitutes the character at the position of the error with a generic character and "`ignore`" simply ignores the error and continues without adding anything to the resulting string.
+- These functions always return a pair with the resulting string and the length of consumed input text. 
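+
+To make this skeleton concrete, here is a minimal working sketch following the `add` signature above ; the codec name `reverse-demo` and its functions are made up for illustration and are not part of `codext`:
+
+```python
+import codext
+
+def reverse_encode(text, errors="strict"):
+    # reversing a string cannot fail, so the errors policy is never applied
+    return text[::-1], len(text)
+
+def reverse_decode(text, errors="strict"):
+    # decoding is the same operation, as reversing is its own inverse
+    return text[::-1], len(text)
+
+codext.add("reverse-demo", reverse_encode, reverse_decode)
+# codext.encode("this is a test", "reverse-demo") should then yield 'tset a si siht'
+```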
+
+Another example for a more complex and dynamic codec:
+
+```python
+import codext
+
+def mydyncodec_encode(i):
+    def encode(text, errors="strict"):
+        # do something depending on i
+        return result, len(text)
+    return encode
+
+codext.add("mydyncodec", mydyncodec_encode, pattern=r"mydyn-(\d+)$")
+```
+
+In this second example, we can see that:
+
+- Only the encoding function is defined.
+- A pattern is defined to match the prefix "`mydyn-`" and then an integer which is captured and used with `mydyncodec_encode(i)`.
+
+!!! warning "Pattern capture group"
+
+    A capture group means that the parameter will be used with a dynamic (decorated) encoding function. In order to avoid this, i.e. for matching multiple names leading to the same encoding while calling a static encoding function, we can simply define a non-capturing group, e.g. "`(?:my|special_)codec`".
+
+-----
+
+### Add a custom map encoding
+
+New codecs using encoding maps can be added easily using the new function `add_map`.
+
+```python
+>>> import codext
+>>> help(codext.add_map)
+Help on function add_map in module codext.__common__:
+
+add_map(ename, encmap, repl_char='?', sep='', ignore_case=None, no_error=False, intype=None, outype=None, **kwargs)
+    This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs module
+    dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with a pattern
+    and with file handling (if text is True).
+
+    :param ename:         encoding name
+    :param encmap:        characters encoding map ; can be a dictionary of encoding maps (for use with the first capture
+                          group of the regex pattern) or a function building the encoding map
+    :param repl_char:     replacement char (used when errors handling is set to "replace")
+    :param sep:           string of possible character separators (hence, only single-char separators are considered) ;
+                          - while encoding, the first separator is used
+                          - while decoding, separators can be mixed in the input text
+    :param ignore_case:   ignore text case while encoding and/or decoding
+    :param no_error:      this encoding triggers no error (hence, always in "leave" errors handling)
+    :param intype:        specify the input type for pre-transforming the input text
+    :param outype:        specify the output type for post-transforming the output text
+    :param pattern:       pattern for dynamically naming the encoding
+    :param text:          specify whether the codec is a text encoding
+    :param add_to_codecs: also add the search function to the native registry
+                          NB: this will make the codec available in the built-in open(...) but will make it impossible
+                              to remove the codec later
+
+```
+
+This relies on the [`add`](#add-a-custom-encoding) function and simplifies creating new encodings when they can be described as a mapping dictionary.
+
+Here is a simple example of how to add a map codec:
+
+```python
+import codext
+
+ENCMAP = {'a': "A", 'b': "B", 'c': "C"}
+
+codext.add_map("mycodec", ENCMAP)
+```
+
+In this first example, we can see that:
+
+- The `decode`/`encode` functions do not have to be declared anymore.
+- `ENCMAP` is the mapping between characters ; it is also used to compute the decoding function. 
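+
+Under these assumptions, using this hypothetical `mycodec` would then look like the following session, with decoding derived automatically from the reversed map:
+
+```python
+>>> codext.encode("abc", "mycodec")
+'ABC'
+>>> codext.decode("ABC", "mycodec")
+'abc'
+```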
+
+Another example for a more complex and dynamic codec:
+
+```python
+import codext
+
+ENCMAP = [
+    {'00': "A", '01': "B", '10': "C", '11': "D"},
+    {'00': "D", '01': "C", '10': "B", '11': "A"},
+]
+
+codext.add_map("mydyncodec", ENCMAP, "#", ignore_case=True, intype="bin", pattern=r"mydyn-(\d+)$")
+```
+
+In this second example, we can see that:
+
+- `ENCMAP` is now a list of mappings. The capture group in the pattern is used to select the right encoding map. Consequently, using encoding "`mydyn-8`" will fail with a `LookupError` as the only possibilities are "`mydyn-1`" and "`mydyn-2`". Note that the index begins at 1 in the encoding name.
+- Instead of using the default character "`?`" for replacements, we use "`#`".
+- The case is ignored ; decoding either "`abcd`" or "`ABCD`" will succeed.
+- The binary mode is enabled, meaning that the input text is converted to a binary string for encoding, while it is converted from binary to text when decoding.
+
+!!! warning "Input/Output types"
+
+    By default, when `intype` is defined, `outype` takes the same value. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`).
+
+-----
+
+### Add a macro
+
+**Macros** are chains of encodings. It is possible to define one's own macros with this feature. It works by giving precedence to the user's macros saved in `~/.codext-macros.json`, then using the embedded macros from the `codext` package.
+
+Here is an example of adding a macro (and verifying it was indeed added):
+
+```python
+>>> codext.list_macros()
+['example-macro']
+>>> codext.add_macro("test-macro", "gzip", "base64")
+>>> codext.list_macros()
+['example-macro', 'test-macro']
+```
+
+!!! note "Removing a macro"
+
+    As macros are resolved like codecs (with the precedence for codecs), they can be removed the same way as a codec.
+
+    :::python
+    >>> codext.remove("test-macro")
+
+    If this is a built-in macro, it will be removed from the runtime list within the `codext` package ; the next time `codext` is loaded, the built-in list of macros will be reset. Otherwise, if this is a custom macro, it will be removed from the list of custom macros AND removed from `~/.codext-macros.json`.
+
+-----
+
+### List codecs
+
+Codecs can be listed with the `list` function, either all of them or only some categories.
+
+```python
+>>> codext.list()
+['affine', 'ascii', 'ascii85', 'atbash', 'bacon', ..., 'base36', 'base58', 'base62', 'base64', 'base64_codec', ..., 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'big5', 'big5hkscs', 'braille', 'bz2_codec', 'capitalize', 'cp037', ...]
+```
+
+!!! note "Codecs categories"
+
+    - `native`: the built-in codecs from the original `codecs` package
+    - `non-native`: this special category regroups all the categories mentioned hereafter
+    - `base`: baseX codecs (e.g. `base`, `base100`)
+    - `binary`: codecs working on strings but applying their algorithms on their binary forms (e.g. `baudot`, `manchester`)
+    - `common`: common codecs not included in the native ones or simply added for the purpose of standardization (e.g. `octal`, `ordinal`)
+    - `crypto`: codecs related to cryptography algorithms (e.g. `barbie`, `rot`, `xor`)
+    - `language`: language-related codecs (e.g. `morse`, `navajo`)
+    - `other`: uncategorized codecs (e.g. `letters`, `url`)
+    - `stegano`: steganography-related codecs (e.g. 
+
+    Except the `native` and `non-native` categories, the other ones are simply the names of the subdirectories (with "`s`" right-stripped) of the `codext` package.
+
+```python
+>>> codext.list("binary")
+['baudot', 'baudot-spaced', 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'excess3', 'gray', 'manchester', 'manchester-inverted']
+>>> codext.list("language")
+['braille', 'leet', 'morse', 'navajo', 'radio', 'southpark', 'southpark-icase', 'tom-tom']
+>>> codext.list("native")
+['ascii', 'base64_codec', 'big5', 'big5hkscs', 'bz2_codec', 'cp037', 'cp273', 'cp424', 'cp437', 'cp500', 'cp775', 'cp850', 'cp852', 'cp855', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', ...]
+```
+
+!!! warning "Codecs listed, not encodings"
+
+    Beware that this function only lists the codecs, not the encodings. This means that, for instance, it only lists `base` (the codec's name) instead of `base17`, `base61`, `base97`, ... (the valid encoding names related to the `base` codec).
+
+-----
+
+### Search for encodings
+
+Natively, `codecs` provides a `lookup` function that allows getting the `CodecInfo` object for the desired encoding. This performs a lookup in the registry based on an exact match. Sometimes, it can be useful to search for available encodings based on a regular expression. Therefore, `codext` adds a `search` function that returns the list of encoding names matching the input regex.
+
+```python
+>>> codext.search("baudot")
+['baudot', 'baudot_spaced', 'baudot_tape']
+>>> codext.search("al")
+['capitalize', 'octal', 'octal_spaced', 'ordinal', 'ordinal_spaced', 'radio']
+>>> codext.search("white")
+['whitespace', 'whitespace_after_before']
+```
+
+Also, `codext` provides an `examples` function to get some examples of valid encoding names. This is especially useful for dynamically named encodings (e.g. `rot`, `shift` or `dna`).
+
+```python
+>>> codext.examples("rot")
+['rot-14', 'rot-24', 'rot-7', 'rot18', 'rot3', 'rot4', 'rot6', 'rot_1', 'rot_12', 'rot_2']
+>>> codext.examples("dna")
+['dna-1', 'dna-2', 'dna-5', 'dna1', 'dna4', 'dna5', 'dna6', 'dna8', 'dna_3', 'dna_5']
+>>> codext.examples("barbie", 5)
+['barbie-1', 'barbie1', 'barbie4', 'barbie_2', 'barbie_4']
+```
+
+-----
+
+### Remove a custom encoding or macro
+
+New codecs can be removed easily using the new function `remove`, which removes every codec matching the given encoding name from the proxy codecs registry only, NOT from the native one.
+
+```python
+>>> codext.encode("test", "bin")
+'01110100011001010111001101110100'
+>>> codext.remove("bin")
+>>> codext.encode("test", "bin")
+
+Traceback (most recent call last):
+  [...]
+LookupError: unknown encoding: bin
+```
+
+Trying to remove a codec that is in the native registry won't raise a `LookupError`.
+
+```python
+>>> codext.remove("utf-8")
+>>> codext.encode("test", "utf-8")
+b'test'
+```
+
+Removing a macro works exactly the same way as for a codec.
+
+```python
+>>> codext.remove("test-macro")
+```
+
+-----
+
+### Remove or restore `codext` encodings and macros
+
+While playing with encodings and/or macros, e.g. from IDLE, it can be useful to be able to remove or restore `codext`'s encodings and macros. This can be achieved using respectively the new `clear` and `reset` functions.
+
+```python
+>>> codext.clear()
+>>> codext.encode("test", "bin")
+
+Traceback (most recent call last):
+  [...]
+LookupError: unknown encoding: bin
+```
+
+```python
+>>> codext.reset()
+>>> codext.encode("test", "bin")
+'01110100011001010111001101110100'
+```
+
+-----
+
+### Multi-rounds encoding
+
+It is possible to apply the same encoding multiple times through the following convention: `encoding[X]`
+
+A simple example for a 1-round and a 2-rounds morse-encoded string:
+
+```python
+>>> codext.encode("This is a test", "morse")
+'- .... .. ... / .. ... / .- / - . ... -'
+>>> codext.encode("This is a test", "morse[2]")
+'-....- / .-.-.- .-.-.- .-.-.- .-.-.- / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- -....- / -..-. / -....- / .-.-.- / .-.-.- .-.-.- .-.-.- / -....-'
+```
+
+Another example using 5-rounds base58:
+
+```python
+>>> codext.encode("Sup3rS3cr3t", "base58[5]")
+'3YrjaeeJE1qfUVkpUbMymEMLJenvRrtcZ4vaDQ3httdiqWV8wGYFpqw'
+```
+
+-----
+
+### Hooked `codecs` functions
+
+In order to select the right de/encoding function and avoid any conflict, the native `codecs` library registers search functions (using the `register(search_function)` function), called in order of registration while searching for a codec.
+
+When imported, `codext` hooks the following base functions of `codecs` dealing with the codecs registry: `encode`, `decode`, `lookup` and `register`. This way, `codext` holds a private registry that is queried before reaching out to the native one, causing the codecs defined in `codext` to override native codecs when their registry search functions match the same encoding name.
+
diff --git a/docs/pages/guessing.md b/docs/pages/guessing.md
index 9bac11c..5745918 100644
--- a/docs/pages/guessing.md
+++ b/docs/pages/guessing.md
@@ -1,172 +1,170 @@
-## Guess Mode
-
-For decoding multiple layers of codecs, `codext` features a guess mode relying on an Artificial Intelligence algorithm, the Breadth-First tree Search (BFS). For many cases, the default parameters are sufficient for guess-decoding things. But it may require parameters tuning.
-
------
-
-### Parameters
-
-BFS stops when a given condition, in the form of a function applied to the decoded string at the current depth, is met. It returns two results: the decoded string and a tuple with the related encoding names in order of application.
-
-The following parameters are tunable:
-
-- `stop_func`: can be a function or a regular expression to be matched (automatically converted to a function that uses the `re` module) ; by default, checks if all input characters are printable.
-- `min_depth`: the minimum depth for the tree search (allows to avoid a bit of overhead while checking the current decoded output at a depth with the stop function when we are sure it should not be the right result) ; by default 0.
-- `max_depth`: the maximum depth for the tree search ; by default 5.
-- `codec_categories`: a string indicating a codec [category](#list-codecs) or a list of [category](#list-codecs) strings ; by default, `None`, meaning the whole [categories](#list-codecs) (very slow).
-- `found`: a list or tuple of currently found encodings that can be used to save time if the first decoding steps are known ; by default, an empty tuple.
- -A simple example for a 1-stage base64-encoded string: - -```python ->>> codext.guess("VGhpcyBpcyBhIHRlc3Q=") -{('base64',): 'This is a test'} -``` - -An example of a 2-stages base64- then base62-encoded string: - -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7") -{('base62',): 'VGhpcyBpcyBhIHRlc3Q='} -``` - -In the second example, we can see that the given encoded string is not decoded as expected. This is the case because the (default) stop condition is too broad and stops if all the characters of the output are printable. If we have a prior knowledge on what we should expect, we can input a simple string or a regex: - -!!! note "Default stop function" - - :::python - >>> codext.stopfunc.default.__name__ - '...' - - The output depends on whether you have a language detection backend library installed ; see section [*Natural Language Detection*](#natural-language-detection). If no such library is installed, the default function is "`text`". - -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test") -{('base62', 'base64'): 'This is a test'} -``` - -In this example, the string "*test*" is converted to a function that uses this string as regular expression. Instead of a string, we can also pass a function. For this purpose, standard [stop functions](#available-stop-functions) are predefined. So, we can for instance use `stopfunc.lang_en` to stop when we find something that is English. Note that working this way gives lots of false positives if the text is very short like in the example case. That's why the `codec_categories` argument is used to only consider baseX codecs. This is also demonstrated in the next examples. - -```python ->>> codext.stopfunc._reload_lang("langdetect") ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", codext.stopfunc.lang_en, codec_categories="base") -('This is a test', ('base62', 'base64')) -``` - -If we know the first encoding, we can set this in the `found` parameter to save time: - -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", found=["base62"]) -('This is a test', ('base62', 'base64')) -``` - -If we are sure that only `base` (which is a valid [category](#list-codecs)) encodings are used, we can restrict the tree search using the `codec_categories` parameter to save time: - -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", codec_categories="base") -('This is a test', ('base62', 'base64')) -``` - -Another example of 2-stages encoded string: - -```python ->>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test") -('this is a test', ('base64', 'morse')) ->>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test", codec_categories=["base", "language"]) -('this is a test', ('base64', 'morse')) -``` - -When multiple results are expected, `stop` and `show` arguments can be used respectively to avoid stopping while finding a result and to display the intermediate result. - -!!! warning "Computation time" - - Note that, in the very last examples, the first call takes much longer than the second one but requires no knowledge about the possible [categories](#list-codecs) of encodings. - ------ - -### Available Stop Functions - -A few stop functions are predefined in the `stopfunc` submodule. 
- -```python ->>> import codext ->>> dir(codext.stopfunc) -['LANG_BACKEND', 'LANG_BACKENDS', ..., '_reload_lang', 'default', 'flag', ..., 'printables', 'regex', 'text'] -``` - -Currently, the following stop functions are provided: - -- `flag`: searches for the pattern "`[Ff][Ll1][Aa4@][Gg9]`" (either UTF-8 or UTF-16) -- `lang_**`: checks if the given lang is detected (note that it first checks if all characters are text ; see `text` hereafter) -- `printables`: checks that every output character is in the set of printables -- `regex(pattern)`: takes one argument, the regular expression, for checking a string against the given pattern -- `text`: checks for printables and an entropy less than 4.6 (empirically determined) - -A stop function can be used as the second argument of the `guess` function or as a keyword-argument, as shown in the following examples: - -```python ->>> codext.guess("...", codext.stopfunc.text) -[...] ->>> codext.guess("...", [...], stop_func=codext.stopfunc.text) -[...] -``` - -When a string is given, it is automatically converted to a `regex` stop function. - -```python ->>> s = codext.encode("pattern testing", "leetspeak") ->>> s -'p4773rn 73571n9' ->>> stop_func = codext.stopfunc.regex("p[a4@][t7]{2}[e3]rn") ->>> stop_func(s) -True ->>> codext.guess(s, stop_func) -[...] -``` - -Additionally, a simple stop function is predefined for CTF players, matching various declinations of the word *flag*. Alternatively, a pattern can always be used when flags have a particular format. - -```python ->>> codext.stopfunc.flag("test string") -False ->>> codext.stopfunc.flag("test f1@9") -True ->>> codext.stopfunc.regex(r"^CTF\{.*?\}$")("CTF{098f6bcd4621d373cade4e832627b4f6}") -True -``` - -The particular type of stop function `lang_**` is explained in the [next section](#natural-language-detection). - ------ - -### Natural Language Detection - -As in many cases, we are trying to decode inputs to readable text, it is necessary to narrow the scope while searching for valid decoded outputs. As matching printables and even text (as defined here before as printables with an entropy of less than 4.6) is too broad for many cases, it may be very useful to apply natural language detection. In `codext`, this is done by relying on Natural Language Processing (NLP) backend libraries, loaded only if they were separately installed. - -Currently, the following backends are supported, in order of precedence (this order was empirically determined by testing): - -- [`langid`](https://github.com/saffsd/langid.py): *Standalone Language Identification (LangID) tool.* -- [`langdetect`](https://github.com/Mimino666/langdetect): *Port of Nakatani Shuyo's language-detection library (version from 03/03/2014) to Python.* -- [`pycld2`](https://github.com/aboSamoor/pycld2): *Python bindings for the Compact Langauge Detect 2 (CLD2).* -- [`cld3`](https://github.com/bsolomon1124/pycld3): *Python bindings to the Compact Language Detector v3 (CLD3).* -- [`textblob`](https://github.com/sloria/TextBlob): *Python (2 and 3) library for processing textual data.* - -The way NLP is used is to check that these libraries exist and to take the first one by default. This sets up the `stopfunc.default` for the guess mode. This behavior aims to keep language detection as optional and to avoid multiple specific requirements having the same purpose. 
- -While loaded, the default backend can be switched to another one by using the `_reload_lang` function: - -```python ->>> codext.stopfunc._reload_lang("pycld2") # this loads pycld2 and attaches lang_** functions to the stopfunc submodule ->>> codext.stopfunc._reload_lang() # this unloads any loaded backend -``` - -Each time a backend is loaded, it gets `lang_**` stop functions attached to the `stopfunc` submodule for each supported language. - ------ - -### Ranking Heuristic - -!!! warning "Work in progress" - - This part is still in progress and shall be improved with better features and/or using machine learning. - +For decoding multiple layers of codecs, `codext` features a guess mode relying on an Artificial Intelligence algorithm, the Breadth-First tree Search (BFS). For many cases, the default parameters are sufficient for guess-decoding things. But it may require parameters tuning. + +----- + +### Parameters + +BFS stops when a given condition, in the form of a function applied to the decoded string at the current depth, is met. It returns two results: the decoded string and a tuple with the related encoding names in order of application. + +The following parameters are tunable: + +- `stop_func`: can be a function or a regular expression to be matched (automatically converted to a function that uses the `re` module) ; by default, checks if all input characters are printable. +- `min_depth`: the minimum depth for the tree search (allows to avoid a bit of overhead while checking the current decoded output at a depth with the stop function when we are sure it should not be the right result) ; by default 0. +- `max_depth`: the maximum depth for the tree search ; by default 5. +- `codec_categories`: a string indicating a codec [category](#list-codecs) or a list of [category](#list-codecs) strings ; by default, `None`, meaning the whole [categories](#list-codecs) (very slow). +- `found`: a list or tuple of currently found encodings that can be used to save time if the first decoding steps are known ; by default, an empty tuple. + +A simple example for a 1-stage base64-encoded string: + +```python +>>> codext.guess("VGhpcyBpcyBhIHRlc3Q=") +{('base64',): 'This is a test'} +``` + +An example of a 2-stages base64- then base62-encoded string: + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7") +{('base62',): 'VGhpcyBpcyBhIHRlc3Q='} +``` + +In the second example, we can see that the given encoded string is not decoded as expected. This is the case because the (default) stop condition is too broad and stops if all the characters of the output are printable. If we have a prior knowledge on what we should expect, we can input a simple string or a regex: + +!!! note "Default stop function" + + :::python + >>> codext.stopfunc.default.__name__ + '...' + + The output depends on whether you have a language detection backend library installed ; see section [*Natural Language Detection*](#natural-language-detection). If no such library is installed, the default function is "`text`". + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test") +{('base62', 'base64'): 'This is a test'} +``` + +In this example, the string "*test*" is converted to a function that uses this string as regular expression. Instead of a string, we can also pass a function. For this purpose, standard [stop functions](#available-stop-functions) are predefined. So, we can for instance use `stopfunc.lang_en` to stop when we find something that is English. 
Note that working this way gives lots of false positives if the text is very short like in the example case. That's why the `codec_categories` argument is used to only consider baseX codecs. This is also demonstrated in the next examples. + +```python +>>> codext.stopfunc._reload_lang("langdetect") +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", codext.stopfunc.lang_en, codec_categories="base") +('This is a test', ('base62', 'base64')) +``` + +If we know the first encoding, we can set this in the `found` parameter to save time: + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", found=["base62"]) +('This is a test', ('base62', 'base64')) +``` + +If we are sure that only `base` (which is a valid [category](#list-codecs)) encodings are used, we can restrict the tree search using the `codec_categories` parameter to save time: + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", codec_categories="base") +('This is a test', ('base62', 'base64')) +``` + +Another example of 2-stages encoded string: + +```python +>>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test") +('this is a test', ('base64', 'morse')) +>>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test", codec_categories=["base", "language"]) +('this is a test', ('base64', 'morse')) +``` + +When multiple results are expected, `stop` and `show` arguments can be used respectively to avoid stopping while finding a result and to display the intermediate result. + +!!! warning "Computation time" + + Note that, in the very last examples, the first call takes much longer than the second one but requires no knowledge about the possible [categories](#list-codecs) of encodings. + +----- + +### Available Stop Functions + +A few stop functions are predefined in the `stopfunc` submodule. + +```python +>>> import codext +>>> dir(codext.stopfunc) +['LANG_BACKEND', 'LANG_BACKENDS', ..., '_reload_lang', 'default', 'flag', ..., 'printables', 'regex', 'text'] +``` + +Currently, the following stop functions are provided: + +- `flag`: searches for the pattern "`[Ff][Ll1][Aa4@][Gg9]`" (either UTF-8 or UTF-16) +- `lang_**`: checks if the given lang is detected (note that it first checks if all characters are text ; see `text` hereafter) +- `printables`: checks that every output character is in the set of printables +- `regex(pattern)`: takes one argument, the regular expression, for checking a string against the given pattern +- `text`: checks for printables and an entropy less than 4.6 (empirically determined) + +A stop function can be used as the second argument of the `guess` function or as a keyword-argument, as shown in the following examples: + +```python +>>> codext.guess("...", codext.stopfunc.text) +[...] +>>> codext.guess("...", [...], stop_func=codext.stopfunc.text) +[...] +``` + +When a string is given, it is automatically converted to a `regex` stop function. + +```python +>>> s = codext.encode("pattern testing", "leetspeak") +>>> s +'p4773rn 73571n9' +>>> stop_func = codext.stopfunc.regex("p[a4@][t7]{2}[e3]rn") +>>> stop_func(s) +True +>>> codext.guess(s, stop_func) +[...] +``` + +Additionally, a simple stop function is predefined for CTF players, matching various declinations of the word *flag*. Alternatively, a pattern can always be used when flags have a particular format. 
+
+```python
+>>> codext.stopfunc.flag("test string")
+False
+>>> codext.stopfunc.flag("test f1@9")
+True
+>>> codext.stopfunc.regex(r"^CTF\{.*?\}$")("CTF{098f6bcd4621d373cade4e832627b4f6}")
+True
+```
+
+The particular type of stop function `lang_**` is explained in the [next section](#natural-language-detection).
+
+-----
+
+### Natural Language Detection
+
+In many cases, we are trying to decode inputs to readable text, so it is necessary to narrow the scope while searching for valid decoded outputs. As matching printables, and even text (defined here before as printables with an entropy of less than 4.6), is too broad for many cases, it may be very useful to apply natural language detection. In `codext`, this is done by relying on Natural Language Processing (NLP) backend libraries, loaded only if they were separately installed.
+
+Currently, the following backends are supported, in order of precedence (this order was empirically determined by testing):
+
+- [`langid`](https://github.com/saffsd/langid.py): *Standalone Language Identification (LangID) tool.*
+- [`langdetect`](https://github.com/Mimino666/langdetect): *Port of Nakatani Shuyo's language-detection library (version from 03/03/2014) to Python.*
+- [`pycld2`](https://github.com/aboSamoor/pycld2): *Python bindings for the Compact Language Detect 2 (CLD2).*
+- [`cld3`](https://github.com/bsolomon1124/pycld3): *Python bindings to the Compact Language Detector v3 (CLD3).*
+- [`textblob`](https://github.com/sloria/TextBlob): *Python (2 and 3) library for processing textual data.*
+
+The way NLP is used is to check that these libraries exist and to take the first one available as the default. This sets up the `stopfunc.default` for the guess mode. This behavior aims to keep language detection optional and to avoid multiple specific requirements serving the same purpose.
+
+The default backend can be switched to another one at any time by using the `_reload_lang` function:
+
+```python
+>>> codext.stopfunc._reload_lang("pycld2")  # this loads pycld2 and attaches lang_** functions to the stopfunc submodule
+>>> codext.stopfunc._reload_lang()          # this unloads any loaded backend
+```
+
+Each time a backend is loaded, it gets `lang_**` stop functions attached to the `stopfunc` submodule for each supported language.
+
+-----
+
+### Ranking Heuristic
+
+!!! warning "Work in progress"
+
+    This part is still in progress and shall be improved with better features and/or using machine learning.
+
diff --git a/docs/pages/howto.md b/docs/pages/howto.md
index 6163ef6..9e59805 100644
--- a/docs/pages/howto.md
+++ b/docs/pages/howto.md
@@ -1,242 +1,240 @@
-## How To Create Your Codec
-
-The purpose of this section is to provide a tutorial for creating new codecs accordingly.
-
-As explained in [this section](./features.html), `codext` provides the possibility to add new codecs in two ways:
-
-1. [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56): using this function, the *encode* and *decode* functions must be given as arguments.
-2. [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160): using this function, an *encoding map* must be given but can be formatted in different ways to handle various use cases.
-
-In both cases, a *pattern* is given in argument and aims to define the set of all strings that aim to select this codec.
-
-!!! important "Codec precedence"
-
-    `codext` uses a local registry that is queried first before attempting native `codecs` lookups. 
This means that a native codec can be overridden with a *pattern* that matches the same strings. - -The remainder of this section explains how to successfully create a new codec and/or how to make so that it can be added to the library. - -!!! reminder "Contributions welcome !" - - Remember that you can always [submit a request for a new codec](https://github.com/dhondta/python-codext/issues/new) or submit your own with a PR for improving `codext` ! - ------ - -### Generic arguments - -Whatever solution is chosen, the following arguments shall be considered: - -- `ename` (first positional argument): Choose the shortest possible encoding name. If it clashes with another codec, always remember that `codext` resolves codecs in order of registry, that is from the first added. Also, it resolves codecs based on the given pattern. So, a codec with a clashing name could still be selected if the pattern does not match for the codec with the precedence but matches for this codec. -- `pattern` (keyword-argument): If not defined, it defaults to the encoding name. It can be a regular expression ; in this case, it should not be too broad. A codec decode or encode function can be parametrized through the pattern using the **first capture group**. It is important to note that the first capture group is used and not any other. This means that any other group definition shall use the do-not-capture specifier, that is "`(?:...)`". - -!!! danger "Too broad pattern" - - Let us consider the following ; we add a codec that handles every character in any number of occurrence. It will then capture anything in the given encoding name and will then always resolve to this codec, preventing any other codec added afterwards to resolve. - - >>> import codext - >>> identity = lambda text, errors="strict": (text, len(text)) - >>> codext.add("everything", identity, identity, pattern=r".*") - >>> codext.encode("test string", "test-encoding-name") # r".*" matches anything, thus including "test-encoding-name" - 'test string' - >>> codext.decode("test string", "test-encoding-name") - 'test string' - >>> codext.encode("test string", "morse") # "morse" has the precedence on codec "everything" we just added - '- . ... - / ... - .-. .. -. --.' - >>> test = lambda text, errors="strict": ("TEST", len(t)) - >>> codext.add("test", test) # no pattern given ; should then be matched by encoding name "test" - >>> codext.encode("test string", "test") # should give "TEST" if codec "test" was selected - 'test string' # gives the output of codec "test-encoding-name", - # which has precedence on "test" and a too broad pattern - ------ - -### Which `add` function ? - -At this point, it is necessary to determine what kind of codec you want. If it is a simple map of characters, you should definitely use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160). If it is more complex and cannot be handled using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160)'s options, then you should use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) and define the encode/decode functions yourself. 
- -A few examples: - -- `morse` is a simple map that does not handle case ; it then uses [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `ignore_case` set to "`encode`" (not "`both`" for encoding and decoding as it does not matter anyway for decoding) -- `whitespace` has 2 codecs defined ; the simple one is a simple bit encoding map, therefore using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype` set to "`bin`" (for pre-converting characters to bits before applying the encoding map), and the complex one uses [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) with its specific endocde/decode functions -- `atbash` defines a dynamic map with a "factory" function, that creates the encoding map according to the parameters supplied in the codec name - -So, before going further, determine the following: - -- What does the new codec map from and to ? E.g. if binary input and ordinal output, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype="bin"` and `outype="ord"`. -- Is this codec ignoring case ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and specify which operation(s) should ignore case, e.g. `ignore_case="both"` or `ignore_case="decode"`. -- Should this codec handle no error ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) do not forget to specify `no_error=True`. -- Does the codec yields variable-length encoded tokens ? If so, you can still use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) but you should define `sep` (separator) as `codext` will not be able to handle ambiguities. - -If you find aspects that are not covered in these questions, you shall use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56), then refering to [Case 1](#case-1-generic-encoding-definition). Otherwise, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and refer -to [Case 2](#case-2-encoding-map). - ------ - -### Case 1: Generic encoding definition - -This uses: [`codext.add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) - -The following shall be considered: - -- `encode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot encode. -- `decode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot decode. - -Both functions must take 2 arguments and return 2 values (in order to stick to `codec`'s encode/decode function format): - -- Inputs: `text`, `errors="strict"` ; respectively the text to encode/decode and the error handling mode. -- Outputs: encoded text and length of consumed input text. - -!!! note "Error handling mode" - - - `strict`: this is the default ; it means that any error shall raise an exception. - - `ignore`: any error is ignored, adding nothing to the output. - - `replace`: any error yields the given replacement character(s). - - `leave`: any error yields the erroneous input token in the output. - - This last mode is an addition to the native ones. It can be useful for some encodings that must cause no error while encoding and can therefore have their original characters in the output. 
- -Also, while defining the `encode` and/or `decode` functions, `codext.handle_error` can be used as a shortcut to handle the different modes. It returns a wrapped function that takes `token` and `position` as arguments (see [`excess3`](https://github.com/dhondta/python-codext/blob/master/codext/binary/excess3.py) for an example). - -```python ->>> help(codext.handle_error) -Help on function handle_error in module codext.__common__: - -handle_error(ename, errors, sep='', repl_char='?', repl_minlen=1, decode=False, item='position') - This shortcut function allows to handle error modes given some tuning parameters. - - :param ename: encoding name - :param errors: error handling mode - :param sep: token separator - :param repl_char: replacement character (for use when errors="replace") - :param repl_minlen: repeat number for the replacement character - :param decode: whether we are encoding or decoding - :param item: position item description (for describing the error ; e.g. "group" or "token") - ->>> err = codext.handle_error("test", "strict") ->>> help(err) -Help on function _handle_error in module codext.__common__: - -_handle_error(token, position) - This handles an encoding/decoding error according to the selected handling mode. - - :param token: input token to be encoded/decoded - :param position: token position index - -``` - ------ - -### Case 2: Encoding map - -This uses: [`codext.add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) - -The following options shall be considered: - -- `encmap` (second positional argument): This defines the encoding map and is the core of the codec ; 4 subcases are handled and explained hereafter. -- `repl_char` (keyword-argument ; default: "`?`"): The replacement character can be tuned, especially if the default one clashes with a character from the encoding. -- `sep` (keyword-argument ; default: ""): The separator between encoded tokens can be useful to tune, especially when the encoded tokens have a variable length. -- `ignore_case` (keyword-argument ; default: `None`): This defines where the case shall be ignored ; it can be one of the followings: "`encode`", "`decode`" or "`both`". -- `no_error` (keyword-argument ; default: `False`): This sets if errors should be handled as normal or if no error should be considered, simply leaving the input token as is in the output. -- `intype` (keyword-argument ; default: `None`): This specifies the type the input text should be converted to before applying the encoding map (pre-conversion before really encoding) ; this can be one of the followings: `str`, `bin` or `ord`. -- `outype` (keyword-argument ; default: `None`): This specifies the type the output text of the encoding map should be converted from (post-conversion after really encoding) ; this can be one of the followings: `str`, `bin` or `ord`. - -!!! warning "Input/Output types" - - By default, when `intype` is defined, `outype` takes the same value if left `None`. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be explicitely set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`). - -`encmap` can be defined as follows: - -1. **Simple map**: In this case, the encoding map is a dictionary mapping each input character to an output one (see [`radio`](https://github.com/dhondta/python-codext/blob/master/codext/languages/radio.py) for an example). -2. 
**List of maps**: In this case, encoding maps are put in a list and referenced by their order number starting from 1, meaning that the `pattern` shall define a capture group with values from 1 to the length of this list (see [`dna`](https://github.com/dhondta/python-codext/blob/master/codext/others/dna.py) for an example). -3. **Parametrized map**: This variant defines a dictionary of regex-selected encoding maps, that is, a dictionary of dictionaries with keys matching the captured groups from codec's pattern. -4. **Map factory function**: This one is implemented by a function that returns the composed encoding map. This function takes a single argument according to the capture group from the `pattern` (see [`affine`](https://github.com/dhondta/python-codext/blob/master/codext/crypto/affine.py) for an example). - -!!! note "Mapping one input character to multiple output characters" - - In some particular cases (e.g. the `navajo` codec), a single input character can be mapped to multiple output ones. It is possible to define them in a map by simply putting them into a list (e.g. a map with `{'A': ["B", "C", "D"]}`). In this case, while encoding, the output character is randomly chosen (e.g. "`A`" will map to "`D`", another time to "`B`", ...). - ------ - -### Self-generated tests - -In order to facilitate testing, a test suite can be automatically generated from a set of *examples*. This is defined in the `__examples__` dunder inside codec's source file (see [`sms`](https://github.com/dhondta/python-codext/blob/master/codext/stegano/sms.py) for an example). By default, the `add`/`add_map` function will get `__examples__` from the global scope but this behavior can be overridden by specifying the keyword-argument `examples` (e.g. `add(..., examples=__examples1__)` ; see [`ordinal`](https://github.com/dhondta/python-codext/blob/master/codext/common/ordinal.py) for an example). - -A set of examples is a dictionary specifying the test cases to be considered. The keys are the descriptions of the test cases and the values can be either dictionaries of input texts and their output encoded texts or lists of input texts. Each key has the format "`operation(encodings)`". Operations can be: - -- `enc`: This is for testing the encoding of the nested values (that is, a dictionary of input/outputs). -- `dec`: This is for testing the decoding of the nested values (that is, a dictionary of input/outputs). If this is not specified, the test suite automatically tries to decode from what is defined in `enc`. -- `enc-dec`: This is for testing the encoding AND decoding of the nested values (that is, a list of inputs) ; this one does not enforce what should be the output of the encoding but checks that encoding AND decoding leads to the same input text. This is particularly useful when encoding can yield randomly chosen tokens in the encoded output. - -The `encodings` are a `|`-separated list of encoding names, compliant or not with tested codec's pattern. Faulty names can also be tested as of the examples hereafter. - -Examples of `__examples__` test suites: - -```python -__my_examples__ = { - 'enc(BAD)': None -} -``` - -!!! note "Observations" - - - `__my__examples__` is not the standard dunder, therefore requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. - - `BAD` is assumed to be a bad encoding name, therefore having a dictionary value of `None`, meaning that the test should raise a `LookupError`. - -```python -__examples__ = { - 'enc(codec)': {'string': None} -} -``` - -!!! 
note "Observations" - - - `__examples__` is the standard dunder, therefore NOT requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. - - `codec` is assumed to be a valid encoding name, therefore having a dictionary as its value, but in this special case "`string`" is assumed not to be encoded, its corresponding value is then `None`, meaning that the test should raise a `ValueError`. - -```python -__examples__ = { - 'enc-dec(codec)': ["test string", "TEST STRING", "@random", "@random{1024}"] -} -``` - -!!! note "Observations" - - - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. - - `enc-dec` is used, meaning that a list of inputs is defined. - - So, whatever its encoded output, the input string shall give the same while applying encoding then decoding. - - The special values `@random` and `@random{1024}`, meaning that test strings are generated from any possible byte-character with a specified length (512 when not specified, otherwise specified with `{...}`). - -```python -__examples__ = { - 'enc(codec)': {"test string": "..."} -} -``` - -!!! note "Observations" - - - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. - - `enc` only is used, meaning that a dictionary of inputs/outputs is given and `dec` is automatically handled while requiring the exact encoded text but recovering the exact same input while decoding. - -```python -__examples__ = { - 'enc(codec)': {"Test String": "..."}, - 'dec(codec)': {"...": "test string"}, -} -``` - -!!! note "Observations" - - - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. - - `enc` and `dec` are used, meaning that dictionaries of inputs/outputs are given and the input texts are not necessarily the same (i.e. if text case is not handled by the codec). - ------ - -### Adding a new codec to `codext` - -As a checklist when making a codec for addition in `codext`, please follow these steps: - -1. Create your codec file (i.e. starting with a copy of an existing similar one) -2. Place it into the right category folder -3. Add it to the list in [`README.md`](https://github.com/dhondta/python-codext/blob/master/README.md#list-of-codecs) -4. Add its documentation in the [right Markdown file](https://github.com/dhondta/python-codext/tree/master/docs/enc) -5. If self-generated tests are not enough, add manual tests in [the related file](https://github.com/dhondta/python-codext/blob/master/tests/test_manual.py) - +The purpose of this section is to provide a tutorial for creating new codecs accordingly. + +As explained in [this section](./features.html), `codext` provides the possibility to add new codecs in two ways: + +1. [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56): using this function, the *encode* and *decode* functions must be given as arguments. +2. [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160): using this function, an *encoding map* must be given but can be formatted in different ways to handle various use cases. + +In both cases, a *pattern* is given in argument and aims to define the set of all strings that aim to select this codec. + +!!! important "Codec precedence" + + `codext` uses a local registry that is queried first before attempting native `codecs` lookups. This means that a native codec can be overridden with a *pattern* that matches the same strings. 
+
+The remainder of this section explains how to successfully create a new codec and/or how to make it so that it can be added to the library.
+
+!!! reminder "Contributions welcome !"
+
+    Remember that you can always [submit a request for a new codec](https://github.com/dhondta/python-codext/issues/new) or submit your own with a PR for improving `codext` !
+
+-----
+
+### Generic arguments
+
+Whatever solution is chosen, the following arguments shall be considered:
+
+- `ename` (first positional argument): Choose the shortest possible encoding name. If it clashes with another codec, always remember that `codext` resolves codecs in order of registration, that is, from the first added. Also, it resolves codecs based on the given pattern. So, a codec with a clashing name could still be selected if the pattern does not match for the codec with the precedence but matches for this codec.
+- `pattern` (keyword-argument): If not defined, it defaults to the encoding name. It can be a regular expression ; in this case, it should not be too broad. A codec decode or encode function can be parametrized through the pattern using the **first capture group**. It is important to note that the first capture group is used and not any other. This means that any other group definition shall use the do-not-capture specifier, that is "`(?:...)`".
+
+!!! danger "Too broad pattern"
+
+    Let us consider the following ; we add a codec that handles every character in any number of occurrences. It will then capture anything in the given encoding name and will then always resolve to this codec, preventing any other codec added afterwards from resolving.
+
+        >>> import codext
+        >>> identity = lambda text, errors="strict": (text, len(text))
+        >>> codext.add("everything", identity, identity, pattern=r".*")
+        >>> codext.encode("test string", "test-encoding-name")  # r".*" matches anything, thus including "test-encoding-name"
+        'test string'
+        >>> codext.decode("test string", "test-encoding-name")
+        'test string'
+        >>> codext.encode("test string", "morse")  # "morse" has the precedence on codec "everything" we just added
+        '- . ... - / ... - .-. .. -. --.'
+        >>> test = lambda text, errors="strict": ("TEST", len(text))
+        >>> codext.add("test", test)  # no pattern given ; should then be matched by encoding name "test"
+        >>> codext.encode("test string", "test")  # should give "TEST" if codec "test" was selected
+        'test string'  # gives the output of codec "everything",
+                       # which has precedence on "test" and a too broad pattern
+
+-----
+
+### Which `add` function ?
+
+At this point, it is necessary to determine what kind of codec you want. If it is a simple map of characters, you should definitely use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160). If it is more complex and cannot be handled using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160)'s options, then you should use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) and define the encode/decode functions yourself.
+
+A few examples:
+
+- `morse` is a simple map that does not handle case ; it then uses [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `ignore_case` set to "`encode`" (not "`both`" for encoding and decoding as it does not matter anyway for decoding)
+- `whitespace` has 2 codecs defined ; the simple one is a simple bit encoding map, therefore using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype` set to "`bin`" (for pre-converting characters to bits before applying the encoding map), and the complex one uses [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) with its specific encode/decode functions
+- `atbash` defines a dynamic map with a "factory" function that creates the encoding map according to the parameters supplied in the codec name
+
+So, before going further, determine the following:
+
+- What does the new codec map from and to ? E.g. if binary input and ordinal output, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype="bin"` and `outype="ord"`.
+- Is this codec ignoring case ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and specify which operation(s) should ignore case, e.g. `ignore_case="both"` or `ignore_case="decode"`.
+- Should this codec handle no error ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) ; do not forget to specify `no_error=True`.
+- Does the codec yield variable-length encoded tokens ? If so, you can still use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) but you should define `sep` (separator) as `codext` will not be able to handle ambiguities.
+
+If you find aspects that are not covered in these questions, you shall use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) and refer to [Case 1](#case-1-generic-encoding-definition). Otherwise, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and refer
+to [Case 2](#case-2-encoding-map).
+
+-----
+
+### Case 1: Generic encoding definition
+
+This uses: [`codext.add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56)
+
+The following shall be considered:
+
+- `encode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot encode.
+- `decode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot decode.
+
+Both functions must take 2 arguments and return 2 values (in order to stick to `codecs`' encode/decode function format):
+
+- Inputs: `text`, `errors="strict"` ; respectively the text to encode/decode and the error handling mode.
+- Outputs: encoded text and length of consumed input text.
+
+!!! note "Error handling mode"
+
+    - `strict`: this is the default ; it means that any error shall raise an exception.
+    - `ignore`: any error is ignored, adding nothing to the output.
+    - `replace`: any error yields the given replacement character(s).
+    - `leave`: any error yields the erroneous input token in the output.
+
+    This last mode is an addition to the native ones. It can be useful for some encodings that must cause no error while encoding and can therefore have their original characters in the output.
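+
+As a minimal sketch of such a pair of functions, here is a hypothetical `myswap` codec (not part of the library) that swaps the case of its input ; since swapping twice restores the original, the same function serves for encoding and decoding:
+
+```python
+import codext
+
+def myswap(text, errors="strict"):
+    # this transformation cannot fail, so the errors mode is never used here
+    return text.swapcase(), len(text)
+
+codext.add("myswap", myswap, myswap)
+```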
+
+Also, while defining the `encode` and/or `decode` functions, `codext.handle_error` can be used as a shortcut to handle the different modes. It returns a wrapped function that takes `token` and `position` as arguments (see [`excess3`](https://github.com/dhondta/python-codext/blob/master/codext/binary/excess3.py) for an example).
+
+```python
+>>> help(codext.handle_error)
+Help on function handle_error in module codext.__common__:
+
+handle_error(ename, errors, sep='', repl_char='?', repl_minlen=1, decode=False, item='position')
+    This shortcut function allows to handle error modes given some tuning parameters.
+
+    :param ename: encoding name
+    :param errors: error handling mode
+    :param sep: token separator
+    :param repl_char: replacement character (for use when errors="replace")
+    :param repl_minlen: repeat number for the replacement character
+    :param decode: whether we are encoding or decoding
+    :param item: position item description (for describing the error ; e.g. "group" or "token")
+
+>>> err = codext.handle_error("test", "strict")
+>>> help(err)
+Help on function _handle_error in module codext.__common__:
+
+_handle_error(token, position)
+    This handles an encoding/decoding error according to the selected handling mode.
+
+    :param token: input token to be encoded/decoded
+    :param position: token position index
+
+```
+
+-----
+
+### Case 2: Encoding map
+
+This uses: [`codext.add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160)
+
+The following options shall be considered:
+
+- `encmap` (second positional argument): This defines the encoding map and is the core of the codec ; 4 subcases are handled and explained hereafter.
+- `repl_char` (keyword-argument ; default: "`?`"): The replacement character can be tuned, especially if the default one clashes with a character from the encoding.
+- `sep` (keyword-argument ; default: ""): The separator between encoded tokens can be useful to tune, especially when the encoded tokens have a variable length.
+- `ignore_case` (keyword-argument ; default: `None`): This defines where the case shall be ignored ; it can be one of the following: "`encode`", "`decode`" or "`both`".
+- `no_error` (keyword-argument ; default: `False`): This sets whether errors should be handled as usual or not raised at all, simply leaving the input token as is in the output.
+- `intype` (keyword-argument ; default: `None`): This specifies the type the input text should be converted to before applying the encoding map (pre-conversion before really encoding) ; this can be one of the following: `str`, `bin` or `ord`.
+- `outype` (keyword-argument ; default: `None`): This specifies the type the output text of the encoding map should be converted from (post-conversion after really encoding) ; this can be one of the following: `str`, `bin` or `ord`.
+
+!!! warning "Input/Output types"
+
+    By default, when `intype` is defined, `outype` takes the same value if left `None`. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be explicitly set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`).
+
+`encmap` can be defined as follows:
+
+1. **Simple map**: In this case, the encoding map is a dictionary mapping each input character to an output one (see [`radio`](https://github.com/dhondta/python-codext/blob/master/codext/languages/radio.py) for an example).
+2. 
**List of maps**: In this case, encoding maps are put in a list and referenced by their order number starting from 1, meaning that the `pattern` shall define a capture group with values from 1 to the length of this list (see [`dna`](https://github.com/dhondta/python-codext/blob/master/codext/others/dna.py) for an example). +3. **Parametrized map**: This variant defines a dictionary of regex-selected encoding maps, that is, a dictionary of dictionaries with keys matching the captured groups from codec's pattern. +4. **Map factory function**: This one is implemented by a function that returns the composed encoding map. This function takes a single argument according to the capture group from the `pattern` (see [`affine`](https://github.com/dhondta/python-codext/blob/master/codext/crypto/affine.py) for an example). + +!!! note "Mapping one input character to multiple output characters" + + In some particular cases (e.g. the `navajo` codec), a single input character can be mapped to multiple output ones. It is possible to define them in a map by simply putting them into a list (e.g. a map with `{'A': ["B", "C", "D"]}`). In this case, while encoding, the output character is randomly chosen (e.g. "`A`" will map to "`D`", another time to "`B`", ...). + +----- + +### Self-generated tests + +In order to facilitate testing, a test suite can be automatically generated from a set of *examples*. This is defined in the `__examples__` dunder inside codec's source file (see [`sms`](https://github.com/dhondta/python-codext/blob/master/codext/stegano/sms.py) for an example). By default, the `add`/`add_map` function will get `__examples__` from the global scope but this behavior can be overridden by specifying the keyword-argument `examples` (e.g. `add(..., examples=__examples1__)` ; see [`ordinal`](https://github.com/dhondta/python-codext/blob/master/codext/common/ordinal.py) for an example). + +A set of examples is a dictionary specifying the test cases to be considered. The keys are the descriptions of the test cases and the values can be either dictionaries of input texts and their output encoded texts or lists of input texts. Each key has the format "`operation(encodings)`". Operations can be: + +- `enc`: This is for testing the encoding of the nested values (that is, a dictionary of input/outputs). +- `dec`: This is for testing the decoding of the nested values (that is, a dictionary of input/outputs). If this is not specified, the test suite automatically tries to decode from what is defined in `enc`. +- `enc-dec`: This is for testing the encoding AND decoding of the nested values (that is, a list of inputs) ; this one does not enforce what should be the output of the encoding but checks that encoding AND decoding leads to the same input text. This is particularly useful when encoding can yield randomly chosen tokens in the encoded output. + +The `encodings` are a `|`-separated list of encoding names, compliant or not with tested codec's pattern. Faulty names can also be tested as of the examples hereafter. + +Examples of `__examples__` test suites: + +```python +__my_examples__ = { + 'enc(BAD)': None +} +``` + +!!! note "Observations" + + - `__my__examples__` is not the standard dunder, therefore requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. + - `BAD` is assumed to be a bad encoding name, therefore having a dictionary value of `None`, meaning that the test should raise a `LookupError`. + +```python +__examples__ = { + 'enc(codec)': {'string': None} +} +``` + +!!! 
note "Observations" + + - `__examples__` is the standard dunder, therefore NOT requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. + - `codec` is assumed to be a valid encoding name, therefore having a dictionary as its value, but in this special case "`string`" is assumed not to be encoded, its corresponding value is then `None`, meaning that the test should raise a `ValueError`. + +```python +__examples__ = { + 'enc-dec(codec)': ["test string", "TEST STRING", "@random", "@random{1024}"] +} +``` + +!!! note "Observations" + + - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. + - `enc-dec` is used, meaning that a list of inputs is defined. + - So, whatever its encoded output, the input string shall give the same while applying encoding then decoding. + - The special values `@random` and `@random{1024}`, meaning that test strings are generated from any possible byte-character with a specified length (512 when not specified, otherwise specified with `{...}`). + +```python +__examples__ = { + 'enc(codec)': {"test string": "..."} +} +``` + +!!! note "Observations" + + - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. + - `enc` only is used, meaning that a dictionary of inputs/outputs is given and `dec` is automatically handled while requiring the exact encoded text but recovering the exact same input while decoding. + +```python +__examples__ = { + 'enc(codec)': {"Test String": "..."}, + 'dec(codec)': {"...": "test string"}, +} +``` + +!!! note "Observations" + + - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. + - `enc` and `dec` are used, meaning that dictionaries of inputs/outputs are given and the input texts are not necessarily the same (i.e. if text case is not handled by the codec). + +----- + +### Adding a new codec to `codext` + +As a checklist when making a codec for addition in `codext`, please follow these steps: + +1. Create your codec file (i.e. starting with a copy of an existing similar one) +2. Place it into the right category folder +3. Add it to the list in [`README.md`](https://github.com/dhondta/python-codext/blob/master/README.md#list-of-codecs) +4. Add its documentation in the [right Markdown file](https://github.com/dhondta/python-codext/tree/master/docs/enc) +5. If self-generated tests are not enough, add manual tests in [the related file](https://github.com/dhondta/python-codext/blob/master/tests/test_manual.py) + diff --git a/docs/pages/index.md b/docs/pages/index.md index 185dd25..2579b17 100644 --- a/docs/pages/index.md +++ b/docs/pages/index.md @@ -1,11 +1,9 @@ -## Introduction - -Codext, contraction of "*codecs*" and "*extension*", is a library that gathers many additional encodings for use with [`codecs`](https://docs.python.org/3/library/codecs.html). When imported, it registers new encodings to an extended codecs registry for making the encodings available from the `codecs.(decode|encode|open)` API. It also features [CLI tools](./cli.html) and a [guess mode](./features.html#guess-decode-an-arbitrary-input) for decoding multiple layers of codecs. - -### Setup - -This library is available on [PyPi](https://pypi.python.org/pypi/codext/) and can be simply installed using Pip: - -```sh -pip install codext -``` +Codext, contraction of "*codecs*" and "*extension*", is a library that gathers many additional encodings for use with [`codecs`](https://docs.python.org/3/library/codecs.html). 
diff --git a/docs/pages/index.md b/docs/pages/index.md
index 185dd25..2579b17 100644
--- a/docs/pages/index.md
+++ b/docs/pages/index.md
@@ -1,11 +1,9 @@
-## Introduction
-
-Codext, contraction of "*codecs*" and "*extension*", is a library that gathers many additional encodings for use with [`codecs`](https://docs.python.org/3/library/codecs.html). When imported, it registers new encodings to an extended codecs registry for making the encodings available from the `codecs.(decode|encode|open)` API. It also features [CLI tools](./cli.html) and a [guess mode](./features.html#guess-decode-an-arbitrary-input) for decoding multiple layers of codecs.
-
-### Setup
-
-This library is available on [PyPi](https://pypi.python.org/pypi/codext/) and can be simply installed using Pip:
-
-```sh
-pip install codext
-```
+Codext, contraction of "*codecs*" and "*extension*", is a library that gathers many additional encodings for use with [`codecs`](https://docs.python.org/3/library/codecs.html). When imported, it registers new encodings to an extended codecs registry, making them available through the `codecs.(decode|encode|open)` API. It also features [CLI tools](./cli.html) and a [guess mode](./features.html#guess-decode-an-arbitrary-input) for decoding multiple layers of codecs.
+
+### Setup
+
+This library is available on [PyPI](https://pypi.python.org/pypi/codext/) and can simply be installed using Pip:
+
+```sh
+pip install codext
+```
diff --git a/docs/pages/manipulations.md b/docs/pages/manipulations.md
index 8857ca7..340f89c 100644
--- a/docs/pages/manipulations.md
+++ b/docs/pages/manipulations.md
@@ -1,75 +1,74 @@
-## String tranformations
-
-`codext` also defines multiple dummy string manipulation/transformation codecs, essentially for use with the CLI tool and for the sake of simplicity.
-
------
-
-### Case-related operations
-
-These transformation functions are simple string transformations, including `str`'s methods.
-
-**Codec** | **Conversions** | **Aliases** | **Comment**
-:---: | :---: | --- | ---
-`camelcase` | text --> camel-case text | `camel` | no decoding
-`capitalize` | text <-> capitalized text | | decoding "uncapitalizes" the text
-`lowercase` | text <-> lowercase text | `lower` | decoding is `uppercase`
-`pascalcase` | text --> pascal-case text | `pascal` | no decoding
-`slugify` | text --> slug | `slug`, `kebab`, `kebabcase` | no decoding
-`snakecase` | text --> snake-case text | `snake` | no decoding
-`swapcase` | text <-> case-swapped text | `swap`, `invert`, `invertcase` |
-`title` | text <-> titled text | | decoding "untitles" the text
-`uppercase` | text <-> uppercase text | `upper` | decoding is `lowercase`
-
-Of course, these transformations have no interest while using them in Python as the `str` methods can be called. It can be useful while using `codext` from the terminal (see [*CLI tool*](cli.html)).
-
-Some simple examples:
-
-```sh
-$ echo -en "test string" | codext encode swap-case
-TEST STRING
-
-$ echo -en "test string" | codext encode camel_case
-testString
-
-$ echo -en "test string" | codext encode kebab_case
-test-string
-```
-
------
-
-### Dummy string operations
-
-These transformation functions are simple string transformations.
-
-**Codec** | **Conversions** | **Aliases** | **Comment**
-:---: | :---: | --- | ---
-`replace` | text <-> text with multi-chars replaced | | parametrized with a _string_ and its _replacement_
-`reverse` | text <-> reversed text | |
-`reverse-words` | text <-> reversed words | | same as `reverse` but not on the whole text, only on the words (text split by whitespace)
-`strip-spaces` | text <-> all whitespaces stripped | |
-`substitute` | text <-> text with token substituted | |
-`tokenize` | text <-> text split in tokens of length N | | parametrized with _N_
-
-As in the previous section, these transformations have no interest while using them in Python but well while using `codext` from the terminal (see [*CLI tool*](cli.html)).
-
-A simple example:
-
-```sh
-$ echo -en "test string" | codext encode reverse-words | codext encode reverse replace-\ _
-string_test
-```
-
-Another example:
-
-```sh
-$ echo -en "3132333435" | codext encode tokenize-2
-31 32 33 34 35
-```
-
-Or using encodings chaining:
-
-```sh
-$ echo -en "test string" | codext encode reverse-words reverse substitute-string/phrase
-phrase test
-```
-
+`codext` also defines multiple dummy string manipulation/transformation codecs, essentially for use with the CLI tool and for the sake of simplicity.
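+
+For instance, with the `title` codec described below (the output is assumed from its `str.title`-like semantics):
+
+```sh
+$ echo -en "test string" | codext encode title
+Test String
+```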
+
+-----
+
+### Case-related operations
+
+These transformation functions are simple string transformations, including `str`'s methods.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`camelcase` | text --> camel-case text | `camel` | no decoding
+`capitalize` | text <-> capitalized text | | decoding "uncapitalizes" the text
+`lowercase` | text <-> lowercase text | `lower` | decoding is `uppercase`
+`pascalcase` | text --> pascal-case text | `pascal` | no decoding
+`screamingsnakecase` | text --> screaming-snake-case text | `screaming-snake`, `screaming_snake_case` | no decoding
+`slugify` | text --> slug | `slug`, `kebab`, `kebabcase` | no decoding
+`snakecase` | text --> snake-case text | `snake` | no decoding
+`swapcase` | text <-> case-swapped text | `swap`, `invert`, `invertcase` |
+`title` | text <-> titled text | | decoding "untitles" the text
+`uppercase` | text <-> uppercase text | `upper` | decoding is `lowercase`
+
+Of course, these transformations are of little interest in Python itself, as the `str` methods can be called directly ; they become useful when using `codext` from the terminal (see [*CLI tool*](cli.html)).
+
+Some simple examples:
+
+```sh
+$ echo -en "test string" | codext encode swap-case
+TEST STRING
+
+$ echo -en "test string" | codext encode camel_case
+testString
+
+$ echo -en "test string" | codext encode kebab_case
+test-string
+```
+
+-----
+
+### Dummy string operations
+
+These transformation functions are simple string transformations.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`replace` | text <-> text with multi-chars replaced | | parametrized with a _string_ and its _replacement_
+`reverse` | text <-> reversed text | |
+`reverse-words` | text <-> reversed words | | same as `reverse` but not on the whole text, only on the words (text split by whitespace)
+`strip-spaces` | text <-> all whitespaces stripped | |
+`substitute` | text <-> text with token substituted | |
+`tokenize` | text <-> text split in tokens of length N | | parametrized with _N_
+
+As in the previous section, these transformations are of little interest in Python but are useful when using `codext` from the terminal (see [*CLI tool*](cli.html)).
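+
+For instance, `strip-spaces` strips every whitespace character (the output is assumed from the table above):
+
+```sh
+$ echo -en "test string" | codext encode strip-spaces
+teststring
+```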
+ +A simple example: + +```sh +$ echo -en "test string" | codext encode reverse-words | codext encode reverse replace-\ _ +string_test +``` + +Another example: + +```sh +$ echo -en "3132333435" | codext encode tokenize-2 +31 32 33 34 35 +``` + +Or using encodings chaining: + +```sh +$ echo -en "test string" | codext encode reverse-words reverse substitute-string/phrase +phrase test +``` + diff --git a/pyproject.toml b/pyproject.toml index 099d04b..b204596 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,17 +16,13 @@ authors = [ description = "Native codecs extension" license = {file = "LICENSE"} keywords = ["python", "development", "programming", "codecs", "encodings"] -requires-python = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,<4" +requires-python = ">=3.8,<4" classifiers = [ "Development Status :: 5 - Production/Stable", "Environment :: Console", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", @@ -34,9 +30,7 @@ classifiers = [ "Topic :: Software Development :: Libraries :: Python Modules", ] dependencies = [ - "markdown2==2.3.10; python_version=='2.7'", - "markdown2>=2.4.0; python_version>='3.6'", - "six", + "markdown2>=2.4.0", ] dynamic = ["version"] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..fcccae1 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +pythonpath = src diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index a4cc557..d3fbbb2 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.14.2 +1.15.0 diff --git a/src/codext/__common__.py b/src/codext/__common__.py index d88dcbe..a2ff0ef 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -1,1517 +1,1520 @@ -# -*- coding: UTF-8 -*- -import _codecs -import codecs -import json -import os -import random -import re -import sys -from encodings.aliases import aliases as ALIASES -from functools import reduce, update_wrapper, wraps -from importlib import import_module -from inspect import currentframe -from itertools import chain, product -from locale import getlocale -from math import log -from pkgutil import iter_modules -from platform import system -from random import randint -from six import binary_type, string_types, text_type, BytesIO -from string import * -from types import FunctionType, ModuleType -try: # Python2 - import __builtin__ as builtins -except ImportError: - import builtins -try: # Python2 - from inspect import getfullargspec -except ImportError: - from inspect import getargspec as getfullargspec -try: # Python2 - from string import maketrans -except ImportError: - maketrans = str.maketrans -try: # Python3 - from importlib import reload -except ImportError: - pass -try: # from Python 3.11, it seems that 'sre_parse' is not bound to 're' anymore - re.sre_parse -except AttributeError: - import sre_parse as __sre_parse - re.sre_parse = __sre_parse - - -__all__ = ["add", "add_macro", "add_map", "b", "clear", "codecs", "decode", "encode", "ensure_str", "examples", "guess", - "isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "i2s", "is_native", - "list_categories", "list_encodings", "list_macros", 
"lookup", "maketrans", "os", "rank", "re", "register", - "remove", "reset", "s2i", "search", "stopfunc", "BytesIO", "_input", "_stripl", "CodecMacro", - "DARWIN", "LANG", "LINUX", "MASKS", "PY3", "UNIX", "WINDOWS"] -CODECS_REGISTRY = None -CODECS_OVERWRITTEN = [] -CODECS_CATEGORIES = ["native", "custom"] -CODECS_CACHE = {} -LANG = getlocale() -if LANG: - LANG = (LANG[0] or "")[:2].lower() -MASKS = { - 'a': printable, - 'b': "".join(chr(i) for i in range(256)), - 'd': digits, - 'h': digits + "abcdef", - 'H': digits + "ABCDEF", - 'l': ascii_lowercase, - 'p': punctuation, - 's': " ", - 'u': ascii_uppercase, -} - -__codecs_registry = [] - -MACROS = {} -PERS_MACROS = {} -PERS_MACROS_FILE = os.path.expanduser("~/.codext-macros.json") - -DARWIN = system() == "Darwin" -LINUX = system() == "Linux" -PY3 = sys.version[0] == "3" -UNIX = DARWIN or LINUX -WINDOWS = system() == "Windows" - -entropy = lambda s: -sum([p * log(p, 2) for p in [float(s.count(c)) / len(s) for c in set(s)]]) - -isb = lambda s: isinstance(s, binary_type) -iss = lambda s: isinstance(s, string_types) -fix = lambda x, ref: b(x) if isb(ref) else ensure_str(x) if iss(ref) else x - -s2i = lambda s: int(codecs.encode(s, "base16"), 16) -exc_name = lambda e: "".join(t.capitalize() for t in re.split(r"[-_+]", e)) - - -def i2s(input): - h = hex(input)[2:].rstrip("eL") - return codecs.decode(h.zfill(len(h) + len(h) % 2), "hex") - - -class CodecMacro(tuple): - """Macro details when looking up the codec registry. """ - def __new__(cls, name): - self = tuple.__new__(cls) - self.name = name - # get from personal macros first - try: - self.codecs = PERS_MACROS[name] - except KeyError: - try: - self.codecs = MACROS[name] - except KeyError: - raise LookupError("unknown macro: %s" % name) - if not isinstance(self.codecs, (tuple, list)): - raise ValueError("bad macro list: %s" % str(self.codecs)) - self.codecs = [lookup(e, False) for e in self.codecs] # lookup(e, False) - self.parameters = {'name': name, 'category': "macro"} # ^ means that macros won't be nestable - # test examples to check that the chain of encodings works - for action, examples in (self.codecs[0].parameters.get('examples', {}) or {'enc-dec(': ["T3st str!"]}).items(): - if re.match(r"enc(-dec)?\(", action): - for e in (examples.keys() if action.startswith("enc(") else examples or []): - rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) - if rd: - for n in (rd.group(2) or "512").split(","): - s = "".join(chr(randint(0, 255)) for i in range(int(n))) - self.encode(s.lower() if rd.group(1) else s) - continue - self.encode(e) - - class Codec: - decode = self.decode - encode = self.encode - - class IncrementalEncoder(codecs.IncrementalEncoder): - def encode(self, input, final=False): - return b(self.encode(input, self.errors)[0]) - self.incrementalencoder = IncrementalEncoder - - class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input, final=False): - return ensure_str(self.decode(input, self.errors)[0]) - self.incrementaldecoder = IncrementalDecoder - - class StreamWriter(Codec, codecs.StreamWriter): - charbuffertype = bytes - self.streamwriter = StreamWriter - - class StreamReader(Codec, codecs.StreamReader): - charbuffertype = bytes - self.streamreader = StreamReader - - return self - - def decode(self, input, error="strict"): - """ Decode with each codec in reverse order. """ - for ci in self.codecs[::-1]: - input, l = ci.decode(input, error) - return input, l - - def encode(self, input, error="strict"): - """ Encode with each codec. 
""" - for ci in self.codecs: - input, l = ci.encode(input, error) - return input, l - - def __repr__(self): - return "" % (self.name, id(self)) - - -# inspired from: https://stackoverflow.com/questions/10875442/possible-to-change-a-functions-repr-in-python -class Repr(object): - def __init__(self, name, func): - self.__name = name - self.__func = func - update_wrapper(self, func) - - def __call__(self, *args, **kwargs): - return self.__func(*args, **kwargs) - - def __repr__(self): - return "" % (self.__name, id(self)) - - -def __stdin_pipe(): - """ Stdin pipe read function. """ - try: - with open(0, 'rb') as f: - for l in f: - yield l - except TypeError: - for l in sys.stdin: - yield l - - -def _input(infile): - # handle input file or stdin - c = b("") - if infile: - with open(infile, 'rb') as f: - c = f.read() - else: - for line in __stdin_pipe(): - c += line - return c - - -def _set_exc(name, etype="ValueError"): - if not hasattr(builtins, name): - exec("class %s(%s): __module__ = 'builtins'" % (name, etype)) - setattr(builtins, name, locals()[name]) -_set_exc("InputSizeLimitError") -_set_exc("ParameterError") - - -def _stripl(s, st_lines, st_crlf): - if st_crlf: - s = s.replace(b"\r\n", b"") if isb(s) else s.replace("\r\n", "") - if st_lines: - s = s.replace(b"\n", b"") if isb(s) else s.replace("\n", "") - return s - - -def _with_repr(name): - def _wrapper(f): - return Repr(name, f) - return _wrapper - - -def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False, **kwargs): - """ This adds a new codec to the codecs module setting its encode and/or decode functions, eventually dynamically - naming the encoding with a pattern and with file handling. - - :param ename: encoding name - :param encode: encoding function or None - :param decode: decoding function or None - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) 
but will make it impossible - to remove the codec later - """ - remove(ename) - if encode: - if not isinstance(encode, FunctionType): - raise ValueError("Bad 'encode' function") - _set_exc("%sEncodeError" % exc_name(ename)) # create the custom encode exception as a builtin - if decode: - if not isinstance(decode, FunctionType): - raise ValueError("Bad 'decode' function") - _set_exc("%sDecodeError" % exc_name(ename)) # create the custom decode exception as a builtin - if not encode and not decode: - raise ValueError("At least one en/decoding function must be defined") - for exc in kwargs.get('extra_exceptions', []): - _set_exc(exc) # create additional custom exceptions as builtins - glob = currentframe().f_back.f_globals - # search function for the new encoding - @_with_repr(ename) - def getregentry(encoding): - if encoding != ename and not (pattern and re.match(pattern, encoding)): - return - fenc, fdec, name = encode, decode, encoding - # prepare CodecInfo input arguments - if pattern: - m, args, i = re.match(pattern, encoding), [], 1 - try: - while True: - try: - g = m.group(i) or "" - if g.isdigit() and not g.startswith("0") and "".join(set(g)) != "01": - g = int(g) - args += [g] - i += 1 - except AttributeError: - # this occurs when m is None or there is an error in fenc(g) or fdec(g), meaning no match - if m is not None: - raise - return - except IndexError: - # this occurs while m is not None, but possibly no capture group that gives at least 1 group index ; - # in this case, if fenc/fdec is a decorated function, execute it with no arg - if len(args) == 0: - if fenc and len(getfullargspec(fenc).args) == 1: - fenc = fenc() - if fdec and len(getfullargspec(fdec).args) == 1: - fdec = fdec() - else: - fenc = fenc(*args) if fenc else fenc - fdec = fdec(*args) if fdec else fdec - if fenc: - fenc = fix_inout_formats(fenc) - if fdec: - fdec = fix_inout_formats(fdec) - sl, sc = kwargs.pop('strip_lines', False), kwargs.pop('strip_crlf', False) - if sl or sc: - def _striplines(f): - def __wrapper(input, *a, **kw): - return f(_stripl(input, sc, sl), *a, **kw) - return __wrapper - # this fixes issues with wrapped encoded inputs - fdec = _striplines(fdec) - - class Codec(codecs.Codec): - def encode(self, input, errors="strict"): - if fenc is None: - raise NotImplementedError - return fenc(input, errors) - - def decode(self, input, errors="strict"): - if fdec is None: - raise NotImplementedError - return fdec(input, errors) - - class IncrementalEncoder(codecs.IncrementalEncoder): - def encode(self, input, final=False): - if fenc is None: - raise NotImplementedError - return b(fenc(input, self.errors)[0]) - - class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input, final=False): - if fdec is None: - raise NotImplementedError - return ensure_str(fdec(input, self.errors)[0]) - - class StreamWriter(Codec, codecs.StreamWriter): - charbuffertype = bytes - - class StreamReader(Codec, codecs.StreamReader): - charbuffertype = bytes - - ci = codecs.CodecInfo( - name=name, - encode=Codec().encode, - decode=Codec().decode, - incrementalencoder=IncrementalEncoder, - incrementaldecoder=IncrementalDecoder, - streamwriter=StreamWriter, - streamreader=StreamReader, - _is_text_encoding=text, - ) - ci.parameters = kwargs - ci.parameters['name'] = ename - ci.parameters['add_to_codecs'] = add_to_codecs - ci.parameters['pattern'] = pattern - ci.parameters['text'] = text - f = glob.get('__file__', os.path.join("custom", "_")) - cat = f.split(os.path.sep)[-2].rstrip("s") - if cat not in 
CODECS_CATEGORIES: - CODECS_CATEGORIES.append(cat) - ci.parameters['category'] = kwargs.get('category', cat) - ci.parameters['examples'] = kwargs.get('examples', glob.get('__examples__')) - ci.parameters['guess'] = kwargs.get('guess', glob.get('__guess__', [ename])) or [] - ci.parameters['module'] = kwargs.get('module', glob.get('__name__')) - ci.parameters.setdefault("scoring", {}) - for attr in ["bonus_func", "entropy", "expansion_factor", "len_charset", "penalty", "printables_rate", - "padding_char", "transitive"]: - a = kwargs.pop(attr, None) - if a is not None: - ci.parameters['scoring'][attr] = a - return ci - - getregentry.__name__ = re.sub(r"[\s\-]", "_", ename) - if kwargs.get('aliases'): - getregentry.__aliases__ = list(map(lambda n: re.sub(r"[\s\-]", "_", n), kwargs['aliases'])) - getregentry.__pattern__ = pattern - register(getregentry, add_to_codecs) - return getregentry - - -def add_macro(mname, *encodings): - """ This allows to define a macro, chaining multiple codecs one after the other. This relies on a default set of - macros from a YAML file embedded in the package and a local YAML file from the home folder that takes - precedence for defining personal macros. - - :param mname: macro name - :param encodings: encoding names of the encodings to be chained with the macro - """ - global PERS_MACROS - # check for name clash with alreday existing macros and codecs - if mname in MACROS or mname in PERS_MACROS: - raise ValueError("Macro name already exists") - try: - ci = lookup(mname, False) - raise ValueError("Macro name clashes with codec '%s'" % ci.name) - except LookupError: - pass - try: - PERS_MACROS[mname] = encodings - CodecMacro(mname) - with open(PERS_MACROS_FILE, 'w') as f: - json.dump(PERS_MACROS, f, indent=2) - except ValueError: - del PERS_MACROS[mname] - raise -codecs.add_macro = add_macro - - -def add_map(ename, encmap, repl_char="?", sep="", ignore_case=None, no_error=False, intype=None, outype=None, **kwargs): - """ This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs - module dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with - a pattern and with file handling (if text is True). - - :param ename: encoding name - :param encmap: characters encoding map ; can be a dictionary of encoding maps (for use with the first capture - group of the regex pattern) or a function building the encoding map - :param repl_char: replacement char (used when errors handling is set to "replace") - :param sep: string of possible character separators (hence, only single-char separators are considered) ; - - while encoding, the first separator is used - - while decoding, separators can be mixed in the input text - :param ignore_case: ignore text case while encoding and/or decoding - :param no_error: this encoding triggers no error (hence, always in "leave" errors handling) - :param intype: specify the input type for pre-transforming the input text - :param outype: specify the output type for post-transforming the output text - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) 
but will make it impossible - to remove the codec later - """ - outype = outype or intype - if ignore_case not in [None, "encode", "decode", "both"]: - raise ValueError("Bad ignore_case parameter while creating encoding map") - if intype not in [None, "str", "bin", "ord"]: - raise ValueError("Bad input type parameter while creating encoding map") - if outype not in [None, "str", "bin", "ord"]: - raise ValueError("Bad output type parameter while creating encoding map") - - def __generic_code(decode=False): - def _wrapper(param): - """ The parameter for wrapping comes from the encoding regex pattern ; e.g. - [no pattern] => param will be None everytime - r"barbie[-_]?([1-4])$" => param could be int 1, 2, 3 or 4 - r"^morse(|[-_]?.{3})$" => param could be None, "-ABC" (for mapping to ".-/") - - In order of precedence: - 1. when param is a key in mapdict or mapdict is a list of encoding maps (hence in the case of "barbie...", - param MUST be an int, otherwise for the first case it could clash with a character of the encoding map) - 2. otherwise handle it as a new encoding character map "ABC" translates to ".-/" for morse - """ - p = param - if isinstance(encmap, FunctionType): - mapdict = encmap(p) - p = None - else: - mapdict = encmap - if isinstance(mapdict, dict): - smapdict = {k: v for k, v in mapdict.items()} - elif isinstance(mapdict, list) and isinstance(mapdict[0], dict): - smapdict = {k: v for k, v in mapdict[0].items()} - else: - raise ValueError("Bad mapping dictionary or list of mapping dictionaries") - if p is not None: - # case 1: param is empty string - if p == "": - if isinstance(mapdict, list): - smapdict = {k: v for k, v in mapdict[0].items()} - elif isinstance(mapdict, dict): - if '' in mapdict.keys() and isinstance(mapdict[''], dict): - smapdict = {k: v for k, v in mapdict[''].items()} - else: - smapdict = {k: v for k, v in mapdict.items()} - # no 'else' handling a LookupError here ; this case is covered by the first if/elif/else block - # case 2: list or dictionary or dictionary of numbered encodings - elif isinstance(p, int): - # if mapdict is a list, we shall align the parameter (starting from 1) as an index (starting from 0) - if isinstance(mapdict, list): - p -= 1 - if isinstance(mapdict, list) and 0 <= p < len(mapdict) or \ - isinstance(mapdict, dict) and p in mapdict.keys(): - smapdict = {k: v for k, v in mapdict[p].items()} - else: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) - # case 3: dictionary of regex-selected encoding mappings - elif isinstance(mapdict, dict) and isinstance(list(mapdict.values())[0], dict): - tmp = None - for r, d in mapdict.items(): - if r == '': # this is already handled in case 1 ; anyway, an empty regex always matches, hence - continue # it must be excluded - if re.match(r, p): - tmp = d - break - if tmp is None: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) - smapdict = tmp - # case 4: encoding characters translation - else: - # collect base tokens in order of appearance in the mapping dictionary - base_tokens = "" - for _, c in sorted(mapdict.items()): - for t in c: - for st in t: - if st not in base_tokens: - base_tokens += st - if " " not in sep: - base_tokens = base_tokens.replace(" ", "") - if len(p) > 0 and p[0] in "-_" and len(p[1:]) == len(set(p[1:])) == len(base_tokens): - p = p[1:] - if len(p) == len(set(p)) == len(base_tokens): - t = maketrans(base_tokens, p) - for k, v in smapdict.items(): - smapdict[k] = [x.translate(t) for x in v] if isinstance(v, list) else 
v.translate(t) - else: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) - if ignore_case is not None: - cases = ["upper", "lower"] - case_d = cases[any(c in str(list(smapdict.values())) for c in "abcdefghijklmnopqrstuvwxyz")] - case_e = cases[any(c in str(list(smapdict.keys())) for c in "abcdefghijklmnopqrstuvwxyz")] - i = ignore_case - smapdict = {getattr(k, case_e)() if i in ["both", "encode"] else k: \ - ([getattr(x, case_d)() for x in v] if isinstance(v, list) else getattr(v, case_d)()) \ - if i in ["both", "decode"] else v for k, v in smapdict.items()} - if decode: - tmp = {} - # this has a meaning for encoding maps that could have clashes in encoded chars (e.g. Bacon's cipher ; - # I => abaaa but also J => abaaa, with the following, we keep I instead of letting J overwrite it) - for k, v in sorted(smapdict.items()): - if not isinstance(v, list): - v = [v] - for x in v: - if x not in tmp.keys(): - tmp[x] = k - smapdict, cs = tmp, reduce(lambda acc, x: acc + x, tmp.keys()) - kwargs['strip_lines'], kwargs['strip_crlf'] = "\n" not in set(cs), "\r\n" not in cs - # this allows to avoid an error with Python2 in the "for i, c in enumerate(parts)" loop - if '' not in smapdict.keys(): - smapdict[''] = "" - # determine token and result lengths - tmaxlen = max(map(len, smapdict.keys())) - tminlen = max(1, min(map(len, set(smapdict.keys()) - {''}))) - l = [] - for x in smapdict.values(): - getattr(l, ["append", "extend"][isinstance(x, list)])(x) - rminlen = max(1, min(map(len, set(l) - {''}))) - - # generic encoding/decoding function for map encodings - def code(text, errors="strict"): - icase = ignore_case == "both" or \ - decode and ignore_case == "decode" or \ - not decode and ignore_case == "encode" - if icase: - case = case_d if decode else case_e - if no_error: - errors = "leave" - text = ensure_str(text) - if not decode: - if intype == "bin": - text = "".join("{:0>8}".format(bin(ord(c))[2:]) for c in text) - elif intype == "ord": - text = "".join(str(ord(c)).zfill(3) for c in text) - r = "" - lsep = "" if decode else sep if len(sep) <= 1 else sep[0] - kind = ["character", "token"][tmaxlen > 1] - error_func = handle_error(ename, errors, lsep, repl_char, rminlen, decode, kind) - - # get the value from the mapping dictionary, trying the token with its inverted case if relevant - def __get_value(token, position, case_changed=False): - try: - result = smapdict[token] - except KeyError: - if icase and not case_changed: - token_inv_case = getattr(token, case)() - return __get_value(token_inv_case, position, True) - return error_func(token, position) - if isinstance(result, list): - result = result[0] - return result + lsep - - # if a separator is defined, rely on it by splitting the input text - if decode and len(sep) > 0: - for i, c in enumerate(re.split("[" + sep + "]", text)): - r += __get_value(c, i) - # otherwise, move through the text using a cursor for tokenizing it ; this allows defining more complex - # encodings with variable token lengths - else: - cursor, bad = 0, "" - while cursor < len(text): - token = text[cursor:cursor+1] - for l in range(tminlen, tmaxlen + 1): - token = text[cursor:cursor+l] - if token in smapdict.keys() or icase and getattr(token, case)() in smapdict.keys(): - r += __get_value(token, cursor) - cursor += l - break - else: - # collect bad chars and only move the cursor one char to the right - bad += text[cursor] - cursor += 1 - # if the number of bad chars is the minimum token length, consume it and start a new buffer - if len(bad) == 
tminlen or errors == "leave": - posn = cursor - len(bad) - r += error_func(bad, posn) - bad = "" - if decode: - if outype in ["bin", "ord"]: - tmp, r = "", r.replace(lsep, "") - step = [3, 8][outype == "bin"] - for i in range(0, len(r), step): - s = r[i:i+step] - try: - tmp += chr(int(s, 2) if outype == "bin" else int(s)) - except ValueError: - if len(s) > 0: - tmp += "[" + s + "]" - r = tmp + lsep - return r[:len(r)-len(lsep)], len(b(text)) - return code - if re.search(r"\([^(?:)]", kwargs.get('pattern', "")) is None: - # in this case, there is no capturing group for parametrization - return _wrapper(None) - return _wrapper - - glob = currentframe().f_back.f_globals - kwargs['category'] = glob['__file__'].split(os.path.sep)[-2].rstrip("s") - kwargs['examples'] = kwargs.get('examples', glob.get('__examples__')) - kwargs['encmap'] = encmap - kwargs['repl_char'] = repl_char - kwargs['sep'] = sep - kwargs['ignore_case'] = ignore_case - kwargs['no_error'] = no_error - kwargs['intype'] = intype - kwargs['outype'] = outype - kwargs['module'] = glob.get('__name__') - try: - if isinstance(encmap, dict): - smapdict = {k: v for k, v in encmap.items()} - elif isinstance(encmap, list) and isinstance(encmap[0], dict): - smapdict = {k: v for k, v in encmap[0].items()} - kwargs['repl_minlen'] = i = max(1, min(map(len, set(smapdict.values()) - {''}))) - kwargs['repl_minlen_b'] = max(1, min(map(len, map(b, set(smapdict.values()) - {''})))) - except: - pass - return add(ename, __generic_code(), __generic_code(True), **kwargs) -codecs.add_map = add_map - - -def clear(): - """ Clear codext's local registry of search functions. """ - global __codecs_registry, MACROS, PERS_MACROS - __codecs_registry, MACROS, PERS_MACROS = [], {}, {} -codecs.clear = clear - - -def examples(encoding, number=10): - """ Use the search function to get the matching encodings and provide examples of valid encoding names. """ - e = [] - for name in search(encoding): - for search_function in __codecs_registry: - n = search_function.__name__ - if name in [n, n.replace("_", "-")]: - temp = [] - for s in generate_strings_from_regex(search_function.__pattern__, yield_max=16*number): - temp.append(s) - random.shuffle(temp) - i = 0 - while i < min(number, len(temp)): - if not temp[i].isdigit(): - try: - lookup(temp[i], False) - e.append(temp[i]) - except LookupError: - pass - i += 1 - for alias, codec in ALIASES.items(): - if name == codec: - if codec not in e: - e.append(codec) - if not alias.isdigit(): - e.append(alias) - random.shuffle(e) - return sorted([e[i] for i in range(min(number, len(e)))], key=_human_keys) -codecs.examples = examples - - -def is_native(encoding): - """ Determine if a given encoding is native or not. """ - return lookup(encoding, False).parameters['category'] == "native" - - -def list_categories(): - """ Get a list of all codec categories. """ - c = CODECS_CATEGORIES - root = os.path.dirname(__file__) - for d in os.listdir(root): - if os.path.isdir(os.path.join(root, d)) and not d.startswith("__"): - c.append(d.rstrip("s")) - # particular category, hardcoded from base/_base.py - c += ["base-generic"] - return c -list_categories() - - -def list_encodings(*categories): - """ Get a list of all codecs. 
""" - # if "non-native" is in the input list, extend the list with the whole categories but "native" - categories, exclude = list(categories), [] - for c in categories[:]: - if c == "non-native": - for c in CODECS_CATEGORIES: - if c == "native" or c in categories: - continue - categories.append(c) - categories.remove("non-native") - if c.startswith("~"): - exclude.append(c[1:]) - categories.remove(c) - try: - categories.remove(c[1:]) - except ValueError: - pass - # now, filter codecs according to the input list of categories - enc = [] - if (len(categories) == 0 or "native" in categories) and "native" not in exclude: - for a in set(ALIASES.values()): - try: - ci = __orig_lookup(a) - except LookupError: - continue - if lookup(a) is ci: - enc.append(ci.name) - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - name = search_function.__name__.replace("_", "-") - p = search_function.__pattern__ - ci = search_function(name) if p is None else search_function(generate_string_from_regex(p)) - c = "other" if ci is None else ci.parameters['category'] - if (len(categories) == 0 or c in categories) and c not in exclude: - enc.append(name) - for category in categories: - if category not in CODECS_CATEGORIES: - raise ValueError("Category '%s' does not exist" % category) - return sorted(list(set(enc)), key=_human_keys) - - -def list_macros(): - """ Get a list of all macros, with the precedence on personal ones. """ - return sorted(list(set(list(MACROS.keys()) + list(PERS_MACROS.keys())))) - - -def remove(name): - """ Remove all search functions matching the input encoding name from codext's local registry or any macro with the - given name. """ - global __codecs_registry, MACROS, PERS_MACROS - tbr = [] - for search_function in __codecs_registry: - if search_function(name) is not None: - tbr.append(search_function) - for search_function in tbr: - __codecs_registry.remove(search_function) - try: - del MACROS[name] - except KeyError: - pass - try: - del PERS_MACROS[name] - with open(PERS_MACROS_FILE, 'w') as f: - json.dump(PERS_MACROS, f, indent=2) - except KeyError: - pass - try: - del CODECS_CACHE[name] - except KeyError: - pass - for s in ["En", "De"]: - try: - delattr(builtins, "%s%scodeError" % (name.capitalize(), s)) - except AttributeError: - pass -codecs.remove = remove - - -def reset(): - """ Reset codext's local registry of search functions and macros. """ - global __codecs_registry, CODECS_REGISTRY, MACROS, PERS_MACROS - clear() - d = os.path.dirname(__file__) - for pkg in sorted(os.listdir(d)): - if pkg.startswith("_") or not os.path.isdir(os.path.join(d, pkg)): - continue - reload(import_module("codext." + pkg)) - # backup codext's registry - if CODECS_REGISTRY is None: - CODECS_REGISTRY = __codecs_registry[:] - # restore codext's registry - else: - __codecs_registry = CODECS_REGISTRY[:] - # restore codext's embedded set of macros - with open(os.path.join(os.path.dirname(__file__), "macros.json")) as f: - MACROS = json.load(f) - # reload personal set of macros - PERS_MACROS = {} - if os.path.exists(PERS_MACROS_FILE): - with open(PERS_MACROS_FILE) as f: - PERS_MACROS = json.load(f) -codecs.reset = reset - - -# conversion functions -def b(s): - """ Non-crashing bytes conversion function. """ - if PY3: - try: - return s.encode("latin-1") - except: - pass - try: - return s.encode("utf-8") - except: - pass - return s - - -def ensure_str(s, encoding='utf-8', errors='strict'): - """ Similar to six.ensure_str. Adapted here to avoid messing up with six version errors. 
""" - if not PY3 and isinstance(s, text_type): - return s.encode(encoding, errors) - elif PY3 and isinstance(s, binary_type): - try: - return s.decode(encoding, errors) - except: - return s.decode("latin-1") - return s - - -# make conversion functions compatible with input/output strings/bytes -def fix_inout_formats(f): - """ This decorator ensures that the first output of f will have the same text format as the first input (str or - bytes). """ - @wraps(f) - def _wrapper(*args, **kwargs): - a0 = args[0] - a0_isb = isb(a0) - a0 = ensure_str(a0) if iss(a0) or a0_isb else a0 - r = f(a0, *args[1:], **kwargs) - # special case: input is in bytes ; ensure that the returned length is this of the bytes, not this processed by - # the decode/encode function - if isinstance(r, (tuple, list)) and isinstance(r[1], int) and a0_isb: - r = tuple([list(r)[0]] + [len(args[0])] + list(r)[2:]) - return (fix(r[0], args[0]), ) + r[1:] if isinstance(r, (tuple, list)) else fix(r, args[0]) - return _wrapper - - -# alphabet generation function from a given mask -def get_alphabet_from_mask(mask): - """ This function generates an alphabet from the given mask. The style used is similar to Hashcat ; group keys are - marked with a heading "?". """ - i, alphabet = 0, "" - while i < len(mask): - c = mask[i] - if c == "?" and i < len(mask) - 1 and mask[i+1] in MASKS.keys(): - for c in MASKS[mask[i+1]]: - if c not in alphabet: - alphabet += c - i += 1 - elif c not in alphabet: - alphabet += c - i += 1 - return alphabet - - -# generic error handling function -def handle_error(ename, errors, sep="", repl_char="?", repl_minlen=1, decode=False, kind="character", item="position"): - """ This shortcut function allows to handle error modes given some tuning parameters. - - :param ename: encoding name - :param errors: error handling mode - :param sep: token separator - :param repl_char: replacement character (for use when errors="replace") - :param repl_minlen: repeat number for the replacement character - :param decode: whether we are encoding or decoding - :param item: position item description (for describing the error ; e.g. "group" or "token") - """ - exc = "%s%scodeError" % (exc_name(ename), ["En", "De"][decode]) - - def _handle_error(token, position, output="", eename=None): - """ This handles an encoding/decoding error according to the selected handling mode. - - :param token: input token to be encoded/decoded - :param position: token position index - :param output: output, as decoded up to the position of the error - """ - if errors == "strict": - msg = "'%s' codec can't %scode %s '%s' in %s %d" - token = ensure_str(token) - token = token[:7] + "..." 
if len(token) > 10 else token - err = getattr(builtins, exc)(msg % (eename or ename, ["en", "de"][decode], kind, token, item, position)) - err.output = output - err.__cause__ = err - raise err - elif errors == "leave": - return token + sep - elif errors == "replace": - return repl_char * repl_minlen + sep - elif errors == "ignore": - return "" - else: - raise ValueError("Unsupported error handling '{}'".format(errors)) - return _handle_error - - -# codecs module hooks -__orig_lookup = _codecs.lookup -__orig_register = _codecs.register - - -def __add(ename, encode=None, decode=None, pattern=None, text=True, **kwargs): - kwargs.pop('add_to_codecs', None) - return add(ename, encode, decode, pattern, text, True, **kwargs) -__add.__doc__ = add.__doc__ -codecs.add = __add - - -def decode(obj, encoding='utf-8', errors='strict'): - """ Custom decode function relying on the hooked lookup function. """ - return lookup(encoding).decode(obj, errors)[0] -codecs.decode = decode - - -def encode(obj, encoding='utf-8', errors='strict'): - """ Custom encode function relying on the hooked lookup function. """ - n, m = 1, re.search(r"\[(\d+)\]$", encoding) - if m: - n = int(m.group(1)) - encoding = re.sub(r"\[(\d+)\]$", "", encoding) - ci = lookup(encoding) - for i in range(n): - obj = ci.encode(obj, errors)[0] - return obj -codecs.encode = encode - - -def lookup(encoding, macro=True): - """ Hooked lookup function for searching first for codecs in the local registry of this module. """ - # first, try to match the given encoding with codecs' search functions - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - codecinfo = search_function(encoding) - if codecinfo is not None: - return codecinfo - # then, if a codec name was given, generate an encoding name from its pattern and get the CodecInfo - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - if search_function.__name__.replace("_", "-") == encoding or \ - encoding in getattr(search_function, "__aliases__", []): - codecinfo = search_function(generate_string_from_regex(search_function.__pattern__)) - if codecinfo is not None: - return codecinfo - # finally, get a CodecInfo with the original lookup function and refine it with a dictionary of parameters - try: - ci = __orig_lookup(encoding) - ci.parameters = {'category': "native", 'module': "codecs", 'name': ALIASES.get(ci.name, ci.name)} - return ci - except LookupError: - if not macro: - raise - try: - return CodecMacro(encoding) - except LookupError: - e = LookupError("unknown encoding: %s" % encoding) - e.__cause__ = e # stop exception chaining - raise e -codecs.lookup = lookup - - -def register(search_function, add_to_codecs=False): - """ Register function for registering new codecs in the local registry of this module and, if required, in the - native codecs registry (for use with the built-in 'open' function). - - :param search_function: search function for the codecs registry - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) but will make it impossible - to remove the codec later - """ - if search_function not in __codecs_registry: - try: - __orig_lookup(search_function.__name__) - l = CODECS_OVERWRITTEN - except LookupError: - l = __codecs_registry - l.append(search_function) - if add_to_codecs: - __orig_register(search_function) - - -def __register(search_function): - """ Same as register(...), but with add_to_codecs set by default to True. 
""" - register(search_function, True) -codecs.register = __register - - -def search(encoding_regex, extended=True): - """ Function similar to lookup but allows to search for an encoding based on a regex instead. It searches this way - into the local registry but also tries a simple lookup with the original lookup function. """ - matches = [] - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - n = search_function.__name__ - for name in [n, n.replace("_", "-")]: - if re.search(encoding_regex, name): - matches.append(n.replace("_", "-")) - continue - if extended: - # in some cases, encoding_regex can match a generated string that uses a particular portion of its - # generating pattern ; e.g. we expect encoding_regex="uu_" to find "uu" and "uu_codec" while it can also - # find "morse" or "atbash" very rarely because of their dynamic patterns and the limited number of randomly - # generated strings - # so, we can use a qualified majority voting to ensure we do not get a "junk" encoding in the list of - # matches ; executing 5 times the string generation for a given codec but adding the codec to the list of - # matches only if we get at least 3 matches ensures that we consider up to 2 failures that could be - # stochastic, therefore drastically decreasing the probability to get a "junk" encoding in the matches list - c = 0 - for i in range(5): - for s in generate_strings_from_regex(search_function.__pattern__): - if re.search(encoding_regex, s): - c += 1 - break - if c >= 3: - matches.append(n) - break - for s, n in ALIASES.items(): - if re.search(encoding_regex, s) or re.search(encoding_regex, n): - matches.append(n) - return sorted(list(set(matches)), key=_human_keys) -codecs.search = search - - -# utility function for the search feature -CATEGORIES = { - 'digit': digits, - 'not_digit': reduce(lambda x, c: x.replace(c, ""), digits, printable), - 'space': whitespace, - 'not_space': reduce(lambda x, c: x.replace(c, ""), whitespace, printable), - 'word': ascii_letters + digits + '_', - 'not_word': reduce(lambda x, c: x.replace(c, ""), ascii_letters + digits + '_', printable), -} -REPEAT_MAX = 10 -STAR_PLUS_MAX = 10 -YIELD_MAX = 100 - - -def __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max, parsed=False): - """ Recursive function to generate strings from a regex pattern. 
""" - if regex is None: - return - __groups = {} - tokens = [] - negate, last_rand = False, None - for state in (regex if parsed else re.sre_parse.parse(b(getattr(regex, "pattern", regex)))): - code = getattr(state[0], "name", state[0]).lower() - value = getattr(state[1], "name", state[1]) - value = value.lower() if isinstance(value, str) else value - if code in ["assert_not", "at"]: - continue - elif code == "any": - charset = list(printable.replace("\n", "")) - while charset[0] == last_rand and len(charset) > 1: - random.shuffle(charset) - last_rand = charset[0] - tokens.append(charset) # should be ord(x) with x belongs to [0, 256[ - elif code == "assert": - tokens.append(list(__gen_str_from_re(value[1], star_plus_max, repeat_max, yield_max, True))) - elif code == "branch": - result = [] - for r in value[1]: - result += list(__gen_str_from_re(r, star_plus_max, repeat_max, yield_max, True)) or [""] - tokens.append(result) - elif code == "category": - charset = list(CATEGORIES[value[9:]]) - if negate: - negate = False - charset = list(set(printable).difference(charset)) - while charset[0] == last_rand and len(charset) > 1: - random.shuffle(charset) - last_rand = charset[0] - tokens.append(charset) - elif code == "groupref": - tokens.extend(__groups[value]) - elif code == "in": - subtokens = list(__gen_str_from_re(value, star_plus_max, repeat_max, yield_max, True)) - subtokens = [x for l in subtokens for x in l] - tokens.append(subtokens) - elif code == "literal": - tokens.append(chr(value)) - elif code in ["max_repeat", "min_repeat"]: - start, end = value[:2] - end = min(end, star_plus_max) - start = min(start, end) - charset = list(__gen_str_from_re(value[-1], star_plus_max, repeat_max, yield_max, True)) - subtokens = [] - if start == 0 and end == 1: - subtokens.append("") - subtokens.extend(charset) - elif len(charset) ** end > repeat_max: - for i in range(min(repeat_max, 10 * len(charset))): - n = random.randint(start, end + 1) - token = "" if n == 0 else "".join(random.choice(charset) for i in range(n)) - if token not in subtokens: - subtokens.append(token) - else: - i -= 1 - else: - for n in range(start, end + 1): - for c in product(charset, repeat=n): - subtokens.append("".join(c)) - tokens.append(subtokens) - elif code == "negate": - negate = True - elif code == "not_literal": - charset = list(printable.replace(chr(value), "")) - while charset[0] == last_rand and len(charset) > 1: - random.shuffle(charset) - last_rand = charset[0] - tokens.append(charset) - elif code == "range": - tokens.append("".join(chr(i) for i in range(value[0], value[1] + 1))) - elif code == "subpattern": - result = list(__gen_str_from_re(value[-1], star_plus_max, repeat_max, yield_max, True)) - if value[0]: - __groups[value[0]] = result - tokens.append(result) - else: - raise NotImplementedError("Unhandled code '{}'".format(code)) - if len(tokens) == 0: - tokens = [""] - i = 0 - for result in product(*tokens): - yield "".join(result) - i += 1 - if i >= yield_max: - break - - -def _human_keys(text): - """ Sorting function for considering strings with numbers (e.g. base2, base10, base100) """ - tokens = [] - for s in re.split(r"(\d+|\D+)", text): - tokens.append(int(s) if s.isdigit() else s) - return tokens - - -def generate_string_from_regex(regex): - """ Utility function to generate a single string from a regex pattern. 
""" - if regex: - return list(generate_strings_from_regex(regex, yield_max=1))[0] - - -def generate_strings_from_regex(regex, star_plus_max=STAR_PLUS_MAX, repeat_max=REPEAT_MAX, yield_max=YIELD_MAX): - """ Utility function to generate strings from a regex pattern. """ - i = 0 - for result in __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max): - yield result - - -# guess feature objects -__module_exists = lambda n: n in [x[1] for x in iter_modules()] -stopfunc = ModuleType("stopfunc", """ - Predefined stop functions - ~~~~~~~~~~~~~~~~~~~~~~~~~ - - This submodule contains stop functions for the guess feature of codext. - - - `flag`: searches for the pattern "[Ff][Ll1][Aa4@][Gg9]" (either UTF-8 or UTF-16) - - `lang_**`: checks if the given lang (any from the PROFILES_DIRECTORY of the langdetect module) is detected - - `printables`: checks that every output character is in the set of printables - - `regex`: takes one argument, the regular expression, for checking a string against the given pattern - - `text`: checks for printables and an entropy less than 4.6 (empirically determined) -""") -stopfunc.printables = lambda s: all(c in printable for c in ensure_str(s)) -stopfunc.printables.__name__ = stopfunc.printables.__qualname__ = "printables" -stopfunc.regex = lambda p: lambda s: re.search(p, ensure_str(s)) is not None -stopfunc.regex.__name__ = stopfunc.regex.__qualname__ = "regex" -stopfunc.text = lambda s: stopfunc.printables(s) and entropy(s) < 4.6 -stopfunc.text.__name__ = stopfunc.text.__qualname__ = "text" -stopfunc.flag = lambda x: re.search(r"[Ff][Ll1][Aa4@][Gg96]", ensure_str(x)) is not None -stopfunc.flag.__name__ = stopfunc.flag.__qualname__ = "flag" -stopfunc.default = stopfunc.text - -stopfunc.LANG_BACKEND = None -stopfunc.LANG_BACKENDS = [n for n in ["pycld2", "langdetect", "langid", "cld3", "textblob"] if __module_exists(n)] -if len(stopfunc.LANG_BACKENDS) > 0: - stopfunc.LANG_BACKEND = stopfunc.LANG_BACKENDS[0] -if "cld3" in stopfunc.LANG_BACKENDS: - stopfunc.CLD3_LANGUAGES = "af|am|ar|bg|bn|bs|ca|ce|co|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|fy|ga|gd|gl|gu|ha|" \ - "hi|hm|hr|ht|hu|hy|id|ig|is|it|iw|ja|jv|ka|kk|km|kn|ko|ku|ky|la|lb|lo|lt|lv|mg|mi|mk|" \ - "ml|mn|mr|ms|mt|my|ne|nl|no|ny|pa|pl|ps|pt|ro|ru|sd|si|sk|sl|sm|sn|so|sq|sr|st|su|sv|" \ - "sw|ta|te|tg|th|tr|uk|ur|uz|vi|xh|yi|yo|zh|zu".split("|") -if "textblob" in stopfunc.LANG_BACKENDS: - stopfunc.TEXTBLOB_LANGUAGES = "af|ar|az|be|bg|bn|ca|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|ga|gl|gu|hi|hr|ht|hu|" \ - "id|is|it|iw|ja|ka|kn|ko|la|lt|lv|mk|ms|mt|nl|no|pl|pt|ro|ru|sk|sl|sq|sr|sv|sw|ta|" \ - "te|th|tl|tr|uk|ur|vi|yi|zh".split("|") - - -def _detect(text): - _lb, t = stopfunc.LANG_BACKEND, ensure_str(text) - if _lb is None: - raise ValueError("No language backend %s" % ["selected", "installed"][len(stopfunc.LANG_BACKENDS) == 0]) - return langid.classify(t)[0] if _lb == "langid" else \ - langdetect.detect(t) if _lb == "langdetect" else \ - pycld2.detect(t)[2][0][1] if _lb == "pycld2" else \ - cld3.get_language(t).language[:2] if _lb == "cld3" else \ - textblob.TextBlob(t).detect_language()[:2] - - -def _lang(lang): - def _test(s): - if not stopfunc.text(s): - return False - try: - return _detect(ensure_str(s))[:2] == lang - except: - return False - return _test - - -def _load_lang_backend(backend=None): - # import the requested backend library if not imported yet - if backend is None or backend in stopfunc.LANG_BACKENDS: - stopfunc.LANG_BACKEND = backend - if backend: - globals()[backend] = __import__(backend) - 
else: - raise ValueError("Unsupported language detection backend") - # remove language-related stop functions - for attr in dir(stopfunc): - if attr.startswith("_") or not isinstance(getattr(stopfunc, attr), FunctionType): - continue - if re.match(r"lang_[a-z]{2}$", attr): - delattr(stopfunc, attr) - # rebind applicable language-related stop functions - if stopfunc.LANG_BACKEND: - _lb = stopfunc.LANG_BACKEND - if _lb == "langid": - langid.langid.load_model() - for lang in ( - langid.langid.identifier.nb_classes if _lb == "langid" else \ - list(set(p[:2] for p in os.listdir(langdetect.PROFILES_DIRECTORY))) if _lb == "langdetect" else \ - list(set(x[1][:2] for x in pycld2.LANGUAGES if x[0] in pycld2.DETECTED_LANGUAGES)) if _lb == "pycld2" else \ - stopfunc.CLD3_LANGUAGES if _lb == "cld3" else \ - stopfunc.TEXTBLOB_LANGUAGES if _lb == "textblob" else \ - []): - n = "lang_%s" % lang - setattr(stopfunc, n, _lang(lang)) - getattr(stopfunc, n).__name__ = getattr(stopfunc, n).__qualname__ = n - if LANG: - flng = "lang_%s" % LANG - if getattr(stopfunc, flng, None): - stopfunc.default = getattr(stopfunc, flng) -stopfunc._reload_lang = _load_lang_backend - - -def _validate(stop_function, lang_backend="none"): - s, lb = stop_function, lang_backend - if isinstance(s, string_types): - if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ - all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): - stopfunc._reload_lang(lb) - f = getattr(stopfunc, s, None) - if f: - return f - elif not isinstance(s, FunctionType): - raise ValueError("Bad stop function") - return s -stopfunc._validate = _validate - - -def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, result, found=(), - stop=True, show=False, scoring_heuristic=False, extended=False, debug=False): - """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. """ - if depth > min_depth and stop_func(input): - if not stop and (show or debug) and found not in result: - s = repr(input) - s = s[2:-1] if s.startswith("b'") and s.endswith("'") else s - s = "[+] %s: %s" % (", ".join(found), s) - print(s if len(s) <= 80 else s[:77] + "...") - result[found] = input - if depth >= max_depth or len(result) > 0 and stop: - return - prev_enc = found[-1] if len(found) > 0 else "" - e = encodings.get(depth, encodings.get(-1, [])) - for new_input, encoding in __rank(prev_input, input, prev_enc, e, scoring_heuristic, extended): - if len(result) > 0 and stop: - return - if debug: - print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) - __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, result, found + (encoding, ), - stop, show, scoring_heuristic, extended, debug) - - -def __make_encodings_dict(include, exclude): - """ Process encodings inclusion and exclusion lists, listing categories and developping codecs' lists of possible - encoding names. It also creates a cache with the CodecInfo objects for improving performance. """ - def _develop(d, keep=True): - d = d or {} - for k, v in d.items(): - l, cc, sc = [], [e for e in v if e in CODECS_CATEGORIES], [e for e in v if e not in CODECS_CATEGORIES] - # list from in-scope categories and then everything that is not a category - for enc in ((list_encodings(*cc) if (len(cc) > 0 or keep) and len(sc) == 0 else []) + sc): - g = [] - for e in (search(enc, False) or [enc]): - try: - ci = lookup(e, False) - g.extend(ci.parameters['guess']) - except: - pass - if enc in g: # e.g. 
"rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected - l.append(enc) - else: # e.g. "rot" => ["rot-1", "rot-2", ...] ; all the "rot-N" shall be selected - l.extend(g) - d[k] = list(set(l)) - return d - _excl, _incl = _develop(exclude, False), _develop(include) - return {k: [x for x in v if x not in _excl.get(k, [])] for k, v in _incl.items()} - - -def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extended=False, yield_score=False): - """ Filter valid encodings and rank them by relevance. """ - ranking = {} - for e in encodings: - try: - codec = CODECS_CACHE[e] - except KeyError: - try: - CODECS_CACHE[e] = codec = lookup(e, False) - except LookupError: - continue - t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended) - if t: - ranking[e] = t - for encoding, result in sorted(ranking.items(), key=lambda x: (-x[1][0], x[0])): - yield result if yield_score else result[1], encoding - - -class _Text(object): - __slots__ = ["entropy", "lcharset", "len", "padding", "printables", "text"] - - def __init__(self, text, pad_char=None): - self.text = ensure_str(text) - c = self.text[-1] - pad_char, last_char = (chr(pad_char), chr(c)) if isinstance(c, int) else (pad_char, c) - self.padding = pad_char is not None and last_char == pad_char - if self.padding: - text = text.rstrip(b(pad_char) if isinstance(text, bytes) else pad_char) - self.len = len(self.text) - self.lcharset = len(set(self.text)) - self.printables = float(len([c for c in self.text if c in printable])) / self.len - self.entropy = entropy(self.text) - - -def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, extended=False): - """ Score relevant encodings given an input. """ - obj = None - sc = codec.parameters.get('scoring', {}) - no_error, transitive = codec.parameters.get('no_error', False), sc.get('transitive', False) - # ignore encodings that fail to decode with their default errors handling value - try: - new_input = codec.decode(input)[0] - except: - return - # ignore encodings that give an output identical to the input (identity transformation) or to the previous input - if len(new_input) == 0 or prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input): - return - # ignore encodings that transitively give the same output (identity transformation by chaining twice a same - # codec (e.g. rot-15 is equivalent to rot-3 and rot-12 or rot-6 and rot-9) - if transitive and prev_encoding: - ci_prev = lookup(prev_encoding, False) - if ci_prev.parameters['name'] == codec.parameters['name']: - return - # compute input's characteristics only once and only if the control flow reaches this point - pad = sc.get('padding_char') - if obj is None: - obj = _Text(input, pad) - if heuristic: - # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. 
multiple base - # codecs) so that we can put the right one as early as possible and eventually exclude bad candidates - s = -sc.get('penalty', .0) - # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ; - # on the contrary, if the length of input text's charset is strictly greater, give a penalty - lcs = sc.get('len_charset', 256) - if isinstance(lcs, type(lambda: None)): - lcs = int(lcs(encoding)) - if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset: - s += max(.0, round(.6 * (.99 ** (lcs - obj.lcharset)), 5) - .1) - elif (pad and obj.padding and lcs + 1 < obj.lcharset) or lcs < obj.lcharset: - s -= .2 # this can occur for encodings with no_error set to True - # then, take padding into account, giving a bonus if padding is to be encountered and effectively present, - # or a penalty when it should not be encountered but it is present - if pad and obj.padding: - s += .2 # when padding is encountered while it is legitimate, it could be a good indication => bonus - elif not pad and obj.padding: - s -= .1 # it could arise a padding character is encountered while not being padding => small penalty - # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when - # lower only for codecs that DO NOT tolerate errors (otherwise, the printables rate can be biased) - if not no_error: - pr = sc.get('printables_rate', 0) - if isinstance(pr, type(lambda: None)): - pr = float(pr(obj.printables)) - if obj.printables - pr <= .05: - s += .1 - expf = sc.get('expansion_factor', 1.) - if expf: - f = obj.len / float(len(new_input)) # expansion while encoding => at decoding: 1/f - if isinstance(expf, type(lambda: None)): - try: # this case allows to consider the current encoding name from the current codec - expf = expf(f, encoding) - except TypeError: - expf = expf(f) - if isinstance(expf, (int, float)): - tmp = expf - expf = (1/f - .1 <= 1/expf <= 1/f + .1) - elif isinstance(expf, (tuple, list)) and len(expf) == 2: - expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] - s += [-1., .1][expf] - # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the - # number of input characters to take bad entropies of shorter strings into account - entr = sc.get('entropy', lambda e: e) - entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr - if isinstance(entr, type(lambda: None)): - try: # this case allows to consider the current encoding name from the current codec - entr = entr(obj.entropy, encoding) - except TypeError: - entr = entr(obj.entropy) - if entr is not None: - # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (256,.2) and (512,1) - d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - obj.entropy) - if d_entr <= .5: - s += .5 - d_entr - # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) - bonus = sc.get('bonus_func') - if bonus is not None: - if isinstance(bonus, type(lambda: None)): - bonus = bonus(obj, codec, encoding) - if bonus: - s += .2 - else: - s = 1. 
- # exclude negative (and eventually null) scores as they are (hopefully) not relevant - if extended and s >= .0 or not extended and s > .0: - return s, new_input - - -def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, include=None, exclude=None, found=(), - stop=True, show=False, scoring_heuristic=True, extended=False, debug=False): - """ Try decoding without the knowledge of the encoding(s). - - :param input: input text to be guessed - :param stop_func: function defining the stop condition - :param min_depth: minimum search depth - :param max_depth: maximum search depth - ;param include: inclusion item OR list with category, codec or encoding names OR dictionary with lists per - depth (nothing means include every encoding) - :param exclude: exclusion item OR list with category, codec or encoding names OR dictionary with lists per - depth (nothing means exclude no encoding) - :param found: tuple of already found encodings - :param stop: whether to stop or not when a valid solution is found - :param show: whether to immediately show once a solution is found - :param scoring_heuristic: whether to apply the scoring heuristic during the search (if disabled, all scores are 1., - meaning that every non-failing encoding will be considered with no order of precedence) - :param extended: whether to also consider null scores with the heuristic - :param debug: whether to show each attempt at each depth during computation - """ - if len(input) == 0: - return "" - # check for min and max depths - if max_depth <= 0: - raise ValueError("Depth must be a non-null positive integer") - if min_depth > max_depth: - raise ValueError("Min depth shall be less than or equal to the max depth") - # take the tuple of found encodings into account - if len(found) > 0: - for encoding in found: - input = decode(input, encoding) - # handle the stop function as a regex if a string was given - if isinstance(stop_func, string_types): - stop_func = stopfunc.regex(stop_func) - # reformat include and exclude arguments ; supported formats: - for n, l in zip(["inc", "exc"], [include, exclude]): - if l is None: - if n == "inc": - include = l = {-1: CODECS_CATEGORIES} - else: - exclude = l = {} - # "category" OR "enc_name" OR whatever => means a single item for all depths - if isinstance(l, string_types): - if n == "inc": - include = l = {-1: [l]} - else: - exclude = l = {-1: [l]} - # ["enc_name1", "enc_name2", ...] => means for all depths - if isinstance(l, (list, tuple)): - if n == "inc": - include = l = {-1: l} - else: - exclude = l = {-1: l} - # {-1: [...], 2: [...], ...} => means prefedined depths with their lists of in-/excluded encodings - if not isinstance(l, dict) or not all(isinstance(k, int) for k in l.keys()): - raise ValueError("Include argument shall be a list or a dictionary with integer keys") - # precompute encodings lists per depth and cache the related CodecInfo objects - encodings, result = __make_encodings_dict(include, exclude), {} - try: - # breadth-first search - for d in range(max_depth): - __guess("", input, stop_func, 0, d+1, min_depth, encodings, result, tuple(found), stop, show, - scoring_heuristic, extended, debug) - if stop and len(result) > 0: - break - except KeyboardInterrupt: - pass - CODECS_CACHE = {} - return result -codecs.guess = guess - - -def rank(input, extended=False, limit=-1, include=None, exclude=None): - """ Rank the most probable encodings based on the given input. 
- - :param input: input text to be evaluated - :param extended: whether to consider null scores too (NB: negative scores are not output !) - :param limit: number of encodings to be returned (-1 means all of them) - :param include: inclusion list with category, codec or encoding names (nothing means include every encoding) - :param exclude: exclusion list with category, codec or encoding names (nothing means exclude no encoding) - """ - encodings = __make_encodings_dict(include if isinstance(include, dict) else {-1: include or CODECS_CATEGORIES}, - exclude if isinstance(exclude, dict) else {-1: exclude or []}) - r = list(__rank(None, input, "", encodings[-1], True, extended, True)) - return r[:limit] if len(r) > 1 else r -codecs.rank = rank - +# -*- coding: UTF-8 -*- +import _codecs +import codecs +import hashlib +import json +import os +import random +import re +import sre_parse +import sys +from encodings.aliases import aliases as ALIASES +from functools import reduce, update_wrapper, wraps +from importlib import import_module +from inspect import currentframe +from io import BytesIO +from itertools import chain, product +from locale import getlocale +from math import log +from pkgutil import iter_modules +from platform import system +from random import randint +from string import * +from types import FunctionType, ModuleType +try: # Python2 + import __builtin__ as builtins +except ImportError: + import builtins +try: # Python2 + from inspect import getfullargspec +except ImportError: + from inspect import getargspec as getfullargspec +try: # Python2 + from string import maketrans +except ImportError: + maketrans = str.maketrans +try: # Python3 + from importlib import reload +except ImportError: + pass + +# from Python 3.11, it seems that 'sre_parse' is not bound to 're' anymore +re.sre_parse = sre_parse + + +__all__ = ["add", "add_macro", "add_map", "b", "clear", "codecs", "decode", "encode", "ensure_str", "examples", "guess", + "isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "hashlib", "i2s", + "is_native", "list_categories", "list_encodings", "list_macros", "lookup", "maketrans", "os", "rank", "re", + "register", "remove", "reset", "s2i", "search", "stopfunc", "BytesIO", "_input", "_stripl", "CodecMacro", + "DARWIN", "LANG", "LINUX", "MASKS", "UNIX", "WINDOWS"] +CODECS_REGISTRY = None +CODECS_OVERWRITTEN = [] +CODECS_CATEGORIES = ["native", "custom"] +CODECS_CACHE = {} +LANG = getlocale() +if LANG: + LANG = (LANG[0] or "")[:2].lower() +MASKS = { + 'a': printable, + 'b': "".join(chr(i) for i in range(256)), + 'd': digits, + 'h': digits + "abcdef", + 'H': digits + "ABCDEF", + 'l': ascii_lowercase, + 'p': punctuation, + 's': " ", + 'u': ascii_uppercase, +} + +__codecs_registry = [] + +MACROS = {} +PERS_MACROS = {} +PERS_MACROS_FILE = os.path.expanduser("~/.codext-macros.json") + +DARWIN = system() == "Darwin" +LINUX = system() == "Linux" +UNIX = DARWIN or LINUX +WINDOWS = system() == "Windows" + +entropy = lambda s: -sum([p * log(p, 2) for p in [float(s.count(c)) / len(s) for c in set(s)]]) + +isb = lambda s: isinstance(s, bytes) +iss = lambda s: isinstance(s, str) +fix = lambda x, ref: b(x) if isb(ref) else ensure_str(x) if iss(ref) else x + +s2i = lambda s: int(codecs.encode(s, "base16"), 16) +exc_name = lambda e: "".join(t.capitalize() for t in re.split(r"[-_+]", e)) + + +def i2s(input): + h = hex(input)[2:].rstrip("eL") + return codecs.decode(h.zfill(len(h) + len(h) % 2), "hex") + + +class CodecMacro(tuple): + """Macro details when looking up the codec 
registry. """
+    def __new__(cls, name):
+        self = tuple.__new__(cls)
+        self.name = name
+        # get from personal macros first
+        try:
+            self.codecs = PERS_MACROS[name]
+        except KeyError:
+            try:
+                self.codecs = MACROS[name]
+            except KeyError:
+                raise LookupError("unknown macro: %s" % name)
+        if not isinstance(self.codecs, (tuple, list)):
+            raise ValueError("bad macro list: %s" % str(self.codecs))
+        self.codecs = [lookup(e, False) for e in self.codecs]  # lookup(e, False)
+        self.parameters = {'name': name, 'category': "macro"}  # ^ means that macros won't be nestable
+        # test examples to check that the chain of encodings works
+        for action, examples in (self.codecs[0].parameters.get('examples', {}) or {'enc-dec(': ["T3st str!"]}).items():
+            if re.match(r"enc(-dec)?\(", action):
+                for e in (examples.keys() if action.startswith("enc(") else examples or []):
+                    rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e)
+                    if rd:
+                        for n in (rd.group(2) or "512").split(","):
+                            s = "".join(chr(randint(0, 255)) for i in range(int(n)))
+                            self.encode(s.lower() if rd.group(1) else s)
+                        continue
+                    self.encode(e)
+
+        class Codec:
+            decode = self.decode
+            encode = self.encode
+
+        class IncrementalEncoder(codecs.IncrementalEncoder):
+            def encode(self, input, final=False):
+                return b(self.encode(input, self.errors)[0])
+        self.incrementalencoder = IncrementalEncoder
+
+        class IncrementalDecoder(codecs.IncrementalDecoder):
+            def decode(self, input, final=False):
+                return ensure_str(self.decode(input, self.errors)[0])
+        self.incrementaldecoder = IncrementalDecoder
+
+        class StreamWriter(Codec, codecs.StreamWriter):
+            charbuffertype = bytes
+        self.streamwriter = StreamWriter
+
+        class StreamReader(Codec, codecs.StreamReader):
+            charbuffertype = bytes
+        self.streamreader = StreamReader
+
+        return self
+
+    def decode(self, input, error="strict"):
+        """ Decode with each codec in reverse order. """
+        for ci in self.codecs[::-1]:
+            input, l = ci.decode(input, error)
+        return input, l
+
+    def encode(self, input, error="strict"):
+        """ Encode with each codec. """
+        for ci in self.codecs:
+            input, l = ci.encode(input, error)
+        return input, l
+
+    def __repr__(self):
+        return "<codext.CodecMacro object for encoding '%s' at 0x%x>" % (self.name, id(self))
+
+
+# inspired from: https://stackoverflow.com/questions/10875442/possible-to-change-a-functions-repr-in-python
+class Repr(object):
+    def __init__(self, name, func):
+        self.__name = name
+        self.__func = func
+        update_wrapper(self, func)
+
+    def __call__(self, *args, **kwargs):
+        return self.__func(*args, **kwargs)
+
+    def __repr__(self):
+        return "<function %s at 0x%x>" % (self.__name, id(self))
+
+
+def __stdin_pipe():
+    """ Stdin pipe read function. 
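+
+        NB: file descriptor 0 is first opened in binary mode so that raw bytes are read from stdin ; when reopening
+         the descriptor is not supported (TypeError), it falls back to iterating over the text-mode sys.stdin.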
""" + try: + with open(0, 'rb') as f: + for l in f: + yield l + except TypeError: + for l in sys.stdin: + yield l + + +def _input(infile): + # handle input file or stdin + c = b("") + if infile: + with open(infile, 'rb') as f: + c = f.read() + else: + for line in __stdin_pipe(): + c += line + return c + + +def _set_exc(name, etype="ValueError"): + if not hasattr(builtins, name): + exec("class %s(%s): __module__ = 'builtins'" % (name, etype)) + setattr(builtins, name, locals()[name]) +_set_exc("InputSizeLimitError") +_set_exc("ParameterError") + + +def _stripl(s, st_lines, st_crlf): + if st_crlf: + s = s.replace(b"\r\n", b"") if isb(s) else s.replace("\r\n", "") + if st_lines: + s = s.replace(b"\n", b"") if isb(s) else s.replace("\n", "") + return s + + +def _with_repr(name): + def _wrapper(f): + return Repr(name, f) + return _wrapper + + +def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False, **kwargs): + """ This adds a new codec to the codecs module setting its encode and/or decode functions, eventually dynamically + naming the encoding with a pattern and with file handling. + + :param ename: encoding name + :param encode: encoding function or None + :param decode: decoding function or None + :param pattern: pattern for dynamically naming the encoding + :param text: specify whether the codec is a text encoding + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the built-in open(...) but will make it impossible + to remove the codec later + """ + remove(ename) + if encode: + if not isinstance(encode, FunctionType): + raise ValueError("Bad 'encode' function") + _set_exc("%sEncodeError" % exc_name(ename)) # create the custom encode exception as a builtin + if decode: + if not isinstance(decode, FunctionType): + raise ValueError("Bad 'decode' function") + _set_exc("%sDecodeError" % exc_name(ename)) # create the custom decode exception as a builtin + if not encode and not decode: + raise ValueError("At least one en/decoding function must be defined") + for exc in kwargs.get('extra_exceptions', []): + _set_exc(exc) # create additional custom exceptions as builtins + glob = currentframe().f_back.f_globals + # search function for the new encoding + @_with_repr(ename) + def getregentry(encoding): + if encoding != ename and not (pattern and re.match(pattern, encoding)): + return + fenc, fdec, name = encode, decode, encoding + # prepare CodecInfo input arguments + if pattern: + m, args, i = re.match(pattern, encoding), [], 1 + try: + while True: + try: + g = m.group(i) or "" + if g.isdigit() and not g.startswith("0") and "".join(set(g)) != "01": + g = int(g) + args += [g] + i += 1 + except AttributeError: + # this occurs when m is None or there is an error in fenc(g) or fdec(g), meaning no match + if m is not None: + raise + return + except IndexError: + # this occurs while m is not None, but possibly no capture group that gives at least 1 group index ; + # in this case, if fenc/fdec is a decorated function, execute it with no arg + if len(args) == 0: + if fenc and len(getfullargspec(fenc).args) == 1: + fenc = fenc() + if fdec and len(getfullargspec(fdec).args) == 1: + fdec = fdec() + else: + fenc = fenc(*args) if fenc else fenc + fdec = fdec(*args) if fdec else fdec + if fenc: + fenc = fix_inout_formats(fenc) + if fdec: + fdec = fix_inout_formats(fdec) + sl, sc = kwargs.pop('strip_lines', False), kwargs.pop('strip_crlf', False) + if sl or sc: + def _striplines(f): + def __wrapper(input, *a, **kw): 
+                    return f(_stripl(input, sl, sc), *a, **kw)
+                return __wrapper
+            # this fixes issues with wrapped encoded inputs
+            fdec = _striplines(fdec)
+
+        class Codec(codecs.Codec):
+            def encode(self, input, errors="strict"):
+                if fenc is None:
+                    raise NotImplementedError
+                return fenc(input, errors)
+
+            def decode(self, input, errors="strict"):
+                if fdec is None:
+                    raise NotImplementedError
+                return fdec(input, errors)
+
+        class IncrementalEncoder(codecs.IncrementalEncoder):
+            def encode(self, input, final=False):
+                if fenc is None:
+                    raise NotImplementedError
+                return b(fenc(input, self.errors)[0])
+
+        class IncrementalDecoder(codecs.IncrementalDecoder):
+            def decode(self, input, final=False):
+                if fdec is None:
+                    raise NotImplementedError
+                return ensure_str(fdec(input, self.errors)[0])
+
+        class StreamWriter(Codec, codecs.StreamWriter):
+            charbuffertype = bytes
+
+        class StreamReader(Codec, codecs.StreamReader):
+            charbuffertype = bytes
+
+        ci = codecs.CodecInfo(
+            name=name,
+            encode=Codec().encode,
+            decode=Codec().decode,
+            incrementalencoder=IncrementalEncoder,
+            incrementaldecoder=IncrementalDecoder,
+            streamwriter=StreamWriter,
+            streamreader=StreamReader,
+            _is_text_encoding=text,
+        )
+        ci.parameters = kwargs
+        ci.parameters['name'] = ename
+        ci.parameters['add_to_codecs'] = add_to_codecs
+        ci.parameters['pattern'] = pattern
+        ci.parameters['text'] = text
+        f = glob.get('__file__', os.path.join("custom", "_"))
+        cat = f.split(os.path.sep)[-2].rstrip("s")
+        if cat not in CODECS_CATEGORIES:
+            CODECS_CATEGORIES.append(cat)
+        ci.parameters['category'] = kwargs.get('category', cat)
+        ci.parameters['examples'] = kwargs.get('examples', glob.get('__examples__'))
+        ci.parameters['guess'] = kwargs.get('guess', glob.get('__guess__', [ename])) or []
+        ci.parameters['module'] = kwargs.get('module', glob.get('__name__'))
+        ci.parameters.setdefault("scoring", {})
+        for attr in ["bonus_func", "entropy", "expansion_factor", "len_charset", "penalty", "printables_rate",
+                     "padding_char", "transitive"]:
+            a = kwargs.pop(attr, None)
+            if a is not None:
+                ci.parameters['scoring'][attr] = a
+        return ci
+
+    getregentry.__name__ = re.sub(r"[\s\-]", "_", ename)
+    if kwargs.get('aliases'):
+        getregentry.__aliases__ = list(map(lambda n: re.sub(r"[\s\-]", "_", n), kwargs['aliases']))
+    getregentry.__pattern__ = pattern
+    register(getregentry, add_to_codecs)
+    return getregentry
+
+
+def add_macro(mname, *encodings):
+    """ This allows defining a macro, chaining multiple codecs one after the other. It relies on a default set of
+         macros from a JSON file embedded in the package and on a local JSON file from the home folder that takes
+         precedence for defining personal macros.
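+
+        For instance (illustrative ; assuming both chained codecs are registered):
+
+          add_macro("gzip-b64", "gzip", "base64")   # encode(X, "gzip-b64") == encode(encode(X, "gzip"), "base64")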
+
+    :param mname: macro name
+    :param encodings: encoding names of the encodings to be chained with the macro
+    """
+    global PERS_MACROS
+    # check for name clash with already existing macros and codecs
+    if mname in MACROS or mname in PERS_MACROS:
+        raise ValueError("Macro name already exists")
+    try:
+        ci = lookup(mname, False)
+        raise ValueError("Macro name clashes with codec '%s'" % ci.name)
+    except LookupError:
+        pass
+    try:
+        PERS_MACROS[mname] = encodings
+        CodecMacro(mname)
+        with open(PERS_MACROS_FILE, 'w') as f:
+            json.dump(PERS_MACROS, f, indent=2)
+    except ValueError:
+        del PERS_MACROS[mname]
+        raise
+codecs.add_macro = add_macro
+
+
+def add_map(ename, encmap, repl_char="?", sep="", ignore_case=None, no_error=False, intype=None, outype=None, **kwargs):
+    """ This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs
+         module dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with
+         a pattern and with file handling (if text is True).
+
+    :param ename: encoding name
+    :param encmap: characters encoding map ; can be a dictionary of encoding maps (for use with the first capture
+                    group of the regex pattern) or a function building the encoding map
+    :param repl_char: replacement char (used when errors handling is set to "replace")
+    :param sep: string of possible character separators (hence, only single-char separators are considered) ;
+                 - while encoding, the first separator is used
+                 - while decoding, separators can be mixed in the input text
+    :param ignore_case: ignore text case while encoding and/or decoding
+    :param no_error: this encoding triggers no error (hence, always in "leave" errors handling)
+    :param intype: specify the input type for pre-transforming the input text
+    :param outype: specify the output type for post-transforming the output text
+    :param pattern: pattern for dynamically naming the encoding
+    :param text: specify whether the codec is a text encoding
+    :param add_to_codecs: also add the search function to the native registry
+                          NB: this will make the codec available in the built-in open(...) but will make it impossible
+                               to remove the codec later
+    """
+    outype = outype or intype
+    if ignore_case not in [None, "encode", "decode", "both"]:
+        raise ValueError("Bad ignore_case parameter while creating encoding map")
+    if intype not in [None, "str", "bin", "ord"]:
+        raise ValueError("Bad input type parameter while creating encoding map")
+    if outype not in [None, "str", "bin", "ord"]:
+        raise ValueError("Bad output type parameter while creating encoding map")
+
+    def __generic_code(decode=False):
+        def _wrapper(param):
+            """ The parameter for wrapping comes from the encoding regex pattern ; e.g.
+                 [no pattern] => param will be None every time
+                 r"barbie[-_]?([1-4])$" => param could be int 1, 2, 3 or 4
+                 r"^morse(|[-_]?.{3})$" => param could be None, "-ABC" (for mapping to ".-/")
+
+                In order of precedence:
+                1. when param is a key in mapdict or mapdict is a list of encoding maps (hence in the case of "barbie...",
+                    param MUST be an int, otherwise for the first case it could clash with a character of the encoding map)
+                2. 
otherwise handle it as a new encoding character map "ABC" translates to ".-/" for morse + """ + p = param + if isinstance(encmap, FunctionType): + mapdict = encmap(p) + p = None + else: + mapdict = encmap + if isinstance(mapdict, dict): + smapdict = {k: v for k, v in mapdict.items()} + elif isinstance(mapdict, list) and isinstance(mapdict[0], dict): + smapdict = {k: v for k, v in mapdict[0].items()} + else: + raise ValueError("Bad mapping dictionary or list of mapping dictionaries") + if p is not None: + # case 1: param is empty string + if p == "": + if isinstance(mapdict, list): + smapdict = {k: v for k, v in mapdict[0].items()} + elif isinstance(mapdict, dict): + if '' in mapdict.keys() and isinstance(mapdict[''], dict): + smapdict = {k: v for k, v in mapdict[''].items()} + else: + smapdict = {k: v for k, v in mapdict.items()} + # no 'else' handling a LookupError here ; this case is covered by the first if/elif/else block + # case 2: list or dictionary or dictionary of numbered encodings + elif isinstance(p, int): + # if mapdict is a list, we shall align the parameter (starting from 1) as an index (starting from 0) + if isinstance(mapdict, list): + p -= 1 + if isinstance(mapdict, list) and 0 <= p < len(mapdict) or \ + isinstance(mapdict, dict) and p in mapdict.keys(): + smapdict = {k: v for k, v in mapdict[p].items()} + else: + raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) + # case 3: dictionary of regex-selected encoding mappings + elif isinstance(mapdict, dict) and isinstance(list(mapdict.values())[0], dict): + tmp = None + for r, d in mapdict.items(): + if r == '': # this is already handled in case 1 ; anyway, an empty regex always matches, hence + continue # it must be excluded + if re.match(r, p): + tmp = d + break + if tmp is None: + raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) + smapdict = tmp + # case 4: encoding characters translation + else: + # collect base tokens in order of appearance in the mapping dictionary + base_tokens = "" + for _, c in sorted(mapdict.items()): + for t in c: + for st in t: + if st not in base_tokens: + base_tokens += st + if " " not in sep: + base_tokens = base_tokens.replace(" ", "") + if len(p) > 0 and p[0] in "-_" and len(p[1:]) == len(set(p[1:])) == len(base_tokens): + p = p[1:] + if len(p) == len(set(p)) == len(base_tokens): + t = maketrans(base_tokens, p) + for k, v in smapdict.items(): + smapdict[k] = [x.translate(t) for x in v] if isinstance(v, list) else v.translate(t) + else: + raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) + if ignore_case is not None: + cases = ["upper", "lower"] + case_d = cases[any(c in str(list(smapdict.values())) for c in "abcdefghijklmnopqrstuvwxyz")] + case_e = cases[any(c in str(list(smapdict.keys())) for c in "abcdefghijklmnopqrstuvwxyz")] + i = ignore_case + smapdict = {getattr(k, case_e)() if i in ["both", "encode"] else k: \ + ([getattr(x, case_d)() for x in v] if isinstance(v, list) else getattr(v, case_d)()) \ + if i in ["both", "decode"] else v for k, v in smapdict.items()} + if decode: + tmp = {} + # this has a meaning for encoding maps that could have clashes in encoded chars (e.g. 
Bacon's cipher ; + # I => abaaa but also J => abaaa, with the following, we keep I instead of letting J overwrite it) + for k, v in sorted(smapdict.items()): + if not isinstance(v, list): + v = [v] + for x in v: + if x not in tmp.keys(): + tmp[x] = k + smapdict, cs = tmp, reduce(lambda acc, x: acc + x, tmp.keys()) + kwargs['strip_lines'], kwargs['strip_crlf'] = "\n" not in set(cs), "\r\n" not in cs + # this allows to avoid an error with Python2 in the "for i, c in enumerate(parts)" loop + if '' not in smapdict.keys(): + smapdict[''] = "" + # determine token and result lengths + tmaxlen = max(map(len, smapdict.keys())) + tminlen = max(1, min(map(len, set(smapdict.keys()) - {''}))) + l = [] + for x in smapdict.values(): + getattr(l, ["append", "extend"][isinstance(x, list)])(x) + rminlen = max(1, min(map(len, set(l) - {''}))) + + # generic encoding/decoding function for map encodings + def code(text, errors="strict"): + icase = ignore_case == "both" or \ + decode and ignore_case == "decode" or \ + not decode and ignore_case == "encode" + if icase: + case = case_d if decode else case_e + if no_error: + errors = "leave" + text = ensure_str(text) + if not decode: + if intype == "bin": + text = "".join("{:0>8}".format(bin(ord(c))[2:]) for c in text) + elif intype == "ord": + text = "".join(str(ord(c)).zfill(3) for c in text) + r = "" + lsep = "" if decode else sep if len(sep) <= 1 else sep[0] + kind = ["character", "token"][tmaxlen > 1] + error_func = handle_error(ename, errors, lsep, repl_char, rminlen, decode, kind) + + # get the value from the mapping dictionary, trying the token with its inverted case if relevant + def __get_value(token, position, case_changed=False): + try: + result = smapdict[token] + except KeyError: + if icase and not case_changed: + token_inv_case = getattr(token, case)() + return __get_value(token_inv_case, position, True) + return error_func(token, position) + if isinstance(result, list): + result = result[0] + return result + lsep + + # if a separator is defined, rely on it by splitting the input text + if decode and len(sep) > 0: + for i, c in enumerate(re.split("[" + sep + "]", text)): + r += __get_value(c, i) + # otherwise, move through the text using a cursor for tokenizing it ; this allows defining more complex + # encodings with variable token lengths + else: + cursor, bad = 0, "" + while cursor < len(text): + token = text[cursor:cursor+1] + for l in range(tminlen, tmaxlen + 1): + token = text[cursor:cursor+l] + if token in smapdict.keys() or icase and getattr(token, case)() in smapdict.keys(): + r += __get_value(token, cursor) + cursor += l + break + else: + # collect bad chars and only move the cursor one char to the right + bad += text[cursor] + cursor += 1 + # if the number of bad chars is the minimum token length, consume it and start a new buffer + if len(bad) == tminlen or errors == "leave": + posn = cursor - len(bad) + r += error_func(bad, posn) + bad = "" + if decode: + if outype in ["bin", "ord"]: + tmp, r = "", r.replace(lsep, "") + step = [3, 8][outype == "bin"] + for i in range(0, len(r), step): + s = r[i:i+step] + try: + tmp += chr(int(s, 2) if outype == "bin" else int(s)) + except ValueError: + if len(s) > 0: + tmp += "[" + s + "]" + r = tmp + lsep + return r[:len(r)-len(lsep)], len(b(text)) + return code + if re.search(r"\([^(?:)]", kwargs.get('pattern', "")) is None: + # in this case, there is no capturing group for parametrization + return _wrapper(None) + return _wrapper + + glob = currentframe().f_back.f_globals + kwargs['category'] = 
glob['__file__'].split(os.path.sep)[-2].rstrip("s") + kwargs['examples'] = kwargs.get('examples', glob.get('__examples__')) + kwargs['encmap'] = encmap + kwargs['repl_char'] = repl_char + kwargs['sep'] = sep + kwargs['ignore_case'] = ignore_case + kwargs['no_error'] = no_error + kwargs['intype'] = intype + kwargs['outype'] = outype + kwargs['module'] = glob.get('__name__') + try: + if isinstance(encmap, dict): + smapdict = {k: v for k, v in encmap.items()} + elif isinstance(encmap, list) and isinstance(encmap[0], dict): + smapdict = {k: v for k, v in encmap[0].items()} + kwargs['repl_minlen'] = i = max(1, min(map(len, set(smapdict.values()) - {''}))) + kwargs['repl_minlen_b'] = max(1, min(map(len, map(b, set(smapdict.values()) - {''})))) + except: + pass + return add(ename, __generic_code(), __generic_code(True), **kwargs) +codecs.add_map = add_map + + +def clear(): + """ Clear codext's local registry of search functions. """ + global __codecs_registry, MACROS, PERS_MACROS + __codecs_registry, MACROS, PERS_MACROS = [], {}, {} +codecs.clear = clear + + +def examples(encoding, number=10): + """ Use the search function to get the matching encodings and provide examples of valid encoding names. """ + e = [] + for name in search(encoding): + for search_function in __codecs_registry: + n = search_function.__name__ + if name in [n, n.replace("_", "-")]: + temp = [] + for s in generate_strings_from_regex(search_function.__pattern__, yield_max=16*number): + temp.append(s) + random.shuffle(temp) + i = 0 + while i < min(number, len(temp)): + if not temp[i].isdigit(): + try: + lookup(temp[i], False) + e.append(temp[i]) + except LookupError: + pass + i += 1 + for alias, codec in ALIASES.items(): + if name == codec: + if codec not in e: + e.append(codec) + if not alias.isdigit(): + e.append(alias) + random.shuffle(e) + return sorted([e[i] for i in range(min(number, len(e)))], key=_human_keys) +codecs.examples = examples + + +def is_native(encoding): + """ Determine if a given encoding is native or not. """ + return lookup(encoding, False).parameters['category'] == "native" + + +def list_categories(): + """ Get a list of all codec categories. """ + c = CODECS_CATEGORIES + root = os.path.dirname(__file__) + for d in os.listdir(root): + if os.path.isdir(os.path.join(root, d)) and not d.startswith("__"): + c.append(d.rstrip("s")) + # particular category, hardcoded from base/_base.py + c += ["base-generic"] + return c +list_categories() + + +def list_encodings(*categories): + """ Get a list of all codecs. 
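+
+        Categories filter the result ; "non-native" expands to every category but "native", and a "~"-prefixed
+         category gets excluded, e.g. (illustrative) list_encodings("base", "~native").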
""" + # if "non-native" is in the input list, extend the list with the whole categories but "native" + categories, exclude = list(categories), [] + for c in categories[:]: + if c == "non-native": + for c in CODECS_CATEGORIES: + if c == "native" or c in categories: + continue + categories.append(c) + categories.remove("non-native") + if c.startswith("~"): + exclude.append(c[1:]) + categories.remove(c) + try: + categories.remove(c[1:]) + except ValueError: + pass + # now, filter codecs according to the input list of categories + enc = [] + if (len(categories) == 0 or "native" in categories) and "native" not in exclude: + for a in set(ALIASES.values()): + try: + ci = __orig_lookup(a) + except LookupError: + continue + if lookup(a) is ci: + enc.append(ci.name) + for search_function in CODECS_OVERWRITTEN + __codecs_registry: + name = search_function.__name__.replace("_", "-") + p = search_function.__pattern__ + ci = search_function(name) if p is None else search_function(generate_string_from_regex(p)) + c = "other" if ci is None else ci.parameters['category'] + if (len(categories) == 0 or c in categories) and c not in exclude: + enc.append(name) + for category in categories: + if category not in CODECS_CATEGORIES: + raise ValueError("Category '%s' does not exist" % category) + return sorted(list(set(enc)), key=_human_keys) + + +def list_macros(): + """ Get a list of all macros, with the precedence on personal ones. """ + return sorted(list(set(list(MACROS.keys()) + list(PERS_MACROS.keys())))) + + +def remove(name): + """ Remove all search functions matching the input encoding name from codext's local registry or any macro with the + given name. """ + global __codecs_registry, MACROS, PERS_MACROS + tbr = [] + for search_function in __codecs_registry: + if search_function(name) is not None: + tbr.append(search_function) + for search_function in tbr: + __codecs_registry.remove(search_function) + try: + del MACROS[name] + except KeyError: + pass + try: + del PERS_MACROS[name] + with open(PERS_MACROS_FILE, 'w') as f: + json.dump(PERS_MACROS, f, indent=2) + except KeyError: + pass + try: + del CODECS_CACHE[name] + except KeyError: + pass + for s in ["En", "De"]: + try: + delattr(builtins, "%s%scodeError" % (name.capitalize(), s)) + except AttributeError: + pass +codecs.remove = remove + + +def reset(): + """ Reset codext's local registry of search functions and macros. """ + global __codecs_registry, CODECS_REGISTRY, MACROS, PERS_MACROS + clear() + d = os.path.dirname(__file__) + for pkg in sorted(os.listdir(d)): + if pkg.startswith("_") or not os.path.isdir(os.path.join(d, pkg)): + continue + reload(import_module("codext." + pkg)) + # backup codext's registry + if CODECS_REGISTRY is None: + CODECS_REGISTRY = __codecs_registry[:] + # restore codext's registry + else: + __codecs_registry = CODECS_REGISTRY[:] + # restore codext's embedded set of macros + with open(os.path.join(os.path.dirname(__file__), "macros.json")) as f: + MACROS = json.load(f) + # reload personal set of macros + PERS_MACROS = {} + if os.path.exists(PERS_MACROS_FILE): + with open(PERS_MACROS_FILE) as f: + PERS_MACROS = json.load(f) +codecs.reset = reset + + +# conversion functions +def b(s): + """ Non-crashing bytes conversion function. """ + try: + return s.encode("latin-1") + except: + pass + try: + return s.encode("utf-8") + except: + pass + return s + + +def ensure_str(s, encoding='utf-8', errors='strict'): + """ Dummy str conversion function. 
""" + if isinstance(s, bytes): + try: + return s.decode(encoding, errors) + except: + return s.decode("latin-1") + return s + + +# make conversion functions compatible with input/output strings/bytes +def fix_inout_formats(f): + """ This decorator ensures that the first output of f will have the same text format as the first input (str or + bytes). """ + @wraps(f) + def _wrapper(*args, **kwargs): + a0 = args[0] + a0_isb = isb(a0) + a0 = ensure_str(a0) if iss(a0) or a0_isb else a0 + r = f(a0, *args[1:], **kwargs) + # special case: input is in bytes ; ensure that the returned length is this of the bytes, not this processed by + # the decode/encode function + if isinstance(r, (tuple, list)) and isinstance(r[1], int) and a0_isb: + r = tuple([list(r)[0]] + [len(args[0])] + list(r)[2:]) + return (fix(r[0], args[0]), ) + r[1:] if isinstance(r, (tuple, list)) else fix(r, args[0]) + return _wrapper + + +# alphabet generation function from a given mask +def get_alphabet_from_mask(mask): + """ This function generates an alphabet from the given mask. The style used is similar to Hashcat ; group keys are + marked with a heading "?". """ + i, alphabet = 0, "" + while i < len(mask): + c = mask[i] + if c == "?" and i < len(mask) - 1 and mask[i+1] in MASKS.keys(): + for c in MASKS[mask[i+1]]: + if c not in alphabet: + alphabet += c + i += 1 + elif c not in alphabet: + alphabet += c + i += 1 + return alphabet + + +# generic error handling function +def handle_error(ename, errors, sep="", repl_char="?", repl_minlen=1, decode=False, kind="character", item="position"): + """ This shortcut function allows to handle error modes given some tuning parameters. + + :param ename: encoding name + :param errors: error handling mode + :param sep: token separator + :param repl_char: replacement character (for use when errors="replace") + :param repl_minlen: repeat number for the replacement character + :param decode: whether we are encoding or decoding + :param item: position item description (for describing the error ; e.g. "group" or "token") + """ + exc = "%s%scodeError" % (exc_name(ename), ["En", "De"][decode]) + + def _handle_error(token, position, output="", eename=None): + """ This handles an encoding/decoding error according to the selected handling mode. + + :param token: input token to be encoded/decoded + :param position: token position index + :param output: output, as decoded up to the position of the error + """ + if errors == "strict": + msg = "'%s' codec can't %scode %s '%s' in %s %d" + token = ensure_str(token) + token = token[:7] + "..." if len(token) > 10 else token + err = getattr(builtins, exc)(msg % (eename or ename, ["en", "de"][decode], kind, token, item, position)) + err.output = output + err.__cause__ = err + raise err + elif errors == "leave": + return token + sep + elif errors == "replace": + return repl_char * repl_minlen + sep + elif errors == "ignore": + return "" + else: + raise ValueError("Unsupported error handling '{}'".format(errors)) + return _handle_error + + +# codecs module hooks +__orig_lookup = _codecs.lookup +__orig_register = _codecs.register + + +def __add(ename, encode=None, decode=None, pattern=None, text=True, **kwargs): + kwargs.pop('add_to_codecs', None) + return add(ename, encode, decode, pattern, text, True, **kwargs) +__add.__doc__ = add.__doc__ +codecs.add = __add + + +def decode(obj, encoding='utf-8', errors='strict'): + """ Custom decode function relying on the hooked lookup function. 
""" + return lookup(encoding).decode(obj, errors)[0] +codecs.decode = decode + + +def encode(obj, encoding='utf-8', errors='strict'): + """ Custom encode function relying on the hooked lookup function. """ + n, m = 1, re.search(r"\[(\d+)\]$", encoding) + if m: + n = int(m.group(1)) + encoding = re.sub(r"\[(\d+)\]$", "", encoding) + ci = lookup(encoding) + for i in range(n): + try: + obj = ci.encode(obj, errors)[0] + except (AttributeError, TypeError) as e: # occurs for encodings that require str as input while 'obj' is bytes + if str(e) not in ["'bytes' object has no attribute 'encode'", + "ord() expected string of length 1, but int found"] or \ + encoding in ["latin-1", "utf-8"]: # encodings considered when using b(...) + raise + obj = ci.encode(ensure_str(obj), errors)[0] + return obj +codecs.encode = encode + + +def lookup(encoding, macro=True): + """ Hooked lookup function for searching first for codecs in the local registry of this module. """ + # first, try to match the given encoding with codecs' search functions + for search_function in CODECS_OVERWRITTEN + __codecs_registry: + codecinfo = search_function(encoding) + if codecinfo is not None: + return codecinfo + # then, if a codec name was given, generate an encoding name from its pattern and get the CodecInfo + for search_function in CODECS_OVERWRITTEN + __codecs_registry: + if search_function.__name__.replace("_", "-") == encoding or \ + encoding in getattr(search_function, "__aliases__", []): + codecinfo = search_function(generate_string_from_regex(search_function.__pattern__)) + if codecinfo is not None: + return codecinfo + # finally, get a CodecInfo with the original lookup function and refine it with a dictionary of parameters + try: + ci = __orig_lookup(encoding) + ci.parameters = {'category': "native", 'module': "codecs", 'name': ALIASES.get(ci.name, ci.name)} + return ci + except LookupError: + if not macro: + raise + try: + return CodecMacro(encoding) + except LookupError: + e = LookupError("unknown encoding: %s" % encoding) + e.__cause__ = e # stop exception chaining + raise e +codecs.lookup = lookup + + +def register(search_function, add_to_codecs=False): + """ Register function for registering new codecs in the local registry of this module and, if required, in the + native codecs registry (for use with the built-in 'open' function). + + :param search_function: search function for the codecs registry + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the built-in open(...) but will make it impossible + to remove the codec later + """ + if search_function not in __codecs_registry: + try: + __orig_lookup(search_function.__name__) + l = CODECS_OVERWRITTEN + except LookupError: + l = __codecs_registry + l.append(search_function) + if add_to_codecs: + __orig_register(search_function) + + +def __register(search_function): + """ Same as register(...), but with add_to_codecs set by default to True. """ + register(search_function, True) +codecs.register = __register + + +def search(encoding_regex, extended=True): + """ Function similar to lookup but allows to search for an encoding based on a regex instead. It searches this way + into the local registry but also tries a simple lookup with the original lookup function. 
""" + matches = [] + for search_function in CODECS_OVERWRITTEN + __codecs_registry: + n = search_function.__name__ + for name in [n, n.replace("_", "-")]: + if re.search(encoding_regex, name): + matches.append(n.replace("_", "-")) + continue + if extended: + # in some cases, encoding_regex can match a generated string that uses a particular portion of its + # generating pattern ; e.g. we expect encoding_regex="uu_" to find "uu" and "uu_codec" while it can also + # find "morse" or "atbash" very rarely because of their dynamic patterns and the limited number of randomly + # generated strings + # so, we can use a qualified majority voting to ensure we do not get a "junk" encoding in the list of + # matches ; executing 5 times the string generation for a given codec but adding the codec to the list of + # matches only if we get at least 3 matches ensures that we consider up to 2 failures that could be + # stochastic, therefore drastically decreasing the probability to get a "junk" encoding in the matches list + c = 0 + for i in range(5): + for s in generate_strings_from_regex(search_function.__pattern__): + if re.search(encoding_regex, s): + c += 1 + break + if c >= 3: + matches.append(n) + break + for s, n in ALIASES.items(): + if re.search(encoding_regex, s) or re.search(encoding_regex, n): + matches.append(n) + return sorted(list(set(matches)), key=_human_keys) +codecs.search = search + + +# utility function for the search feature +CATEGORIES = { + 'digit': digits, + 'not_digit': reduce(lambda x, c: x.replace(c, ""), digits, printable), + 'space': whitespace, + 'not_space': reduce(lambda x, c: x.replace(c, ""), whitespace, printable), + 'word': ascii_letters + digits + '_', + 'not_word': reduce(lambda x, c: x.replace(c, ""), ascii_letters + digits + '_', printable), +} +REPEAT_MAX = 10 +STAR_PLUS_MAX = 10 +YIELD_MAX = 100 + + +def __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max, parsed=False): + """ Recursive function to generate strings from a regex pattern. 
""" + if regex is None: + return + __groups = {} + tokens = [] + negate, last_rand = False, None + for state in (regex if parsed else re.sre_parse.parse(b(getattr(regex, "pattern", regex)))): + code = getattr(state[0], "name", state[0]).lower() + value = getattr(state[1], "name", state[1]) + value = value.lower() if isinstance(value, str) else value + if code in ["assert_not", "at"]: + continue + elif code == "any": + charset = list(printable.replace("\n", "")) + while charset[0] == last_rand and len(charset) > 1: + random.shuffle(charset) + last_rand = charset[0] + tokens.append(charset) # should be ord(x) with x belongs to [0, 256[ + elif code == "assert": + tokens.append(list(__gen_str_from_re(value[1], star_plus_max, repeat_max, yield_max, True))) + elif code == "branch": + result = [] + for r in value[1]: + result += list(__gen_str_from_re(r, star_plus_max, repeat_max, yield_max, True)) or [""] + tokens.append(result) + elif code == "category": + charset = list(CATEGORIES[value[9:]]) + if negate: + negate = False + charset = list(set(printable).difference(charset)) + while charset[0] == last_rand and len(charset) > 1: + random.shuffle(charset) + last_rand = charset[0] + tokens.append(charset) + elif code == "groupref": + tokens.extend(__groups[value]) + elif code == "in": + subtokens = list(__gen_str_from_re(value, star_plus_max, repeat_max, yield_max, True)) + subtokens = [x for l in subtokens for x in l] + tokens.append(subtokens) + elif code == "literal": + tokens.append(chr(value)) + elif code in ["max_repeat", "min_repeat"]: + start, end = value[:2] + end = min(end, star_plus_max) + start = min(start, end) + charset = list(__gen_str_from_re(value[-1], star_plus_max, repeat_max, yield_max, True)) + subtokens = [] + if start == 0 and end == 1: + subtokens.append("") + subtokens.extend(charset) + elif len(charset) ** end > repeat_max: + for i in range(min(repeat_max, 10 * len(charset))): + n = random.randint(start, end + 1) + token = "" if n == 0 else "".join(random.choice(charset) for i in range(n)) + if token not in subtokens: + subtokens.append(token) + else: + i -= 1 + else: + for n in range(start, end + 1): + for c in product(charset, repeat=n): + subtokens.append("".join(c)) + tokens.append(subtokens) + elif code == "negate": + negate = True + elif code == "not_literal": + charset = list(printable.replace(chr(value), "")) + while charset[0] == last_rand and len(charset) > 1: + random.shuffle(charset) + last_rand = charset[0] + tokens.append(charset) + elif code == "range": + tokens.append("".join(chr(i) for i in range(value[0], value[1] + 1))) + elif code == "subpattern": + result = list(__gen_str_from_re(value[-1], star_plus_max, repeat_max, yield_max, True)) + if value[0]: + __groups[value[0]] = result + tokens.append(result) + else: + raise NotImplementedError("Unhandled code '{}'".format(code)) + if len(tokens) == 0: + tokens = [""] + i = 0 + for result in product(*tokens): + yield "".join(result) + i += 1 + if i >= yield_max: + break + + +def _human_keys(text): + """ Sorting function for considering strings with numbers (e.g. base2, base10, base100) """ + tokens = [] + for s in re.split(r"(\d+|\D+)", text): + tokens.append(int(s) if s.isdigit() else s) + return tokens + + +def generate_string_from_regex(regex): + """ Utility function to generate a single string from a regex pattern. 
""" + if regex: + return list(generate_strings_from_regex(regex, yield_max=1))[0] + + +def generate_strings_from_regex(regex, star_plus_max=STAR_PLUS_MAX, repeat_max=REPEAT_MAX, yield_max=YIELD_MAX): + """ Utility function to generate strings from a regex pattern. """ + i = 0 + for result in __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max): + yield result + + +# guess feature objects +__module_exists = lambda n: n in [x[1] for x in iter_modules()] +stopfunc = ModuleType("stopfunc", """ + Predefined stop functions + ~~~~~~~~~~~~~~~~~~~~~~~~~ + + This submodule contains stop functions for the guess feature of codext. + + - `flag`: searches for the pattern "[Ff][Ll1][Aa4@][Gg9]" (either UTF-8 or UTF-16) + - `lang_**`: checks if the given lang (any from the PROFILES_DIRECTORY of the langdetect module) is detected + - `printables`: checks that every output character is in the set of printables + - `regex`: takes one argument, the regular expression, for checking a string against the given pattern + - `text`: checks for printables and an entropy less than 4.6 (empirically determined) +""") +stopfunc.printables = lambda s: all(c in printable for c in ensure_str(s)) +stopfunc.printables.__name__ = stopfunc.printables.__qualname__ = "printables" +stopfunc.regex = lambda p: lambda s: re.search(p, ensure_str(s)) is not None +stopfunc.regex.__name__ = stopfunc.regex.__qualname__ = "regex" +stopfunc.text = lambda s: stopfunc.printables(s) and entropy(s) < 4.6 +stopfunc.text.__name__ = stopfunc.text.__qualname__ = "text" +stopfunc.flag = lambda x: re.search(r"[Ff][Ll1][Aa4@][Gg96]", ensure_str(x)) is not None +stopfunc.flag.__name__ = stopfunc.flag.__qualname__ = "flag" +stopfunc.default = stopfunc.text + +stopfunc.LANG_BACKEND = None +stopfunc.LANG_BACKENDS = [n for n in ["pycld2", "langdetect", "langid", "cld3", "textblob"] if __module_exists(n)] +if len(stopfunc.LANG_BACKENDS) > 0: + stopfunc.LANG_BACKEND = stopfunc.LANG_BACKENDS[0] +if "cld3" in stopfunc.LANG_BACKENDS: + stopfunc.CLD3_LANGUAGES = "af|am|ar|bg|bn|bs|ca|ce|co|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|fy|ga|gd|gl|gu|ha|" \ + "hi|hm|hr|ht|hu|hy|id|ig|is|it|iw|ja|jv|ka|kk|km|kn|ko|ku|ky|la|lb|lo|lt|lv|mg|mi|mk|" \ + "ml|mn|mr|ms|mt|my|ne|nl|no|ny|pa|pl|ps|pt|ro|ru|sd|si|sk|sl|sm|sn|so|sq|sr|st|su|sv|" \ + "sw|ta|te|tg|th|tr|uk|ur|uz|vi|xh|yi|yo|zh|zu".split("|") +if "textblob" in stopfunc.LANG_BACKENDS: + stopfunc.TEXTBLOB_LANGUAGES = "af|ar|az|be|bg|bn|ca|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|ga|gl|gu|hi|hr|ht|hu|" \ + "id|is|it|iw|ja|ka|kn|ko|la|lt|lv|mk|ms|mt|nl|no|pl|pt|ro|ru|sk|sl|sq|sr|sv|sw|ta|" \ + "te|th|tl|tr|uk|ur|vi|yi|zh".split("|") + + +def _detect(text): + _lb, t = stopfunc.LANG_BACKEND, ensure_str(text) + if _lb is None: + raise ValueError("No language backend %s" % ["selected", "installed"][len(stopfunc.LANG_BACKENDS) == 0]) + return langid.classify(t)[0] if _lb == "langid" else \ + langdetect.detect(t) if _lb == "langdetect" else \ + pycld2.detect(t)[2][0][1] if _lb == "pycld2" else \ + cld3.get_language(t).language[:2] if _lb == "cld3" else \ + textblob.TextBlob(t).detect_language()[:2] + + +def _lang(lang): + def _test(s): + if not stopfunc.text(s): + return False + try: + return _detect(ensure_str(s))[:2] == lang + except: + return False + return _test + + +def _load_lang_backend(backend=None): + # import the requested backend library if not imported yet + if backend is None or backend in stopfunc.LANG_BACKENDS: + stopfunc.LANG_BACKEND = backend + if backend: + globals()[backend] = __import__(backend) + 
+    else:
+        raise ValueError("Unsupported language detection backend")
+    # remove language-related stop functions
+    for attr in dir(stopfunc):
+        if attr.startswith("_") or not isinstance(getattr(stopfunc, attr), FunctionType):
+            continue
+        if re.match(r"lang_[a-z]{2}$", attr):
+            delattr(stopfunc, attr)
+    # rebind applicable language-related stop functions
+    if stopfunc.LANG_BACKEND:
+        _lb = stopfunc.LANG_BACKEND
+        if _lb == "langid":
+            langid.langid.load_model()
+        for lang in (
+            langid.langid.identifier.nb_classes if _lb == "langid" else \
+            list(set(p[:2] for p in os.listdir(langdetect.PROFILES_DIRECTORY))) if _lb == "langdetect" else \
+            list(set(x[1][:2] for x in pycld2.LANGUAGES if x[0] in pycld2.DETECTED_LANGUAGES)) if _lb == "pycld2" else \
+            stopfunc.CLD3_LANGUAGES if _lb == "cld3" else \
+            stopfunc.TEXTBLOB_LANGUAGES if _lb == "textblob" else \
+            []):
+            n = "lang_%s" % lang
+            setattr(stopfunc, n, _lang(lang))
+            getattr(stopfunc, n).__name__ = getattr(stopfunc, n).__qualname__ = n
+    if LANG:
+        flng = "lang_%s" % LANG
+        if getattr(stopfunc, flng, None):
+            stopfunc.default = getattr(stopfunc, flng)
+stopfunc._reload_lang = _load_lang_backend
+
+
+def _validate(stop_function, lang_backend="none"):
+    s, lb = stop_function, lang_backend
+    if isinstance(s, str):
+        if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \
+           all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)):
+            stopfunc._reload_lang(lb)
+        f = getattr(stopfunc, s, None)
+        if f:
+            return f
+    elif not isinstance(s, FunctionType):
+        raise ValueError("Bad stop function")
+    return s
+stopfunc._validate = _validate
+
+
+def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, result, found=(),
+            stop=True, show=False, scoring_heuristic=False, extended=False, debug=False):
+    """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. """
+    if depth > min_depth and stop_func(input):
+        if not stop and (show or debug) and found not in result:
+            s = repr(input)
+            s = s[2:-1] if s.startswith("b'") and s.endswith("'") else s
+            s = "[+] %s: %s" % (", ".join(found), s)
+            print(s if len(s) <= 80 else s[:77] + "...")
+        result[found] = input
+    if depth >= max_depth or len(result) > 0 and stop:
+        return
+    prev_enc = found[-1] if len(found) > 0 else ""
+    e = encodings.get(depth, encodings.get(-1, []))
+    for new_input, encoding in __rank(prev_input, input, prev_enc, e, scoring_heuristic, extended):
+        if len(result) > 0 and stop:
+            return
+        if debug:
+            print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding))
+        __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, result, found + (encoding, ),
+                stop, show, scoring_heuristic, extended, debug)
+
+
+def __make_encodings_dict(include, exclude):
+    """ Process encodings inclusion and exclusion lists, listing categories and developing codecs' lists of possible
+         encoding names. It also creates a cache with the CodecInfo objects for improving performance. """
+    def _develop(d, keep=True):
+        d = d or {}
+        for k, v in d.items():
+            l, cc, sc = [], [e for e in v if e in CODECS_CATEGORIES], [e for e in v if e not in CODECS_CATEGORIES]
+            # list from in-scope categories and then everything that is not a category
+            for enc in ((list_encodings(*cc) if (len(cc) > 0 or keep) and len(sc) == 0 else []) + sc):
+                g = []
+                for e in (search(enc, False) or [enc]):
+                    try:
+                        ci = lookup(e, False)
+                        g.extend(ci.parameters['guess'])
+                    except:
+                        pass
+                if enc in g:    # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected
"rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected + l.append(enc) + else: # e.g. "rot" => ["rot-1", "rot-2", ...] ; all the "rot-N" shall be selected + l.extend(g) + d[k] = list(set(l)) + return d + _excl, _incl = _develop(exclude, False), _develop(include) + return {k: [x for x in v if x not in _excl.get(k, [])] for k, v in _incl.items()} + + +def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extended=False, yield_score=False): + """ Filter valid encodings and rank them by relevance. """ + ranking = {} + for e in encodings: + try: + codec = CODECS_CACHE[e] + except KeyError: + try: + CODECS_CACHE[e] = codec = lookup(e, False) + except LookupError: + continue + t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended) + if t: + ranking[e] = t + for encoding, result in sorted(ranking.items(), key=lambda x: (-x[1][0], x[0])): + yield result if yield_score else result[1], encoding + + +class _Text(object): + __slots__ = ["entropy", "lcharset", "len", "padding", "printables", "text"] + + def __init__(self, text, pad_char=None): + self.text = ensure_str(text) + c = self.text[-1] + pad_char, last_char = (chr(pad_char), chr(c)) if isinstance(c, int) else (pad_char, c) + self.padding = pad_char is not None and last_char == pad_char + if self.padding: + text = text.rstrip(b(pad_char) if isinstance(text, bytes) else pad_char) + self.len = len(self.text) + self.lcharset = len(set(self.text)) + self.printables = float(len([c for c in self.text if c in printable])) / self.len + self.entropy = entropy(self.text) + + +def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, extended=False): + """ Score relevant encodings given an input. """ + obj = None + sc = codec.parameters.get('scoring', {}) + no_error, transitive = codec.parameters.get('no_error', False), sc.get('transitive', False) + # ignore encodings that fail to decode with their default errors handling value + try: + new_input = codec.decode(input)[0] + except: + return + # ignore encodings that give an output identical to the input (identity transformation) or to the previous input + if len(new_input) == 0 or prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input): + return + # ignore encodings that transitively give the same output (identity transformation by chaining twice a same + # codec (e.g. rot-15 is equivalent to rot-3 and rot-12 or rot-6 and rot-9) + if transitive and prev_encoding: + ci_prev = lookup(prev_encoding, False) + if ci_prev.parameters['name'] == codec.parameters['name']: + return + # compute input's characteristics only once and only if the control flow reaches this point + pad = sc.get('padding_char') + if obj is None: + obj = _Text(input, pad) + if heuristic: + # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. 
multiple base + # codecs) so that we can put the right one as early as possible and eventually exclude bad candidates + s = -sc.get('penalty', .0) + # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ; + # on the contrary, if the length of input text's charset is strictly greater, give a penalty + lcs = sc.get('len_charset', 256) + if isinstance(lcs, type(lambda: None)): + lcs = int(lcs(encoding)) + if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset: + s += max(.0, round(.6 * (.99 ** (lcs - obj.lcharset)), 5) - .1) + elif (pad and obj.padding and lcs + 1 < obj.lcharset) or lcs < obj.lcharset: + s -= .2 # this can occur for encodings with no_error set to True + # then, take padding into account, giving a bonus if padding is to be encountered and effectively present, + # or a penalty when it should not be encountered but it is present + if pad and obj.padding: + s += .2 # when padding is encountered while it is legitimate, it could be a good indication => bonus + elif not pad and obj.padding: + s -= .1 # it could arise a padding character is encountered while not being padding => small penalty + # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when + # lower only for codecs that DO NOT tolerate errors (otherwise, the printables rate can be biased) + if not no_error: + pr = sc.get('printables_rate', 0) + if isinstance(pr, type(lambda: None)): + pr = float(pr(obj.printables)) + if obj.printables - pr <= .05: + s += .1 + expf = sc.get('expansion_factor', 1.) + if expf: + f = obj.len / float(len(new_input)) # expansion while encoding => at decoding: 1/f + if isinstance(expf, type(lambda: None)): + try: # this case allows to consider the current encoding name from the current codec + expf = expf(f, encoding) + except TypeError: + expf = expf(f) + if isinstance(expf, (int, float)): + tmp = expf + expf = (1/f - .1 <= 1/expf <= 1/f + .1) + elif isinstance(expf, (tuple, list)) and len(expf) == 2: + expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] + s += [-1., .1][expf] + # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the + # number of input characters to take bad entropies of shorter strings into account + entr = sc.get('entropy', lambda e: e) + entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr + if isinstance(entr, type(lambda: None)): + try: # this case allows to consider the current encoding name from the current codec + entr = entr(obj.entropy, encoding) + except TypeError: + entr = entr(obj.entropy) + if entr is not None: + # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (256,.2) and (512,1) + d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - obj.entropy) + if d_entr <= .5: + s += .5 - d_entr + # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) + bonus = sc.get('bonus_func') + if bonus is not None: + if isinstance(bonus, type(lambda: None)): + bonus = bonus(obj, codec, encoding) + if bonus: + s += .2 + else: + s = 1. 
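    # [editor's illustration, not part of the original patch] a sketch of how this heuristic surfaces in
    #  practice, assuming codext is installed ; a base32 input should rank its own codec near the top thanks
    #  to the charset-length and padding bonuses computed above:
    #   >>> import codext
    #   >>> codext.encode("this is a test", "base32")
    #   'ORUGS4ZANFZSAYJAORSXG5A='
    #   >>> [e for _, e in codext.rank("ORUGS4ZANFZSAYJAORSXG5A=")][:3]  # doctest: +SKIP
    #  the trailing '=' triggers the padding bonus and the 32-character charset matches base32's len_charset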
+    # exclude negative (and possibly null) scores as they are (hopefully) not relevant
+    if extended and s >= .0 or not extended and s > .0:
+        return s, new_input
+
+
+def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, include=None, exclude=None, found=(),
+          stop=True, show=False, scoring_heuristic=True, extended=False, debug=False):
+    """ Try decoding without the knowledge of the encoding(s).
+
+    :param input: input text to be guessed
+    :param stop_func: function defining the stop condition
+    :param min_depth: minimum search depth
+    :param max_depth: maximum search depth
+    :param include: inclusion item OR list with category, codec or encoding names OR dictionary with lists per
+                     depth (nothing means include every encoding)
+    :param exclude: exclusion item OR list with category, codec or encoding names OR dictionary with lists per
+                     depth (nothing means exclude no encoding)
+    :param found: tuple of already found encodings
+    :param stop: whether to stop or not when a valid solution is found
+    :param show: whether to show solutions immediately once found
+    :param scoring_heuristic: whether to apply the scoring heuristic during the search (if disabled, all scores are 1.,
+                               meaning that every non-failing encoding will be considered with no order of precedence)
+    :param extended: whether to also consider null scores with the heuristic
+    :param debug: whether to show each attempt at each depth during computation
+    """
+    if len(input) == 0:
+        return ""
+    # check for min and max depths
+    if max_depth <= 0:
+        raise ValueError("Max depth must be a positive integer")
+    if min_depth > max_depth:
+        raise ValueError("Min depth must be less than or equal to max depth")
+    # take the tuple of found encodings into account
+    if len(found) > 0:
+        for encoding in found:
+            input = decode(input, encoding)
+    # handle the stop function as a regex if a string was given
+    if isinstance(stop_func, str):
+        stop_func = stopfunc.regex(stop_func)
+    # reformat include and exclude arguments ; supported formats:
+    for n, l in zip(["inc", "exc"], [include, exclude]):
+        if l is None:
+            if n == "inc":
+                include = l = {-1: CODECS_CATEGORIES}
+            else:
+                exclude = l = {}
+        # "category" OR "enc_name" OR whatever => means a single item for all depths
+        if isinstance(l, str):
+            if n == "inc":
+                include = l = {-1: [l]}
+            else:
+                exclude = l = {-1: [l]}
+        # ["enc_name1", "enc_name2", ...] => means for all depths
+        if isinstance(l, (list, tuple)):
+            if n == "inc":
+                include = l = {-1: l}
+            else:
+                exclude = l = {-1: l}
+        # {-1: [...], 2: [...], ...} => means predefined depths with their lists of in-/excluded encodings
+        if not isinstance(l, dict) or not all(isinstance(k, int) for k in l.keys()):
+            raise ValueError("Include/exclude arguments shall be a list or a dictionary with integer keys")
+    # precompute encodings lists per depth and cache the related CodecInfo objects
+    encodings, result = __make_encodings_dict(include, exclude), {}
+    try:
+        # breadth-first search
+        for d in range(max_depth):
+            __guess("", input, stop_func, 0, d+1, min_depth, encodings, result, tuple(found), stop, show,
+                    scoring_heuristic, extended, debug)
+            if stop and len(result) > 0:
+                break
+    except KeyboardInterrupt:
+        pass
+    CODECS_CACHE.clear()  # NB: rebinding with "CODECS_CACHE = {}" here would only create a local name
+    return result
+codecs.guess = guess
+
+
+def rank(input, extended=False, limit=-1, include=None, exclude=None):
+    """ Rank the most probable encodings based on the given input.
+ + :param input: input text to be evaluated + :param extended: whether to consider null scores too (NB: negative scores are not output !) + :param limit: number of encodings to be returned (-1 means all of them) + :param include: inclusion list with category, codec or encoding names (nothing means include every encoding) + :param exclude: exclusion list with category, codec or encoding names (nothing means exclude no encoding) + """ + encodings = __make_encodings_dict(include if isinstance(include, dict) else {-1: include or CODECS_CATEGORIES}, + exclude if isinstance(exclude, dict) else {-1: exclude or []}) + r = list(__rank(None, input, "", encodings[-1], True, extended, True)) + return r[:limit] if len(r) > 1 else r +codecs.rank = rank + diff --git a/src/codext/__init__.py b/src/codext/__init__.py index f95abb8..67d6b5a 100644 --- a/src/codext/__init__.py +++ b/src/codext/__init__.py @@ -1,255 +1,257 @@ -# -*- coding: UTF-8 -*- -"""Codecs extension module. - -""" -from __future__ import print_function -from _codecs import lookup as orig_lookup -from ast import literal_eval -from six import binary_type, text_type - -from .__common__ import * -from .__info__ import __author__, __copyright__, __email__, __license__, __source__, __version__ - - -__all__ = ["add", "add_map", "clear", "decode", "encode", "guess", "lookup", "open", "rank", "register", "remove", - "reset"] - -decode = codecs.decode -encode = codecs.encode -guess = codecs.guess -lookup = codecs.lookup -open = codecs.open - -_lst = list -list = list_encodings # not included in __all__ because of shadow name - - -reset() - - -def __format_list(items, include=True): - if items is None: - return - d = {-1: list_encodings() if include else []} - for n, i in enumerate(items): - try: - depth, i = i.split(":") - depth = int(depth.strip().replace("~", "-")) - if depth < 0: - depth = -1 - except ValueError: - if n == 0: - d[-1] = [] - depth = -1 - d.setdefault(depth, []) - d[depth].append(i.strip()) - return d - - -def __print_tabular(lst, space=4): - try: - cols, _ = os.get_terminal_size() - # first, convert the list to a table that fits into the terminal - i, line, w = 0, "", [] - while i < len(lst): - x = lst[i] - l = len(x) - col = "%-{}s".format(l + space) % x - i += 1 - w.append(l) - if len(line) + len(col) > cols: - break - line += col - while True: - t = [lst[j:j+i] for j in range(0, len(lst), i)] - w = [max(0 if j+k*i >= len(lst) else len(lst[j+k*i]) for k in range(len(t))) for j, _ in enumerate(w)] - if sum(w) + space * len(w) >= cols: - i -= 1 - w.pop() - else: - break - print("\n".join("".join("%-{}s".format(w[n] + space) % x for n, x in enumerate(r)) for r in t) + "\n") - except (AttributeError, OSError): - print(", ".join(lst) + "\n") - - -def main(): - import argparse, os - - class _CustomFormatter(argparse.RawTextHelpFormatter): - def __init__(self, prog, **kwargs): - kwargs['max_help_position'] = 32 - super(_CustomFormatter, self).__init__(prog, **kwargs) - - def _format_action_invocation(self, action): - if not action.option_strings: - metavar, = self._metavar_formatter(action, action.dest)(1) - return metavar - else: - return ", ".join(action.option_strings) - - descr = "Codecs Extension (CodExt) {}\n\nAuthor : {} ({})\nCopyright: {}\nLicense : {}\nSource : {}\n" \ - "\nThis tool allows to encode/decode input strings/files with an extended set of codecs.\n\n" \ - .format(__version__, __author__, __email__, __copyright__, __license__, __source__) - examples = "usage examples:\n- " + "\n- ".join([ - "codext search 
bitcoin", - "codext decode base32 -i file.b32", - "codext encode morse < to_be_encoded.txt", - "echo \"test\" | codext encode base100", - "echo -en \"test\" | codext encode braille -o test.braille", - "codext encode base64 < to_be_encoded.txt > text.b64", - "echo -en \"test\" | codext encode base64 | codext encode base32", - "echo -en \"mrdvm6teie6t2cq=\" | codext encode upper | codext decode base32 | codext decode base64", - "echo -en \"test\" | codext encode upper reverse base32 | codext decode base32 reverse lower", - "echo -en \"test\" | codext encode upper reverse base32 base64 morse", - "echo -en \"test\" | codext encode base64 gzip | codext guess", - "echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base", - ]) - kw = {'formatter_class': _CustomFormatter} - parser = argparse.ArgumentParser(description=descr, epilog=examples, **kw) - kw2 = {'required': True} if PY3 else {} - sparsers = parser.add_subparsers(dest="command", help="command to be executed", **kw2) - parser.add_argument("-i", "--input-file", dest="infile", help="input file (if none, take stdin as input)") - parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)") - parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip", - help="strip newlines from input (default: False)") - encode = sparsers.add_parser("encode", help="encode input using the specified codecs", **kw) - encode.add_argument("encoding", nargs="+", help="list of encodings to apply") - encode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], - help="error handling (default: strict)") - decode = sparsers.add_parser("decode", help="decode input using the specified codecs", **kw) - decode.add_argument("encoding", nargs="+", help="list of encodings to apply") - decode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], - help="error handling (default: strict)") - guess = sparsers.add_parser("guess", help="try guessing the decoding codecs", **kw) - guess.add_argument("encoding", nargs="*", help="list of known encodings to apply (default: none)") - guess.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely not used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - guess.add_argument("-E", "--extended", action="store_true", - help="while using the scoring heuristic, also consider null scores (default: False)") - lng = "lang_%s" % LANG - def_func = lng if getattr(stopfunc, lng, None) else "text" - guess.add_argument("-f", "--stop-function", default=def_func, metavar="FUNC", help="result checking function " - "(default: %s) ; format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-" - "sensitive ; add -i to force it as case-insensitive or add '(?i)' in front of the expression" - % def_func) - guess.add_argument("-H", "--no-heuristic", action="store_true", help="DO NOT use the scoring heuristic ; slows down" - " the search but may be more accurate (default: False)") - guess.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - guess.add_argument("-I", "--case-insensitive", dest="icase", action="store_true", - help="while using the regex stop function, set it as 
case-insensitive (default: False)") - if len(stopfunc.LANG_BACKENDS) > 0: - _lb = stopfunc.LANG_BACKEND - guess.add_argument("-l", "--lang-backend", default=_lb, choices=stopfunc.LANG_BACKENDS + ["none"], - help="natural language detection backend (default: %s)" % _lb) - guess.add_argument("-m", "--min-depth", default=0, type=int, metavar="INT", - help="minimum codec search depth before triggering results (default: 0)") - guess.add_argument("-M", "--max-depth", default=5, type=int, metavar="INT", - help="maximum codec search depth (default: 5)") - guess.add_argument("-s", "--do-not-stop", action="store_true", - help="do not stop if a valid output is found (default: False)") - guess.add_argument("-v", "--verbose", action="store_true", - help="show guessing information and steps (default: False)") - rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw) - rank.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely not used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - rank.add_argument("-E", "--extended", action="store_true", - help="while using the scoring heuristic, also consider null scores (default: False)") - rank.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results") - search = sparsers.add_parser("search", help="search for codecs") - search.add_argument("pattern", nargs="+", help="encoding pattern to search") - listi = sparsers.add_parser("list", help="list items") - lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed", **kw2) - liste = lsparsers.add_parser("encodings", help="list encodings") - liste.add_argument("category", nargs="+", help="selected categories") - listm = lsparsers.add_parser("macros", help="list macros") - addm = sparsers.add_parser("add-macro", help="add a macro to the registry") - addm.add_argument("name", help="macro's name") - addm.add_argument("encoding", nargs="+", help="list of encodings to chain") - remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry") - remm.add_argument("name", help="macro's name") - args = parser.parse_args() - if args.command in ["guess", "rank"]: - args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False) - try: - # if a search pattern is given, only handle it - if args.command == "search": - results = [] - for enc in args.pattern: - results.extend(codecs.search(enc)) - print(", ".join(results) or "No encoding found") - return 0 - # add/remove macros (not requiring to input a file or text) - elif args.command == "add-macro": - add_macro(args.name, *args.encoding) - return 0 - elif args.command == "remove-macro": - remove_macro(args.name) - return 0 - # list encodings or macros - elif args.command == "list": - if args.type == "encodings": - cats = args.category or list_categories() - for c in sorted(cats): - l = list_encodings(c) - if len(l) > 0: - if len(cats) > 0: - print(c.upper() + ":") - __print_tabular(l) - elif args.type == "macros": - l = list_macros() - if len(l) > 0: - __print_tabular(l) - return 0 - # handle input file or stdin - c =_input(args.infile) - c = c.rstrip("\r\n") if isinstance(c, 
str) else c.rstrip(b"\r\n") - # strip any other (CR)LF - if args.strip: - c = re.sub(r"\r?\n", "", c) if isinstance(c, str) else c.replace(b"\r\n", b"").replace(b"\n", b"") - if args.command in ["decode", "encode"]: - # encode or decode - for encoding in args.encoding: - c = getattr(codecs, ["encode", "decode"][args.command == "decode"])(c, encoding, args.errors) - # handle output file or stdout - if args.outfile: - with open(args.outfile, 'wb') as f: - f.write(c) - else: - print(ensure_str(c or "Could not %scode :-(" % ["en", "de"][args.command == "decode"]), end="") - elif args.command == "guess": - s, lb = args.stop_function, args.lang_backend - if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ - all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): - stopfunc._reload_lang(lb) - r = codecs.guess(c, - getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), args.min_depth, args.max_depth, - args.include, args.exclude, args.encoding, not args.do_not_stop, True, # show - not args.no_heuristic, args.extended, args.verbose) - for i, o in enumerate(r.items()): - e, out = o - if len(e) > 0: - if args.outfile: - n, ext = os.path.splitext(args.outfile) - fn = args.outfile if len(r) == 1 else "%s-%d%s" % (n, i+1, ext) - else: - print("Codecs: %s" % ", ".join(e)) - print(ensure_str(out)) - if len(r) == 0: - print("Could not decode :-(") - elif args.command == "rank": - for i, e in codecs.rank(c, args.extended, args.limit, args.include, args.exclude): - s = "[+] %.5f: %s" % (i[0], e) - print(s if len(s) <= 80 else s[:77] + "...") - except Exception as e: - raise e - m = str(e) - print("codext: " + m[0].lower() + m[1:]) - +# -*- coding: UTF-8 -*- +"""Codecs extension module. + +""" +from .__common__ import * +from .__info__ import __author__, __copyright__, __email__, __license__, __source__, __version__ + + +__all__ = ["add", "add_map", "clear", "decode", "encode", "guess", "lookup", "open", "rank", "register", "remove", + "reset"] + +decode = codecs.decode +encode = codecs.encode +guess = codecs.guess +lookup = codecs.lookup +open = codecs.open + +_lst = list +list = list_encodings # not included in __all__ because of shadow name + + +reset() + + +# populate codext with attributes from codecs that were not modified +for attr in codecs.__all__: + if attr in __all__: + continue + locals()[attr] = getattr(codecs, attr) + __all__.append(attr) + + +def __format_list(items, include=True): + if items is None: + return + d = {-1: list_encodings() if include else []} + for n, i in enumerate(items): + try: + depth, i = i.split(":") + depth = int(depth.strip().replace("~", "-")) + if depth < 0: + depth = -1 + except ValueError: + if n == 0: + d[-1] = [] + depth = -1 + d.setdefault(depth, []) + d[depth].append(i.strip()) + return d + + +def __print_tabular(lst, space=4): + try: + cols, _ = os.get_terminal_size() + # first, convert the list to a table that fits into the terminal + i, line, w = 0, "", [] + while i < len(lst): + x = lst[i] + l = len(x) + col = "%-{}s".format(l + space) % x + i += 1 + w.append(l) + if len(line) + len(col) > cols: + break + line += col + while True: + t = [lst[j:j+i] for j in range(0, len(lst), i)] + w = [max(0 if j+k*i >= len(lst) else len(lst[j+k*i]) for k in range(len(t))) for j, _ in enumerate(w)] + if sum(w) + space * len(w) >= cols: + i -= 1 + w.pop() + else: + break + print("\n".join("".join("%-{}s".format(w[n] + space) % x for n, x in enumerate(r)) for r in t) + "\n") + except (AttributeError, OSError): + print(", ".join(lst) + "\n") + + +def main(): + 
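    # [editor's illustration, not part of the original patch] this CLI wraps the library API re-exported above ;
    #  the equivalent programmatic calls, assuming codext is installed, look like:
    #   >>> import codext
    #   >>> codext.encode("test", "base64")
    #   'dGVzdA=='
    #   >>> codext.decode("dGVzdA==", "base64")
    #   'test'
    #   >>> codext.guess("dGVzdA==", stop_func="test")  # doctest: +SKIP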
import argparse, os + + class _CustomFormatter(argparse.RawTextHelpFormatter): + def __init__(self, prog, **kwargs): + kwargs['max_help_position'] = 32 + super(_CustomFormatter, self).__init__(prog, **kwargs) + + def _format_action_invocation(self, action): + if not action.option_strings: + metavar, = self._metavar_formatter(action, action.dest)(1) + return metavar + else: + return ", ".join(action.option_strings) + + descr = "Codecs Extension (CodExt) {}\n\nAuthor : {} ({})\nCopyright: {}\nLicense : {}\nSource : {}\n" \ + "\nThis tool allows to encode/decode input strings/files with an extended set of codecs.\n\n" \ + .format(__version__, __author__, __email__, __copyright__, __license__, __source__) + examples = "usage examples:\n- " + "\n- ".join([ + "codext search bitcoin", + "codext decode base32 -i file.b32", + "codext encode morse < to_be_encoded.txt", + "echo \"test\" | codext encode base100", + "echo -en \"test\" | codext encode braille -o test.braille", + "codext encode base64 < to_be_encoded.txt > text.b64", + "echo -en \"test\" | codext encode base64 | codext encode base32", + "echo -en \"mrdvm6teie6t2cq=\" | codext encode upper | codext decode base32 | codext decode base64", + "echo -en \"test\" | codext encode upper reverse base32 | codext decode base32 reverse lower", + "echo -en \"test\" | codext encode upper reverse base32 base64 morse", + "echo -en \"test\" | codext encode base64 gzip | codext guess", + "echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base", + ]) + kw = {'formatter_class': _CustomFormatter} + parser = argparse.ArgumentParser(description=descr, epilog=examples, **kw) + sparsers = parser.add_subparsers(dest="command", help="command to be executed", required=True) + parser.add_argument("-i", "--input-file", dest="infile", help="input file (if none, take stdin as input)") + parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)") + parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip", + help="strip newlines from input (default: False)") + encode = sparsers.add_parser("encode", help="encode input using the specified codecs", **kw) + encode.add_argument("encoding", nargs="+", help="list of encodings to apply") + encode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], + help="error handling (default: strict)") + decode = sparsers.add_parser("decode", help="decode input using the specified codecs", **kw) + decode.add_argument("encoding", nargs="+", help="list of encodings to apply") + decode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], + help="error handling (default: strict)") + guess = sparsers.add_parser("guess", help="try guessing the decoding codecs", **kw) + guess.add_argument("encoding", nargs="*", help="list of known encodings to apply (default: none)") + guess.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely not used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") + guess.add_argument("-E", "--extended", action="store_true", + help="while using the scoring heuristic, also consider null scores (default: False)") + lng = "lang_%s" % LANG + def_func = lng if getattr(stopfunc, lng, None) else "text" + guess.add_argument("-f", "--stop-function", default=def_func, metavar="FUNC", help="result checking function " + "(default: %s) ; 
format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-" + "sensitive ; add -i to force it as case-insensitive or add '(?i)' in front of the expression" + % def_func) + guess.add_argument("-H", "--no-heuristic", action="store_true", help="DO NOT use the scoring heuristic ; slows down" + " the search but may be more accurate (default: False)") + guess.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") + guess.add_argument("-I", "--case-insensitive", dest="icase", action="store_true", + help="while using the regex stop function, set it as case-insensitive (default: False)") + if len(stopfunc.LANG_BACKENDS) > 0: + _lb = stopfunc.LANG_BACKEND + guess.add_argument("-l", "--lang-backend", default=_lb, choices=stopfunc.LANG_BACKENDS + ["none"], + help="natural language detection backend (default: %s)" % _lb) + guess.add_argument("-m", "--min-depth", default=0, type=int, metavar="INT", + help="minimum codec search depth before triggering results (default: 0)") + guess.add_argument("-M", "--max-depth", default=5, type=int, metavar="INT", + help="maximum codec search depth (default: 5)") + guess.add_argument("-s", "--do-not-stop", action="store_true", + help="do not stop if a valid output is found (default: False)") + guess.add_argument("-v", "--verbose", action="store_true", + help="show guessing information and steps (default: False)") + rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw) + rank.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely not used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") + rank.add_argument("-E", "--extended", action="store_true", + help="while using the scoring heuristic, also consider null scores (default: False)") + rank.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") + rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results") + search = sparsers.add_parser("search", help="search for codecs") + search.add_argument("pattern", nargs="+", help="encoding pattern to search") + listi = sparsers.add_parser("list", help="list items") + lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed", required=True) + liste = lsparsers.add_parser("encodings", help="list encodings") + liste.add_argument("category", nargs="+", help="selected categories") + listm = lsparsers.add_parser("macros", help="list macros") + addm = sparsers.add_parser("add-macro", help="add a macro to the registry") + addm.add_argument("name", help="macro's name") + addm.add_argument("encoding", nargs="+", help="list of encodings to chain") + remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry") + remm.add_argument("name", help="macro's name") + args = parser.parse_args() + if args.command in ["guess", "rank"]: + args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False) + try: + # if a search pattern is given, only handle it + if args.command == "search": + results = [] + for enc in args.pattern: + results.extend(codecs.search(enc)) + 
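            # [editor's aside, not part of the original patch] the epilog example "codext search bitcoin" goes
            #  through this very loop ; the same lookup is available as a library call that matches the given
            #  pattern against the registered codecs' name patterns, e.g.:
            #   >>> codecs.search("base64")  # doctest: +SKIP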
print(", ".join(results) or "No encoding found") + return 0 + # add/remove macros (not requiring to input a file or text) + elif args.command == "add-macro": + add_macro(args.name, *args.encoding) + return 0 + elif args.command == "remove-macro": + remove_macro(args.name) + return 0 + # list encodings or macros + elif args.command == "list": + if args.type == "encodings": + cats = args.category or list_categories() + for c in sorted(cats): + l = list_encodings(c) + if len(l) > 0: + if len(cats) > 0: + print(c.upper() + ":") + __print_tabular(l) + elif args.type == "macros": + l = list_macros() + if len(l) > 0: + __print_tabular(l) + return 0 + # handle input file or stdin + c =_input(args.infile) + c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") + # strip any other (CR)LF + if args.strip: + c = re.sub(r"\r?\n", "", c) if isinstance(c, str) else c.replace(b"\r\n", b"").replace(b"\n", b"") + if args.command in ["decode", "encode"]: + # encode or decode + for encoding in args.encoding: + c = getattr(codecs, ["encode", "decode"][args.command == "decode"])(c, encoding, args.errors) + # handle output file or stdout + if args.outfile: + with open(args.outfile, 'wb') as f: + f.write(c) + else: + print(ensure_str(c or "Could not %scode :-(" % ["en", "de"][args.command == "decode"]), end="") + elif args.command == "guess": + s, lb = args.stop_function, args.lang_backend + if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ + all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): + stopfunc._reload_lang(lb) + r = codecs.guess(c, + getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), args.min_depth, args.max_depth, + args.include, args.exclude, args.encoding, not args.do_not_stop, True, # show + not args.no_heuristic, args.extended, args.verbose) + for i, o in enumerate(r.items()): + e, out = o + if len(e) > 0: + if args.outfile: + n, ext = os.path.splitext(args.outfile) + fn = args.outfile if len(r) == 1 else "%s-%d%s" % (n, i+1, ext) + else: + print("Codecs: %s" % ", ".join(e)) + print(ensure_str(out)) + if len(r) == 0: + print("Could not decode :-(") + elif args.command == "rank": + for i, e in codecs.rank(c, args.extended, args.limit, args.include, args.exclude): + s = "[+] %.5f: %s" % (i[0], e) + print(s if len(s) <= 80 else s[:77] + "...") + except Exception as e: + raise e + m = str(e) + print("codext: " + m[0].lower() + m[1:]) + diff --git a/src/codext/base/_base.py b/src/codext/base/_base.py index fce8b9a..27a31e3 100755 --- a/src/codext/base/_base.py +++ b/src/codext/base/_base.py @@ -1,291 +1,290 @@ -# -*- coding: UTF-8 -*- -"""Generic baseN functions. 
- -""" -from argparse import ArgumentParser, RawTextHelpFormatter -from math import log -from six import integer_types, string_types -from string import ascii_lowercase as lower, ascii_uppercase as upper, digits, printable -from textwrap import wrap as wraptext -from types import FunctionType, MethodType - -from ..__common__ import * -from ..__common__ import _set_exc -from ..__info__ import __version__ - - -_set_exc("BaseError") -_set_exc("BaseEncodeError") -_set_exc("BaseDecodeError") -""" -Curve fitting: - ->>> import matplotlib.pyplot as plt ->>> import pandas as pd ->>> import scipy.optimize ->>> from statistics import mean ->>> from tinyscript import random ->>> x, y = [], [] ->>> for i in range(2, 256): - v = [] - for j in range(16, 2048, 16): - s = random.randstr(j) - v.append(float(len(codext.encode(s, "base%d-generic" % i))) / len(s)) - x.append(i) - y.append(mean(v)) ->>> data = pd.DataFrame({'base': x, 'expf': y}) ->>> def fit(x, y, func, params): - params, cv = scipy.optimize.curve_fit(func, x, y, params) - print(params) - y2 = func(x, *params) - plt.clf() - plt.plot(x, y, ".", color="blue", alpha=.3) - plt.plot(x, y2, color="red", linewidth=3.0) - plt.show() ->>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (1, 1, 1, 1)) -[ 0.02841434 0.00512664 -0.99999984 0.01543879] ->>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (.028, .005, -1, .015)) -[ 0.02827357 0.00510124 -0.99999984 0.01536941] -""" -EXPANSION_FACTOR = lambda base: 0.02827357 / (base**0.00510124-0.99999984) + 0.01536941 -SIZE_LIMIT = 1024 * 1024 * 1024 - - -def _generate_charset(n): - """ Generate a characters set. - - :param n: size of charset - """ - if 1 < n <= len(printable): - return printable[:n] - elif len(printable) < n < 256: - return "".join(chr(i) for i in range(n)) - raise ValueError("Bad size of character set") - - -def _get_charset(charset, p=""): - """ Characters set selection function. It allows to define charsets in many different ways. 
- - :param charset: charset object, can be a string (the charset itself), a function (that chooses the right charset - depending on the input parameter) or a dictionary (either by exact key or by pattern matching) - :param p: the parameter for choosing the charset - """ - # case 1: charset is a function, so return its result - if isinstance(charset, FunctionType): - return charset(p) - # case 2: charset is a string, so return it - elif isinstance(charset, string_types): - return charset - # case 3: charset is a dict with keys '' and 'inv', typically for a charset using lowercase and uppercase characters - # that can be inverted - elif isinstance(charset, dict) and list(charset.keys()) == ["", "inv"]: - return charset["inv" if re.match(r"[-_]inv(erted)?$", p) else ""] - # case 4: charset is a dict, but not with the specific keys '' and 'inv', so consider it as pattern-charset pairs - elif isinstance(charset, dict): - # try to handle [p]arameter as a simple key - try: - return charset[p] - except KeyError: - pass - # or handle [p]arameter as a pattern - default, n, best = None, None, None - for pattern, cset in charset.items(): - n = len(cset) - if re.match(pattern, ""): - default = cset - continue - m = re.match(pattern, p) - if m: # find the longest match from the patterns - s, e = m.span() - if e - s > len(best or ""): - best = pattern - if best: - return charset[best] - # special case: the given [p]arameter can be the charset itself if it has the right length - p = re.sub(r"^[-_]+", "", p) - if len(p) == n: - return p - # or simply rely on key '' - if default is not None: - return default - raise ValueError("Bad charset descriptor ('%s')" % p) - - -# generic base en/decoding functions -def base_encode(input, charset, errors="strict", exc=BaseEncodeError): - """ Base-10 to base-N encoding. - - :param input: input (str or int) to be decoded - :param charset: base-N characters set - :param errors: errors handling marker - :param exc: exception to be raised in case of error - """ - i, n, r = input if isinstance(input, integer_types) else s2i(input), len(charset), "" - if n == 1: - if i > SIZE_LIMIT: - raise InputSizeLimitError("Input exceeded size limit") - return i * charset[0] - if n == 10: - return str(i) if charset == digits else "".join(charset[int(x)] for x in str(i)) - while i > 0: - i, c = divmod(i, n) - r = charset[c] + r - return r - - -def base_decode(input, charset, errors="strict", exc=BaseDecodeError): - """ Base-N to base-10 decoding. - - :param input: input to be decoded - :param charset: base-N characters set - :param errors: errors handling marker - :param exc: exception to be raised in case of error - """ - i, n, dec = 0, len(charset), lambda n: base_encode(n, [chr(x) for x in range(256)], errors, exc) - if n == 1: - return i2s(len(input)) - if n == 10: - return i2s(int(input)) if charset == digits else "".join(str(charset.index(c)) for c in input) - for k, c in enumerate(input): - try: - i = i * n + charset.index(c) - except ValueError: - handle_error("base", errors, exc, decode=True)(c, k, dec(i), "base%d" % n) - return dec(i) - - -# base codec factory functions -def base(charset, pattern, pow2=False, encode_template=base_encode, decode_template=base_decode, name=None, **kwargs): - """ Base-N codec factory. 
- - :param charset: charset selection function - :param pattern: matching pattern for the codec name (first capturing group is used as the parameter for selecting - the charset) - :param pow2: whether the base codec's N is a power of 2 - """ - cs = _get_charset(charset) - n = len(cs) - nb = log(n, 2) - if pow2 and nb != int(nb): - raise BaseError("Bad charset ; {} is not a power of 2".format(n)) - - def encode(param="", *args): - a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) - def _encode(input, errors="strict"): - if len(input) == 0: - return "", 0 - return encode_template(input, a, errors), len(input) - return _encode - - def decode(param="", *args): - a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) - sl, sc = "\n" not in a, "\n" not in a and not "\r" in a - def _decode(input, errors="strict"): - if len(input) == 0: - return "", 0 - input = _stripl(input, sc, sl) - return decode_template(input, a, errors), len(input) - return _decode - - kwargs['len_charset'] = n - kwargs['printables_rate'] = float(len([c for c in cs if c in printable])) / len(cs) - kwargs['expansion_factor'] = kwargs.pop('expansion_factor', (EXPANSION_FACTOR(n), .05)) - n = "base{}".format(n) if name is None else name - try: - g = [n, n + "-inv"] if "[-_]inv(erted)?$" in charset.keys() else [n] - except AttributeError: - g = [n] - kwargs['guess'] = kwargs.get('guess', g) - add(n, encode, decode, pattern, entropy=nb, **kwargs) - - -def base_generic(): - """ Base-N generic codec. """ - def encode(n): - a = _generate_charset(int(n)) - def _encode(input, errors="strict"): - return base_encode(input, a, errors), len(input) - return _encode - - def decode(n): - a = _generate_charset(int(n)) - sl, sc = "\n" not in a, "\n" not in a and not "\r" in a - def _decode(input, errors="strict"): - input = _stripl(input, sc, sl) - return base_decode(input, a, errors), len(input) - return _decode - - add("base", encode, decode, r"^base[-_]?([2-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:[-_]generic)?$", - guess=["base%d-generic" % i for i in range(2, 255)], entropy=lambda e, n: log(int(n.split("-")[0][4:]), 2), - len_charset=lambda n: int(n.split("-")[0][4:]), printables_rate=1., category="base-generic", penalty=.4, - expansion_factor=lambda f, n: (EXPANSION_FACTOR(int(n.split("-")[0][4:])), .05)) - - -def main(n, ref=None, alt=None, inv=True, swap=True, wrap=True): - base = str(n) + ("-" + alt.lstrip("-") if alt else "") - src = "The data are encoded as described for the base%(base)s alphabet in %(reference)s.\n" % \ - {'base': base, 'reference': "\n" + ref if len(ref) > 20 else ref} if ref else "" - text = "%(source)sWhen decoding, the input may contain newlines in addition to the bytes of the formal base" \ - "%(base)s alphabet. Use --ignore-garbage to attempt to recover from any other non-alphabet bytes in the" \ - " encoded stream." % {'base': base, 'source': src} - text = "\n".join(x for x in wraptext(text, 74)) - descr = """Usage: base%(base)s [OPTION]... [FILE] -Base%(base)s encode or decode FILE, or standard input, to standard output. - -With no FILE, or when FILE is -, read standard input. - -Mandatory arguments to long options are mandatory for short options too. 
- -d, --decode decode data - -i, --ignore-garbage when decoding, ignore non-alphabet characters -%(inv)s%(swap)s%(wrap)s - - --help display this help and exit - --version output version information and exit - -%(text)s - -Report base%(base)s translation bugs to -Full documentation at: -""" % {'base': base, 'text': text, - 'inv': ["", " -I, --invert invert charsets from the base alphabet (e.g. digits and letters)\n"][inv], - 'swap': ["", " -s, --swapcase swap the case\n"][swap], - 'wrap': ["", " -w, --wrap=COLS wrap encoded lines after COLS character (default 76).\n"+ 26 * " " + \ - "Use 0 to disable line wrapping"][wrap]} - - def _main(): - p = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) - p.format_help = MethodType(lambda s: s.description, p) - p.add_argument("file", nargs="?") - p.add_argument("-d", "--decode", action="store_true") - p.add_argument("-i", "--ignore-garbage", action="store_true") - if inv: - p.add_argument("-I", "--invert", action="store_true") - if swap: - p.add_argument("-s", "--swapcase", action="store_true") - if wrap: - p.add_argument("-w", "--wrap", type=int, default=76) - p.add_argument("--help", action="help") - p.add_argument("--version", action="version") - p.version = "CodExt " + __version__ - args = p.parse_args() - if args.decode: - args.wrap = 0 - args.invert = getattr(args, "invert", False) - c, f = _input(args.file), [encode, decode][args.decode] - if swap and args.swapcase and args.decode: - c = codecs.decode(c, "swapcase") - c = b(c).rstrip(b"\r\n") - try: - c = f(c, "base" + base + ["", "-inv"][getattr(args, "invert", False)], - ["strict", "ignore"][args.ignore_garbage]) - except Exception as err: - print("%sbase%s: invalid input" % (getattr(err, "output", ""), base)) - return 1 - c = ensure_str(c) - if swap and args.swapcase and not args.decode: - c = codecs.encode(c, "swapcase") - for l in (wraptext(c, args.wrap) if args.wrap > 0 else [c]) if wrap else c.split("\n"): - print(l) - return 0 - return _main - +# -*- coding: UTF-8 -*- +"""Generic baseN functions. 
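
For instance (an illustrative sketch added at edit time, not part of the original patch), the generic helpers
defined below convert through base-10 integers:

>>> base_encode(255, "01")
'11111111'
>>> base_decode("11111111", "01")
'ÿ'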
+ +""" +from argparse import ArgumentParser, RawTextHelpFormatter +from math import log +from string import ascii_lowercase as lower, ascii_uppercase as upper, digits, printable +from textwrap import wrap as wraptext +from types import FunctionType, MethodType + +from ..__common__ import * +from ..__common__ import _set_exc +from ..__info__ import __version__ + + +_set_exc("BaseError") +_set_exc("BaseEncodeError") +_set_exc("BaseDecodeError") +""" +Curve fitting: + +>>> import matplotlib.pyplot as plt +>>> import pandas as pd +>>> import scipy.optimize +>>> from statistics import mean +>>> from tinyscript import random +>>> x, y = [], [] +>>> for i in range(2, 256): + v = [] + for j in range(16, 2048, 16): + s = random.randstr(j) + v.append(float(len(codext.encode(s, "base%d-generic" % i))) / len(s)) + x.append(i) + y.append(mean(v)) +>>> data = pd.DataFrame({'base': x, 'expf': y}) +>>> def fit(x, y, func, params): + params, cv = scipy.optimize.curve_fit(func, x, y, params) + print(params) + y2 = func(x, *params) + plt.clf() + plt.plot(x, y, ".", color="blue", alpha=.3) + plt.plot(x, y2, color="red", linewidth=3.0) + plt.show() +>>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (1, 1, 1, 1)) +[ 0.02841434 0.00512664 -0.99999984 0.01543879] +>>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (.028, .005, -1, .015)) +[ 0.02827357 0.00510124 -0.99999984 0.01536941] +""" +EXPANSION_FACTOR = lambda base: 0.02827357 / (base**0.00510124-0.99999984) + 0.01536941 +SIZE_LIMIT = 1024 * 1024 * 1024 + + +def _generate_charset(n): + """ Generate a characters set. + + :param n: size of charset + """ + if 1 < n <= len(printable): + return printable[:n] + elif len(printable) < n < 256: + return "".join(chr(i) for i in range(n)) + raise ValueError("Bad size of character set") + + +def _get_charset(charset, p=""): + """ Characters set selection function. It allows to define charsets in many different ways. 
+ + :param charset: charset object, can be a string (the charset itself), a function (that chooses the right charset + depending on the input parameter) or a dictionary (either by exact key or by pattern matching) + :param p: the parameter for choosing the charset + """ + # case 1: charset is a function, so return its result + if isinstance(charset, FunctionType): + return charset(p) + # case 2: charset is a string, so return it + elif isinstance(charset, str): + return charset + # case 3: charset is a dict with keys '' and 'inv', typically for a charset using lowercase and uppercase characters + # that can be inverted + elif isinstance(charset, dict) and list(charset.keys()) == ["", "inv"]: + return charset["inv" if re.match(r"[-_]inv(erted)?$", p) else ""] + # case 4: charset is a dict, but not with the specific keys '' and 'inv', so consider it as pattern-charset pairs + elif isinstance(charset, dict): + # try to handle [p]arameter as a simple key + try: + return charset[p] + except KeyError: + pass + # or handle [p]arameter as a pattern + default, n, best = None, None, None + for pattern, cset in charset.items(): + n = len(cset) + if re.match(pattern, ""): + default = cset + continue + m = re.match(pattern, p) + if m: # find the longest match from the patterns + s, e = m.span() + if e - s > len(best or ""): + best = pattern + if best: + return charset[best] + # special case: the given [p]arameter can be the charset itself if it has the right length + p = re.sub(r"^[-_]+", "", p) + if len(p) == n: + return p + # or simply rely on key '' + if default is not None: + return default + raise ValueError("Bad charset descriptor ('%s')" % p) + + +# generic base en/decoding functions +def base_encode(input, charset, errors="strict", exc=BaseEncodeError): + """ Base-10 to base-N encoding. + + :param input: input (str or int) to be decoded + :param charset: base-N characters set + :param errors: errors handling marker + :param exc: exception to be raised in case of error + """ + i, n, r = input if isinstance(input, int) else s2i(input), len(charset), "" + if n == 1: + if i > SIZE_LIMIT: + raise InputSizeLimitError("Input exceeded size limit") + return i * charset[0] + if n == 10: + return str(i) if charset == digits else "".join(charset[int(x)] for x in str(i)) + while i > 0: + i, c = divmod(i, n) + r = charset[c] + r + return r + + +def base_decode(input, charset, errors="strict", exc=BaseDecodeError): + """ Base-N to base-10 decoding. + + :param input: input to be decoded + :param charset: base-N characters set + :param errors: errors handling marker + :param exc: exception to be raised in case of error + """ + i, n, dec = 0, len(charset), lambda n: base_encode(n, [chr(x) for x in range(256)], errors, exc) + if n == 1: + return i2s(len(input)) + if n == 10: + return i2s(int(input)) if charset == digits else "".join(str(charset.index(c)) for c in input) + for k, c in enumerate(input): + try: + i = i * n + charset.index(c) + except ValueError: + handle_error("base", errors, exc, decode=True)(c, k, dec(i), "base%d" % n) + return dec(i) + + +# base codec factory functions +def base(charset, pattern, pow2=False, encode_template=base_encode, decode_template=base_decode, name=None, **kwargs): + """ Base-N codec factory. 
+ + :param charset: charset selection function + :param pattern: matching pattern for the codec name (first capturing group is used as the parameter for selecting + the charset) + :param pow2: whether the base codec's N is a power of 2 + """ + cs = _get_charset(charset) + n = len(cs) + nb = log(n, 2) + if pow2 and nb != int(nb): + raise BaseError("Bad charset ; {} is not a power of 2".format(n)) + + def encode(param="", *args): + a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) + def _encode(input, errors="strict"): + if len(input) == 0: + return "", 0 + return encode_template(input, a, errors), len(input) + return _encode + + def decode(param="", *args): + a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) + sl, sc = "\n" not in a, "\n" not in a and not "\r" in a + def _decode(input, errors="strict"): + if len(input) == 0: + return "", 0 + input = _stripl(input, sc, sl) + return decode_template(input, a, errors), len(input) + return _decode + + kwargs['len_charset'] = n + kwargs['printables_rate'] = float(len([c for c in cs if c in printable])) / len(cs) + kwargs['expansion_factor'] = kwargs.pop('expansion_factor', (EXPANSION_FACTOR(n), .05)) + n = "base{}".format(n) if name is None else name + try: + g = [n, n + "-inv"] if "[-_]inv(erted)?$" in charset.keys() else [n] + except AttributeError: + g = [n] + kwargs['guess'] = kwargs.get('guess', g) + add(n, encode, decode, pattern, entropy=nb, **kwargs) + + +def base_generic(): + """ Base-N generic codec. """ + def encode(n): + a = _generate_charset(int(n)) + def _encode(input, errors="strict"): + return base_encode(input, a, errors), len(input) + return _encode + + def decode(n): + a = _generate_charset(int(n)) + sl, sc = "\n" not in a, "\n" not in a and not "\r" in a + def _decode(input, errors="strict"): + input = _stripl(input, sc, sl) + return base_decode(input, a, errors), len(input) + return _decode + + add("base", encode, decode, r"^base[-_]?([2-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:[-_]generic)?$", + guess=["base%d-generic" % i for i in range(2, 255)], entropy=lambda e, n: log(int(n.split("-")[0][4:]), 2), + len_charset=lambda n: int(n.split("-")[0][4:]), printables_rate=1., category="base-generic", penalty=.4, + expansion_factor=lambda f, n: (EXPANSION_FACTOR(int(n.split("-")[0][4:])), .05)) + + +def main(n, ref=None, alt=None, inv=True, swap=True, wrap=True): + base = str(n) + ("-" + alt.lstrip("-") if alt else "") + src = "The data are encoded as described for the base%(base)s alphabet in %(reference)s.\n" % \ + {'base': base, 'reference': "\n" + ref if len(ref) > 20 else ref} if ref else "" + text = "%(source)sWhen decoding, the input may contain newlines in addition to the bytes of the formal base" \ + "%(base)s alphabet. Use --ignore-garbage to attempt to recover from any other non-alphabet bytes in the" \ + " encoded stream." % {'base': base, 'source': src} + text = "\n".join(x for x in wraptext(text, 74)) + descr = """Usage: base%(base)s [OPTION]... [FILE] +Base%(base)s encode or decode FILE, or standard input, to standard output. + +With no FILE, or when FILE is -, read standard input. + +Mandatory arguments to long options are mandatory for short options too. 
+ -d, --decode decode data + -i, --ignore-garbage when decoding, ignore non-alphabet characters +%(inv)s%(swap)s%(wrap)s + + --help display this help and exit + --version output version information and exit + +%(text)s + +Report base%(base)s translation bugs to +Full documentation at: +""" % {'base': base, 'text': text, + 'inv': ["", " -I, --invert invert charsets from the base alphabet (e.g. digits and letters)\n"][inv], + 'swap': ["", " -s, --swapcase swap the case\n"][swap], + 'wrap': ["", " -w, --wrap=COLS wrap encoded lines after COLS character (default 76).\n"+ 26 * " " + \ + "Use 0 to disable line wrapping"][wrap]} + + def _main(): + p = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) + p.format_help = MethodType(lambda s: s.description, p) + p.add_argument("file", nargs="?") + p.add_argument("-d", "--decode", action="store_true") + p.add_argument("-i", "--ignore-garbage", action="store_true") + if inv: + p.add_argument("-I", "--invert", action="store_true") + if swap: + p.add_argument("-s", "--swapcase", action="store_true") + if wrap: + p.add_argument("-w", "--wrap", type=int, default=76) + p.add_argument("--help", action="help") + p.add_argument("--version", action="version") + p.version = "CodExt " + __version__ + args = p.parse_args() + if args.decode: + args.wrap = 0 + args.invert = getattr(args, "invert", False) + c, f = _input(args.file), [encode, decode][args.decode] + if swap and args.swapcase and args.decode: + c = codecs.decode(c, "swapcase") + c = b(c).rstrip(b"\r\n") + try: + c = f(c, "base" + base + ["", "-inv"][getattr(args, "invert", False)], + ["strict", "ignore"][args.ignore_garbage]) + except Exception as err: + print("%sbase%s: invalid input" % (getattr(err, "output", ""), base)) + return 1 + c = ensure_str(c) + if swap and args.swapcase and not args.decode: + c = codecs.encode(c, "swapcase") + for l in (wraptext(c, args.wrap) if args.wrap > 0 else [c]) if wrap else c.split("\n"): + print(l) + return 0 + return _main + diff --git a/src/codext/base/base100.py b/src/codext/base/base100.py index f5faa1d..2287463 100755 --- a/src/codext/base/base100.py +++ b/src/codext/base/base100.py @@ -1,56 +1,47 @@ -# -*- coding: UTF-8 -*- -"""Base100 Codec - base100 content encoding. 
- -Note: only works in Python3 ; strongly inspired from https://github.com/MasterGroosha/pybase100 - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ._base import main -from ..__common__ import * - - -# no __examples__ ; handled manually in tests/test_base.py - - -def base100_encode(input, errors="strict"): - raise NotImplementedError - - -def base100_decode(input, errors="strict"): - raise NotImplementedError - - -if PY3: - class Base100DecodeError(ValueError): - __module__ = "builtins" - - def base100_encode(input, errors="strict"): - input = b(input) - r = [240, 159, 0, 0] * len(input) - for i, c in enumerate(input): - r[4*i+2] = (c + 55) // 64 + 143 - r[4*i+3] = (c + 55) % 64 + 128 - return bytes(r), len(input) - - def base100_decode(input, errors="strict"): - input = b(_stripl(input, True, True)) - if errors == "ignore": - input = input.replace(b"\n", "") - if len(input) % 4 != 0: - raise Base100DecodeError("Bad input (length should be multiple of 4)") - r = [None] * (len(input) // 4) - for i, c in enumerate(input): - if i % 4 == 2: - tmp = ((c - 143) * 64) % 256 - elif i % 4 == 3: - r[i//4] = (c - 128 + tmp - 55) & 0xff - return bytes(r), len(input) - - -add("base100", base100_encode, base100_decode, r"^(?:base[-_]?100|emoji)$", expansion_factor=1.) -main100 = main(100, "") - +# -*- coding: UTF-8 -*- +"""Base100 Codec - base100 content encoding. + +Note: only works in Python3 ; strongly inspired from https://github.com/MasterGroosha/pybase100 + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ._base import main +from ..__common__ import * + +# no __examples__ ; handled manually in tests/test_base.py + +class Base100DecodeError(ValueError): + __module__ = "builtins" + + +def base100_encode(input, errors="strict"): + input = b(input) + r = [240, 159, 0, 0] * len(input) + for i, c in enumerate(input): + r[4*i+2] = (c + 55) // 64 + 143 + r[4*i+3] = (c + 55) % 64 + 128 + return bytes(r), len(input) + + +def base100_decode(input, errors="strict"): + input = b(_stripl(input, True, True)) + if errors == "ignore": + input = input.replace(b"\n", b"") + if len(input) % 4 != 0: + raise Base100DecodeError("Bad input (length should be multiple of 4)") + r = [None] * (len(input) // 4) + for i, c in enumerate(input): + if i % 4 == 2: + tmp = ((c - 143) * 64) % 256 + elif i % 4 == 3: + r[i//4] = (c - 128 + tmp - 55) & 0xff + return bytes(r), len(input) + + +add("base100", base100_encode, base100_decode, r"^(?:base[-_]?100|emoji)$", expansion_factor=1.) +main100 = main(100, "") + diff --git a/src/codext/base/base122.py b/src/codext/base/base122.py index f580ff8..b326341 100755 --- a/src/codext/base/base122.py +++ b/src/codext/base/base122.py @@ -1,106 +1,98 @@ -# -*- coding: UTF-8 -*- -"""Base122 Codec - base122 content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ._base import main -from ..__common__ import * - - -__examples__ = { - 'enc(base122|base-122)': { - 'this is a test': ":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", - b'This is another longer test string with d1g1t5 and sp3c141 characters !\n': \ - b"*\x1a\xca\x97\x19\x01Rs\x10\x18-f{QPe9\x08\xcb\x86{9Ne9\x08\x0eF+Mh 9]\x0e\xd3\x8b" - b"9N ;Z.FA\x01H13L.C)\x01Bn2\x08\x0e7\x01MF1\x1a\x0c$\x06\x1b!Br0XnF+If \x10B@" - }, - 'enc-dec(base_122)': ["@random"], -} if PY3 else {'enc(base122': None} - - -_BAD = [0, 10, 13, 34, 38, 92] -_i = lambda c: c if isinstance(c, int) else ord(c) - - -def base122_encode(input, errors='strict'): - raise NotImplementedError - - -def base122_decode(input, errors='strict'): - raise NotImplementedError - - -if PY3: - # inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js - def base122_encode(input, errors="strict"): - idx, bit, r, l = 0, 0, [], len(input) - - def _get_7bits(idx, bit): - if idx >= l: - return idx, bit, False - B1 = _i(input[idx]) - p1 = (((254 >> bit) & B1) << bit) >> 1 - bit += 7 - if bit < 8: - return idx, bit, p1 - bit -= 8 - idx += 1 - if idx >= l: - return idx, bit, p1 - B2 = _i(input[idx]) - p2 = (((65280 >> bit) & B2) & 255) >> (8 - bit) - return idx, bit, (p1 | p2) - - while True: - if idx >= l: - break - # get seven bits of input data - idx, bit, B = _get_7bits(idx, bit) - # check for illegal chars - try: - bad_idx = _BAD.index(B) - except ValueError: - r.append(B) - continue - idx, bit, nB = _get_7bits(idx, bit) - if nB is False: - nB, bad_idx = B, 7 - B1, B2 = 194, 128 - B1 |= (7 & bad_idx) << 2 - B1 |= int((nB & 64) > 0) - B2 |= nB & 63 - r.extend([B1, B2]) - return "".join(map(chr, r)).encode("latin-1"), len(input) - - # inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js - def base122_decode(input, errors="strict"): - currB, bob, r, input = 0, 0, [], list(map(ord, input)) - - def _get_7bits(currB, bob, B, decoded): - B <<= 1 - currB |= (B % 0x100000000) >> bob - bob += 7 - if bob >= 8: - decoded += [currB] - bob -= 8 - return (B << (7 - bob)) & 255, bob - - for i in range(len(input)): - if input[i] >= 128: - try: - currB, bob = _get_7bits(currB, bob, _BAD[(input[i] >> 8) & 7], r) - except IndexError: - pass - currB, bob = _get_7bits(currB, bob, input[i] & 127, r) - else: - currB, bob = _get_7bits(currB, bob, input[i], r) - return "".join(map(chr, r)).rstrip("\0"), len(input) - - -add("base122", base122_encode, base122_decode, r"^base[-_]?122$", expansion_factor=1.085) -main122 = main(122, "", wrap=False) - +# -*- coding: UTF-8 -*- +"""Base122 Codec - base122 content encoding. 
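
For instance (an illustrative doctest added at edit time, mirroring the __examples__ entry below):

>>> import codext
>>> codext.encode("this is a test", "base122")
':\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft'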
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ._base import main +from ..__common__ import * + + +__examples__ = { + 'enc(base122|base-122)': { + 'this is a test': ":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", + b'This is another longer test string with d1g1t5 and sp3c141 characters !\n': \ + b"*\x1a\xca\x97\x19\x01Rs\x10\x18-f{QPe9\x08\xcb\x86{9Ne9\x08\x0eF+Mh 9]\x0e\xd3\x8b" + b"9N ;Z.FA\x01H13L.C)\x01Bn2\x08\x0e7\x01MF1\x1a\x0c$\x06\x1b!Br0XnF+If \x10B@" + }, + 'enc-dec(base_122)': ["@random"], +} + + +_BAD = [0, 10, 13, 34, 38, 92] +_i = lambda c: c if isinstance(c, int) else ord(c) + + +# inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js +def base122_encode(input, errors="strict"): + idx, bit, r, l = 0, 0, [], len(input) + + def _get_7bits(idx, bit): + if idx >= l: + return idx, bit, False + B1 = _i(input[idx]) + p1 = (((254 >> bit) & B1) << bit) >> 1 + bit += 7 + if bit < 8: + return idx, bit, p1 + bit -= 8 + idx += 1 + if idx >= l: + return idx, bit, p1 + B2 = _i(input[idx]) + p2 = (((65280 >> bit) & B2) & 255) >> (8 - bit) + return idx, bit, (p1 | p2) + + while True: + if idx >= l: + break + # get seven bits of input data + idx, bit, B = _get_7bits(idx, bit) + # check for illegal chars + try: + bad_idx = _BAD.index(B) + except ValueError: + r.append(B) + continue + idx, bit, nB = _get_7bits(idx, bit) + if nB is False: + nB, bad_idx = B, 7 + B1, B2 = 194, 128 + B1 |= (7 & bad_idx) << 2 + B1 |= int((nB & 64) > 0) + B2 |= nB & 63 + r.extend([B1, B2]) + return "".join(map(chr, r)).encode("latin-1"), len(input) + + +# inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js +def base122_decode(input, errors="strict"): + currB, bob, r, input = 0, 0, [], list(map(ord, input)) + + def _get_7bits(currB, bob, B, decoded): + B <<= 1 + currB |= (B % 0x100000000) >> bob + bob += 7 + if bob >= 8: + decoded += [currB] + bob -= 8 + return (B << (7 - bob)) & 255, bob + + for i in range(len(input)): + if input[i] >= 128: + try: + currB, bob = _get_7bits(currB, bob, _BAD[(input[i] >> 8) & 7], r) + except IndexError: + pass + currB, bob = _get_7bits(currB, bob, input[i] & 127, r) + else: + currB, bob = _get_7bits(currB, bob, input[i], r) + return "".join(map(chr, r)).rstrip("\0"), len(input) + + +add("base122", base122_encode, base122_decode, r"^base[-_]?122$", expansion_factor=1.085) +main122 = main(122, "", wrap=False) + diff --git a/src/codext/base/base85.py b/src/codext/base/base85.py index bc6d8b2..22aad28 100755 --- a/src/codext/base/base85.py +++ b/src/codext/base/base85.py @@ -1,186 +1,185 @@ -# -*- coding: UTF-8 -*- -"""Base85 Codec - base85 content encoding. - -This is a simple wrapper for adding base64.b85**code to the codecs. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import base64 -from six import integer_types - -from ._base import _get_charset, digits, lower, main, upper -from ..__common__ import * - - -__examples__ = { - 'enc-dec(base85|z85|base85-ipv6)': ["@random{512,1024,2048}"], - 'enc-dec(base85-btoa|base85-xbtoa)': ["@random{512,1024,2048}"], - 'enc(base85|ascii85)': {'this is a test': "FD,B0+DGm>@3BZ'F*%"}, - 'enc(base85-adobe)': {'this is a test': "<~FD,B0+DGm>@3BZ'F*%~>", - 'this is a test\0\0\0\0\0\0': "<~FD,B0+DGm>@3BZ'F*%B^z~>"}, - 'enc(z85|base85-z)': {'this is a test': "BzbxfazC)tvixV6B94"}, - 'enc(base85-ipv6|base85_rfc1924)': {'this is a test': "bZBXFAZc?TVIXv6b94"}, - 'enc(base85_btoa)': {'this is a test': "FD,B0+DGm>@3BZ'F*%B^"}, - 'enc(base85_btoa)': {'this\0\0\0\0test': "FD,B0+DGm>@3BZ'F*%B^"}, - 'enc(base85_btoa)': {'this is a test\0\0\0\0': "FD,B0+DGm>y@3BZ'F*%B^z"}, - 'enc(base85-xbtoa)': {'this is a test': "xbtoa Begin\nFD,B0+DGm>@3BZ'F*%B^\nxbtoa End N 14 e E 4b" \ - " S 523 R 1b132e"}, - 'dec(base85-xbtoa)': {'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End': None, - 'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End N 14 e E 4b S 523 R 000bad': - None}, - 'enc(base85-xml)': {'this is a test': "bZBXFAZc@TVIXv6b94"}, - 'enc(base85|ascii85)': {'this\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0test': "FD,B0zzz!!!\"@ATMq"}, -} -__guess__ = ["ascii85", "z85", "base85-ipv6", "base85-xml", "base85-adobe", "base85-xbtoa"] - - -B85 = { - r'(base[-_]?85([-_]ascii)?|ascii85)$': "!\"#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_`" + lower[:21], - r'(z85|base[-_]?85[-_]z(eromq)?)$': digits + lower + upper + ".-:+=^!/*?&<>()[]{}@%$#", - r'base[-_]?85[-_](rfc1924|ipv6)$': digits + upper + lower + "!#$%&()*+-;<=>?@^_`{|}~", - r'base[-_]?85[-_]xml$': digits + upper + lower[:-1] + "!#$()*+,-./:;=?@^`{|}~z_", -} -B85[r'(base[-_]?85[-_]adobe)$'] = B85[r'(base[-_]?85[-_]x?btoa)$'] = B85[r'(base[-_]?85([-_]ascii)?|ascii85)$'] -POW85 = [85 ** i for i in range(5)] - - -def __format(text, mode, decode=False, **kwargs): - if "adobe" in mode: - if decode: - if text.startswith("<~") and text.endswith("~>"): - text = text[2:-2] - else: - text = "<~" + text + "~>" - elif "xbtoa" in mode: - sp, ep = "xbtoa [bB]egin\n", "xbtoa [eE]nd" - if decode: - if re.match(r"^xbtoa\s+[bB]egin\n", text) and \ - re.search(r"\nxbtoa\s+[eE]nd N \d+{h} E{h} S{h} R{h}\s*$".format(h=" [0-9a-fA-F]+"), text): - text = "".join(text.split("\n")[1:-1]).replace(" ", "") - elif not decode: - l, t = kwargs['length'], "\n".join(text[i:i+78] for i in range(0, len(text), 78)) - text = "xbtoa Begin\n%s\nxbtoa End N %d %x E %x S %x R %x" % \ - (t, l, l, kwargs['c_xor'], kwargs['c_sum'], kwargs['c_rot']) - return text - - -def __xbtoa_values(text): - try: - hr = "[0-9a-fA-F]+" - return re.search(r"\nxbtoa\s+[eE]nd N (\d+) ({h}) E ({h}) S ({h}) R ({h})\s*$".format(h=hr), text).groups() - except: - raise Base85DecodeError("Bad or missing xbtoa parameters") - - -def base85_encode(mode): - b85 = _get_charset(B85, mode) - def encode(input, errors="strict"): - r, l, kw = "", len(input), {} - if l == 0: - return input, 0 - if "xbtoa" in mode: - kw['length'] = l - kw['c_xor'], kw['c_sum'], kw['c_rot'] = 0, 0, 0 - n_pad = (4 - l % 4) % 4 - for i in range(0, l, 4): - block = input[i:i+4] - if block == "\0\0\0\0" and b85[-3:] == "stu": - r += "z" - if block == "\x20\x20\x20\x20" and "btoa" in mode: - r += "y" - if "xbtoa" in 
mode: - for c in block: - k = ord(c) - kw['c_xor'] ^= k - kw['c_sum'] += k + 1 - kw['c_rot'] <<= 1 - if kw['c_rot'] & 0x80000000: - kw['c_rot'] += 1 - kw['c_rot'] += k - if block == "\0\0\0\0" and b85[-3:] == "stu" or block == "\x20\x20\x20\x20" and "btoa" in mode: - continue - if len(block) < 4: - block += n_pad * "\0" - n, bl = s2i(block), "" - for _ in range(5): - n, k = divmod(n, 85) - bl = b85[k] + bl - r += bl - if "btoa" not in mode and n_pad: - r = r[:-n_pad] - if b85[-3:] == "stu" and r[-5:] == "!!!!!": - r = r[:-5] + "z" - return __format(r, mode, **kw), l - return encode - - -def base85_decode(mode): - b85 = _get_charset(B85, mode) - def decode(input, errors="strict"): - r, l, i, n_pad = "", len(input), 0, 0 - if l == 0: - return input, 0 - if "xbtoa" in mode: - v = __xbtoa_values(input) - n_last = int(v[0]) % 4 - c_xor, c_sum, c_rot = 0, 0, 0 - input = __format(input, mode, True) - ehandler = handle_error("base85", errors, decode=True) - if b85[-3:] == "stu" and input[-1] == "z": - input = input[:-1] + "!!!!!" - l = len(input) - while i < l: - n, incr = 0, 5 - if input[i] == "z" and b85[-3:] == "stu": - bl, incr = "\0\0\0\0", 1 - elif input[i] == "y" and "btoa" in mode: - bl, incr = "\x20\x20\x20\x20", 1 - else: - block = input[i:i+5] - if len(block) < 5: - n_pad = 5 - len(block) % 5 - block += n_pad * "\0" - for k, c in enumerate(block[::-1]): - try: - n += (b85.index(c) if c != "\0" else 255) * POW85[k] - except ValueError: - r += ehandler(c, i + k, r) - bl = codecs.decode("{:0>8}".format(hex(n & 0xffffffff)[2:]), "hex") - if "xbtoa" in mode: - if i + 5 == l and n_last > 0: - bl = bl[:n_last] - for c in bl: - k = ord(c) - c_xor ^= k - c_sum += k + 1 - c_rot <<= 1 - if c_rot & 0x80000000: - c_rot += 1 - c_rot += k - r += bl - i += incr - if n_pad > 0: - r = r[:-n_pad] - if "xbtoa" in mode: - chkv = ["%d" % len(r), "%x" % len(r), "%x" % c_xor, "%x" % c_sum, "%x" % c_rot] - if any(v1 != v2 for v1, v2 in zip(v, chkv)) and errors == "strict": - raise Base85ValueError("A check value does not match (%s != %s)" % (str(list(v)).replace("'", ""), - str(chkv).replace("'", ""))) - return r, l - return decode - - -add("base85", base85_encode, base85_decode, expansion_factor=lambda f, ename: f if "xbtoa" in ename else 1.25, - pattern=r"^(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)$", - extra_exceptions=["Base85ValueError"]) -main85 = main(85, None) -main85adobe = main(85, None, "adobe") -main85xbtoa = main(85, None, "xbtoa", wrap=False) -main85rfc1924 = main(85, "RFC 1924", "ipv6") -main85xml = main(85, "", "xml") -main85zeromq = main(85, "", "zeromq") - +# -*- coding: UTF-8 -*- +"""Base85 Codec - base85 content encoding. + +This is a simple wrapper for adding base64.b85**code to the codecs. 
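As a worked example of the arithmetic implemented below: base85 consumes 4-byte blocks, reads each as a 32-bit big-endian integer, and emits five digits in base 85 drawn from the active charset. For the ascii85 alphabet, which is the contiguous run starting at '!' (so digit 0 is ord 33):

    block = b"this"
    n = int.from_bytes(block, "big")     # 1952999795
    digits = ""
    for _ in range(5):
        n, d = divmod(n, 85)
        digits = chr(d + 33) + digits    # ascii85: '!' (ord 33) encodes 0
    print(digits)                        # FD,B0

which matches the first five characters of the 'this is a test' vector in __examples__.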
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import base64 + +from ._base import _get_charset, digits, lower, main, upper +from ..__common__ import * + + +__examples__ = { + 'enc-dec(base85|z85|base85-ipv6)': ["@random{512,1024,2048}"], + 'enc-dec(base85-btoa|base85-xbtoa)': ["@random{512,1024,2048}"], + 'enc(base85|ascii85)': {'this is a test': "FD,B0+DGm>@3BZ'F*%"}, + 'enc(base85-adobe)': {'this is a test': "<~FD,B0+DGm>@3BZ'F*%~>", + 'this is a test\0\0\0\0\0\0': "<~FD,B0+DGm>@3BZ'F*%B^z~>"}, + 'enc(z85|base85-z)': {'this is a test': "BzbxfazC)tvixV6B94"}, + 'enc(base85-ipv6|base85_rfc1924)': {'this is a test': "bZBXFAZc?TVIXv6b94"}, + 'enc(base85_btoa)': {'this is a test': "FD,B0+DGm>@3BZ'F*%B^"}, + 'enc(base85_btoa)': {'this\0\0\0\0test': "FD,B0+DGm>@3BZ'F*%B^"}, + 'enc(base85_btoa)': {'this is a test\0\0\0\0': "FD,B0+DGm>y@3BZ'F*%B^z"}, + 'enc(base85-xbtoa)': {'this is a test': "xbtoa Begin\nFD,B0+DGm>@3BZ'F*%B^\nxbtoa End N 14 e E 4b" \ + " S 523 R 1b132e"}, + 'dec(base85-xbtoa)': {'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End': None, + 'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End N 14 e E 4b S 523 R 000bad': + None}, + 'enc(base85-xml)': {'this is a test': "bZBXFAZc@TVIXv6b94"}, + 'enc(base85|ascii85)': {'this\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0test': "FD,B0zzz!!!\"@ATMq"}, +} +__guess__ = ["ascii85", "z85", "base85-ipv6", "base85-xml", "base85-adobe", "base85-xbtoa"] + + +B85 = { + r'(base[-_]?85([-_]ascii)?|ascii85)$': "!\"#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_`" + lower[:21], + r'(z85|base[-_]?85[-_]z(eromq)?)$': digits + lower + upper + ".-:+=^!/*?&<>()[]{}@%$#", + r'base[-_]?85[-_](rfc1924|ipv6)$': digits + upper + lower + "!#$%&()*+-;<=>?@^_`{|}~", + r'base[-_]?85[-_]xml$': digits + upper + lower[:-1] + "!#$()*+,-./:;=?@^`{|}~z_", +} +B85[r'(base[-_]?85[-_]adobe)$'] = B85[r'(base[-_]?85[-_]x?btoa)$'] = B85[r'(base[-_]?85([-_]ascii)?|ascii85)$'] +POW85 = [85 ** i for i in range(5)] + + +def __format(text, mode, decode=False, **kwargs): + if "adobe" in mode: + if decode: + if text.startswith("<~") and text.endswith("~>"): + text = text[2:-2] + else: + text = "<~" + text + "~>" + elif "xbtoa" in mode: + sp, ep = "xbtoa [bB]egin\n", "xbtoa [eE]nd" + if decode: + if re.match(r"^xbtoa\s+[bB]egin\n", text) and \ + re.search(r"\nxbtoa\s+[eE]nd N \d+{h} E{h} S{h} R{h}\s*$".format(h=" [0-9a-fA-F]+"), text): + text = "".join(text.split("\n")[1:-1]).replace(" ", "") + elif not decode: + l, t = kwargs['length'], "\n".join(text[i:i+78] for i in range(0, len(text), 78)) + text = "xbtoa Begin\n%s\nxbtoa End N %d %x E %x S %x R %x" % \ + (t, l, l, kwargs['c_xor'], kwargs['c_sum'], kwargs['c_rot']) + return text + + +def __xbtoa_values(text): + try: + hr = "[0-9a-fA-F]+" + return re.search(r"\nxbtoa\s+[eE]nd N (\d+) ({h}) E ({h}) S ({h}) R ({h})\s*$".format(h=hr), text).groups() + except: + raise Base85DecodeError("Bad or missing xbtoa parameters") + + +def base85_encode(mode): + b85 = _get_charset(B85, mode) + def encode(input, errors="strict"): + r, l, kw = "", len(input), {} + if l == 0: + return input, 0 + if "xbtoa" in mode: + kw['length'] = l + kw['c_xor'], kw['c_sum'], kw['c_rot'] = 0, 0, 0 + n_pad = (4 - l % 4) % 4 + for i in range(0, l, 4): + block = input[i:i+4] + if block == "\0\0\0\0" and b85[-3:] == "stu": + r += "z" + if block == "\x20\x20\x20\x20" and "btoa" in mode: + r += "y" + if "xbtoa" in mode: + for c in block: + k = 
ord(c) + kw['c_xor'] ^= k + kw['c_sum'] += k + 1 + kw['c_rot'] <<= 1 + if kw['c_rot'] & 0x80000000: + kw['c_rot'] += 1 + kw['c_rot'] += k + if block == "\0\0\0\0" and b85[-3:] == "stu" or block == "\x20\x20\x20\x20" and "btoa" in mode: + continue + if len(block) < 4: + block += n_pad * "\0" + n, bl = s2i(block), "" + for _ in range(5): + n, k = divmod(n, 85) + bl = b85[k] + bl + r += bl + if "btoa" not in mode and n_pad: + r = r[:-n_pad] + if b85[-3:] == "stu" and r[-5:] == "!!!!!": + r = r[:-5] + "z" + return __format(r, mode, **kw), l + return encode + + +def base85_decode(mode): + b85 = _get_charset(B85, mode) + def decode(input, errors="strict"): + r, l, i, n_pad = "", len(input), 0, 0 + if l == 0: + return input, 0 + if "xbtoa" in mode: + v = __xbtoa_values(input) + n_last = int(v[0]) % 4 + c_xor, c_sum, c_rot = 0, 0, 0 + input = __format(input, mode, True) + ehandler = handle_error("base85", errors, decode=True) + if b85[-3:] == "stu" and input[-1] == "z": + input = input[:-1] + "!!!!!" + l = len(input) + while i < l: + n, incr = 0, 5 + if input[i] == "z" and b85[-3:] == "stu": + bl, incr = "\0\0\0\0", 1 + elif input[i] == "y" and "btoa" in mode: + bl, incr = "\x20\x20\x20\x20", 1 + else: + block = input[i:i+5] + if len(block) < 5: + n_pad = 5 - len(block) % 5 + block += n_pad * "\0" + for k, c in enumerate(block[::-1]): + try: + n += (b85.index(c) if c != "\0" else 255) * POW85[k] + except ValueError: + r += ehandler(c, i + k, r) + bl = codecs.decode("{:0>8}".format(hex(n & 0xffffffff)[2:]), "hex") + if "xbtoa" in mode: + if i + 5 == l and n_last > 0: + bl = bl[:n_last] + for c in bl: + k = ord(c) + c_xor ^= k + c_sum += k + 1 + c_rot <<= 1 + if c_rot & 0x80000000: + c_rot += 1 + c_rot += k + r += bl + i += incr + if n_pad > 0: + r = r[:-n_pad] + if "xbtoa" in mode: + chkv = ["%d" % len(r), "%x" % len(r), "%x" % c_xor, "%x" % c_sum, "%x" % c_rot] + if any(v1 != v2 for v1, v2 in zip(v, chkv)) and errors == "strict": + raise Base85ValueError("A check value does not match (%s != %s)" % (str(list(v)).replace("'", ""), + str(chkv).replace("'", ""))) + return r, l + return decode + + +add("base85", base85_encode, base85_decode, expansion_factor=lambda f, ename: f if "xbtoa" in ename else 1.25, + pattern=r"^(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)$", + extra_exceptions=["Base85ValueError"]) +main85 = main(85, None) +main85adobe = main(85, None, "adobe") +main85xbtoa = main(85, None, "xbtoa", wrap=False) +main85rfc1924 = main(85, "RFC 1924", "ipv6") +main85xml = main(85, "", "xml") +main85zeromq = main(85, "", "zeromq") + diff --git a/src/codext/binary/baudot.py b/src/codext/binary/baudot.py index a57e1ea..1cdd111 100755 --- a/src/codext/binary/baudot.py +++ b/src/codext/binary/baudot.py @@ -1,295 +1,281 @@ -# -*- coding: UTF-8 -*- -"""Baudot Codec - baudot content conversion to HTML. 
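Context for the Baudot diff that follows: every character is a 5-bit group looked up in a letters or a figures alphabet, and two reserved groups switch between the alphabets mid-stream. A toy ITA2-US encoder showing just that mechanism (tables copied from the diff below, where \xff marks unused slots; no error handling, MSB bit order as per the codec's default):

    LETTERS = "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff"
    FIGURES = "\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff"

    def ita2(text):
        state, out = LETTERS, []
        for ch in text.upper():
            if ch not in state:                      # shift to the other alphabet
                state = FIGURES if state == LETTERS else LETTERS
                out.append("11011" if state == FIGURES else "11111")
            out.append(format(state.index(ch), "05b"))
        return " ".join(out)

    print(ita2("TEST 1234"))
    # 10000 00001 00101 10000 00100 11011 10111 10011 00001 01010

This reproduces the enc(baudot-spaced_ita2-us) vector; note that the space character exists in both alphabets, an ambiguity the real codec resolves with its state-tracking logic.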
-
-This codec:
-- en/decodes strings from str to str
-- en/decodes strings from bytes to bytes
-- decodes file content to str (read)
-- encodes file content from str to bytes (write)
-"""
-from ..__common__ import *
-
-
-__CODES = ["ccitt1", "ccitt2", "eu", "ita1", "ita2", "ita2_us"]
-if PY3:
-    __CODES.extend(["ita2_meteo", "mtk2", "murray", "uk"])
-__guess__ = ["baudot%s-{}-{}".format(x, y) for x in __CODES for y in ["lsb", "msb"]]
-__examples1__ = {
-    'enc(baudot-BAD_ALPHABET)': None,
-    'enc(baudot_ccitt2_lsb)': {'TEST 1234': "00001100001010000001001001101111101110011000001010"},
-    'enc(baudot-ita1)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"},
-    'enc(baudot_ita2_msb)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"},
-    'enc(baudot-ita2-us)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"},
-    'enc(baudot)': {'\x01\x02': None},
-    'enc(baudot_ccitt1-lsb)': {'TEST ': None},
-}
-__examples2__ = {
-    'enc(baudot_spaced-BAD_ALPHABET)': None,
-    'enc(baudot-spaced_ccitt2_lsb)': {'TEST 1234': "00001 10000 10100 00001 00100 11011 11101 11001 10000 01010"},
-    'enc(baudot_spaced-ita1)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 00101"},
-    'enc(baudot-spaced_ita2_msb)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"},
-    'enc(baudot_spaced-ita2-us)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"},
-}
-__examples3__ = {
-    'enc(baudot_tape-BAD_ALPHABET)': None,
-    'enc(baudot_tape-ita1)': {
-        'TEST 1234': "***.**\n* *. *\n   .* \n* *.  \n* *. *\n*  .  \n * .  \n   . *\n   .* \n  *.  \n  *. *",
-    },
-    'dec(baudot-tape_ita2)': {'BAD_HEADER\n .* \n': None},
-    'dec(baudot-tape_ita2-us)': {'***.**\nBAD_TAPE\n': None},
-    'dec(baudot_tape-ccitt1_lsb)': {'***.**\n .* \n* . *\n* . 
\n': None}, -} -if PY3: - __examples1__.update({ - 'enc(baudot_ccitt1_lsb)': {'TEST1234': "101010001010001101010100000100000100000100101"}, - 'enc(baudot-fr)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"}, - }) - __examples2__.update({ - 'enc(baudot-spaced_ccitt1_lsb)': {'TEST1234': "10101 00010 10001 10101 01000 00100 00010 00001 00101"}, - 'enc(baudot_spaced-fr)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 00101"}, - }) - - -PATTERN = r"^baudot%s([-_](?:ccitt1|ccitt2|eu|fr|ita1|ita2|ita2[-_](?:us" + (r"|meteo" if PY3 else r"") + r")" + \ - (r"|mtk2|murray|uk" if PY3 else r"") + r"|us_tty)(?:[-_](?:lsb|msb))?)?$" -# reserved character -RES_CHR = "\xff" - -# sources: -# - http://rabbit.eng.miami.edu/info/baudot.html -# - https://en.wikipedia.org/wiki/Baudot_code -# - https://fr.qwe.wiki/wiki/Baudot_code -# all alphabets consider MSB by default -# CCITT-1 original Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) -CCITT1 = [ - "00001", "00010", - "\x00\xff\xff\xffA-JKEXGM/ZHLYSBRUTCQIWFNOVDP", - "\x00\xff\xff\xff1.6(2\xff7)\xff:\xff=3\xff8-4\xff9/\xff?\xff£5'0+" if PY3 else \ - "\x00\xff\xff\xff1.6(2\xff7)\xff:\xff=3\xff8-4\xff9/\xff?\xff$5'0+", -] -# CCITT-2 revised Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) -CCITT2 = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - "\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", -] -# Original Baudot (French/European ; sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -BAUDOT = EU = FR = [ - "10000", "01000", - "\x00AEÉYUIO\xffJGHBCFD \nXZSTWV\x7fKMLRQNP" if PY3 else "\x00AEeYUIO\xffJGHBCFD \nXZSTWV\x7fKMLRQNP", - "\x0012&34°5 67h89f0\xff.,:;!?'\x7f()=-/\u2116%" if PY3 else "\x0012&34o5 67h89f0\xff.,:;!?'\x7f()=-/\xff%", -] -# International Telegraphic Alphabet 1 (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -ITA1 = [ - "10000", "01000", - "\x00AE\rYUIO\xffJGHBCFD \xffXZSTWV\x7fKMLRQNP", - "\x0012\r34\xff5 67+89\xff0\xff\n,:.\xff?'\x7f()=-/\xff%", -] -# International Telegraphic Alphabet 2 (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -ITA2 = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - "\x003\n- '87\r\x054\x07,!:(5+)2$6019?&\xff./=\xff", -] -# International Telegraphic Alphabet 2 - US TTY (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -ITA2_US = US_TTY = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - "\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", -] -# International Telegraphic Alphabet 2 - Meteo version (source: https://en.wikipedia.org/wiki/Baudot_code) -if PY3: - ITA2_METEO = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - "-3\n\u2191 \x0787\r\u21974\u2199\u29b7\u2192\u25ef\u21905+\u21962\u21936019\u2295\u2198\xff./\u29b6\xff", - ] -# Russian MTK-2 alphabet (source: https://fr.qwe.wiki/wiki/Baudot_code) -if PY3: - MTK2 = [ - "11111", "11011", - "\x00Е\n\xff СИУ\r\xffРЙНФЦКТЗЛВХЫПЯОБГ\xffМЬЖ\xff", - "\x003\n- '87\r\xff4Ю,Э:(5+)2Щ6019?Ш\xff./=\xff", - ] -# Murray code ; NB: not all fractions are supported (source: https://en.wikipedia.org/wiki/Baudot_code) -if PY3: - MURRAY = [ - "00100", "11011", - " E\xffA\xffSIU\nDRJNFCKTZLWHYPQOBF\xffMXV\x7f", - "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,£)*" if PY3 else \ - 
"\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,$)*", - ] -# English Baudot ; NB: not all fractions are supported (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -if PY3: - UK = [ - "10000", "01000", - "\x00AE/YUIO\xffJGHBCFD -XZSTWV\x7fKMLRQNP", - "\x0012\u215f34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/£+" if PY3 else \ - "\x0012\xff34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/$+", - ] - - -def _bits_from_tape(tape, trans={'*': "1", ' ': "0"}): - """ Converts a tape-like string with the given translation for ones and zeros to a series of bits. """ - bits = "" - trans_rev = {v: k for k, v in trans.items()} - for i, line in enumerate(tape.splitlines()): - if i == 0: - if line != trans_rev['1'] * 3 + "." + trans_rev['1'] * 2: - raise ValueError("Bad tape header '{}'".format(line)) - else: - line = line[:3] + line[4:] - if len(line) != 5: - raise ValueError("Bad tape line '{}'".format(line)) - bits += "".join(trans.get(c, "") for c in line) - return bits - - -def _bits_to_tape(bits, trans={'1': "*", '0': " "}): - """ Converts a series of bits to a tape-like string with the given translation for ones and zeros. """ - tape = [trans['1'] * 3 + "." + trans['1'] * 2] - for i in range(0, len(bits), 5): - group = "".join(trans[b] for b in bits[i:i+5]) - tape.append(group[:3] + "." + group[3:]) - return "\n".join(tape) - - -def _check_alphabet(alphabet): - """ Checks the length of letters and figures (must be 32 chars). """ - for chars in alphabet: - l = len(chars) - if l != 32: - raise ValueError("Bad length of alphabet (%d instead of 32)" % l) - - -def _handle_alphabet(alphabet): - """ Gets the given alphabet name and transforms it to its dictionary with letters and figures. """ - alphabet = (alphabet or "baudot").lower().replace("-", "_").strip("_") - if "_lsb" in alphabet: - alphabet = alphabet.replace("_lsb", "") - func = lambda x: x[::-1] - else: - alphabet = alphabet.replace("_msb", "") - func = lambda x: x - _ = globals()[alphabet.upper()] - st, a = _[:2], _[2:] - _check_alphabet(a) - alphabet = {n: {ch: bin(i)[2:].zfill(5) for i, ch in enumerate(alph) if ch != RES_CHR} for n, alph in \ - zip(["letters", "figures"], a)} - return alphabet, {'letters': st[0], 'figures': st[1]}, func - - -def baudot_encode(alphabet=None, spaced=False, tape=False): - ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "") - alphabet, states, func = _handle_alphabet(alphabet) - def encode(text, errors="strict"): - text = text.upper() - s, l, state, seen_states = "", len(b(text)), None, [] - for i, c in enumerate(text): - # if the state is undefined yet, find the relevant alphabet - if state is None: - bits= None - for st in states.keys(): - try: - bits = func(alphabet[st][c]) - state = st - if st not in seen_states: - seen_states.append(st) - break - except KeyError: - pass - if bits is None: - bits = handle_error(ename, errors, "?", 5)(c, i) - s += bits - # otherwise, handle state change (when the current alphabet does not contain the character to encode but the - # other alphabet does - else: - try: - s += func(alphabet[state][c]) - continue - except KeyError: - state = list(set(states.keys()) - {state})[0] - try: - s += func(states[state]) + func(alphabet[state][c]) - if state not in seen_states: - seen_states.append(state) - except KeyError as e: - state = list(set(states.keys()) - {state})[0] # reset the state - s += handle_error(ename, errors, "?", 5)(c, i) - # by default, if no state is specified, the encoded string is handled 
as letters ; so if figures are used only,
-        # it is necessary to include the groups of bits for figures at the beginning of the encoded string
-        s = (states['figures'] if seen_states == ["figures"] else "") + s
-        if spaced:
-            s = " ".join(s[i:i+5] for i in range(0, len(s), 5))
-        elif tape:
-            s = _bits_to_tape(s)
-        return s, l
-    return encode
-
-
-def baudot_decode(alphabet=None, spaced=False, tape=False):
-    ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "")
-    alphabet, states, func = _handle_alphabet(alphabet)
-    alphabet = {st: {v: k for k, v in alph.items()} for st, alph in alphabet.items()}
-    states = {v: k for k, v in states.items()}
-    def decode(text, errors="strict"):
-        s, l = "", len(b(text))
-        if spaced:
-            text = text.replace(" ", "")
-        elif tape:
-            text = _bits_from_tape(text)
-        # infer the starting state by searching for the first encountered groups of bits indicating a valid state ;
-        # by default, we assume letters
-        state = "letters"
-        for i in range(0, len(text), 5):
-            bits = func(text[i:i+5])
-            # the following code handles a possible ambiguity ; e.g. when letters have a group of bits matching
-            # a state change
-            if bits in states.keys():
-                error = False
-                # so, when we see the bits of a state, we parse previous groups in order to determine if they are valid
-                # groups in the corresponding state, that is, if no error occurs ; if an error occurs, then it is a
-                # valid state change and not simply a character, and we can set it as the starting state
-                for j in range(i-5, 0, -5):
-                    try:
-                        alphabet[states[bits]][text[j:j+5]]
-                    except KeyError:
-                        error = True
-                        break
-                if error:
-                    state = list(set(states.values()) - {states[bits]})[0]
-                break
-        # now parse the input text
-        for i in range(0, len(text), 5):
-            bits = func(text[i:i+5])
-            try:
-                s += alphabet[state][bits]
-            except KeyError:
-                if bits in states.keys() and states[bits] != state:
-                    state = states[bits]
-                else:
-                    s += handle_error(ename, errors, decode=True, item="group")(bits, i//5)
-        return s, l
-    return decode
-
-
-add("baudot", baudot_encode, baudot_decode, PATTERN % r"", examples=__examples1__, guess=[x % "" for x in __guess__],
-    entropy=1., printables_rate=1.)
-
-
-baudot_spaced_encode = lambda a: baudot_encode(a, spaced=True)
-baudot_spaced_decode = lambda a: baudot_decode(a, spaced=True)
-add("baudot-spaced", baudot_spaced_encode, baudot_spaced_decode, PATTERN % r"[-_]spaced", examples=__examples2__,
-    guess=[x % "-spaced" for x in __guess__], entropy=1.48, printables_rate=1.)
-
-
-baudot_tape_encode = lambda a: baudot_encode(a, tape=True)
-baudot_tape_decode = lambda a: baudot_decode(a, tape=True)
-add("baudot-tape", baudot_tape_encode, baudot_tape_decode, PATTERN % r"[-_]tape", examples=__examples3__,
-    guess=[x % "-tape" for x in __guess__], entropy=1.86, printables_rate=1.)
-
+# -*- coding: UTF-8 -*-
+"""Baudot Codec - baudot content encoding. 
+
+This codec:
+- en/decodes strings from str to str
+- en/decodes strings from bytes to bytes
+- decodes file content to str (read)
+- encodes file content from str to bytes (write)
+"""
+from ..__common__ import *
+
+
+__CODES = ["ccitt1", "ccitt2", "eu", "ita1", "ita2", "ita2_meteo", "ita2_us", "mtk2", "murray", "uk"]
+__guess__ = ["baudot%s-{}-{}".format(x, y) for x in __CODES for y in ["lsb", "msb"]]
+__examples1__ = {
+    'enc(baudot-BAD_ALPHABET)': None,
+    'enc(baudot_ccitt2_lsb)': {'TEST 1234': "00001100001010000001001001101111101110011000001010"},
+    'enc(baudot-ita1)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"},
+    'enc(baudot_ita2_msb)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"},
+    'enc(baudot-ita2-us)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"},
+    'enc(baudot)': {'\x01\x02': None},
+    'enc(baudot_ccitt1-lsb)': {'TEST ': None},
+    'enc(baudot_ccitt1_lsb)': {'TEST1234': "101010001010001101010100000100000100000100101"},
+    'enc(baudot-fr)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"},
+}
+__examples2__ = {
+    'enc(baudot_spaced-BAD_ALPHABET)': None,
+    'enc(baudot-spaced_ccitt2_lsb)': {'TEST 1234': "00001 10000 10100 00001 00100 11011 11101 11001 10000 01010"},
+    'enc(baudot_spaced-ita1)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 00101"},
+    'enc(baudot-spaced_ita2_msb)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"},
+    'enc(baudot_spaced-ita2-us)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"},
+    'enc(baudot-spaced_ccitt1_lsb)': {'TEST1234': "10101 00010 10001 10101 01000 00100 00010 00001 00101"},
+    'enc(baudot_spaced-fr)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 00101"},
+}
+__examples3__ = {
+    'enc(baudot_tape-BAD_ALPHABET)': None,
+    'enc(baudot_tape-ita1)': {
+        'TEST 1234': "***.**\n* *. *\n   .* \n* *.  \n* *. *\n*  .  \n * .  \n   . *\n   .* \n  *.  \n  *. *",
+    },
+    'dec(baudot-tape_ita2)': {'BAD_HEADER\n .* \n': None},
+    'dec(baudot-tape_ita2-us)': {'***.**\nBAD_TAPE\n': None},
+    'dec(baudot_tape-ccitt1_lsb)': {'***.**\n .* \n* . *\n* . 
\n': None}, +} + + +PATTERN = r"^baudot%s([-_](?:ccitt1|ccitt2|eu|fr|ita1|ita2|ita2[-_](?:us|meteo)|mtk2|murray|uk|us_tty)" + \ + r"(?:[-_](?:lsb|msb))?)?$" +# reserved character +RES_CHR = "\xff" + +# sources: +# - http://rabbit.eng.miami.edu/info/baudot.html +# - https://en.wikipedia.org/wiki/Baudot_code +# - https://fr.qwe.wiki/wiki/Baudot_code +# all alphabets consider MSB by default +# CCITT-1 original Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) +CCITT1 = [ + "00001", "00010", + "\x00\xff\xff\xffA-JKEXGM/ZHLYSBRUTCQIWFNOVDP", + "\x00\xff\xff\xff1.6(2\xff7)\xff:\xff=3\xff8-4\xff9/\xff?\xff£5'0+", +] +# CCITT-2 revised Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) +CCITT2 = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", +] +# Original Baudot (French/European ; sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +BAUDOT = EU = FR = [ + "10000", "01000", + "\x00AEÉYUIO\xffJGHBCFD \nXZSTWV\x7fKMLRQNP", + "\x0012&34°5 67h89f0\xff.,:;!?'\x7f()=-/\u2116%", +] +# International Telegraphic Alphabet 1 (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +ITA1 = [ + "10000", "01000", + "\x00AE\rYUIO\xffJGHBCFD \xffXZSTWV\x7fKMLRQNP", + "\x0012\r34\xff5 67+89\xff0\xff\n,:.\xff?'\x7f()=-/\xff%", +] +# International Telegraphic Alphabet 2 (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +ITA2 = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "\x003\n- '87\r\x054\x07,!:(5+)2$6019?&\xff./=\xff", +] +# International Telegraphic Alphabet 2 - US TTY (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +ITA2_US = US_TTY = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", +] +# International Telegraphic Alphabet 2 - Meteo version (source: https://en.wikipedia.org/wiki/Baudot_code) +ITA2_METEO = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "-3\n\u2191 \x0787\r\u21974\u2199\u29b7\u2192\u25ef\u21905+\u21962\u21936019\u2295\u2198\xff./\u29b6\xff", +] +# Russian MTK-2 alphabet (source: https://fr.qwe.wiki/wiki/Baudot_code) +MTK2 = [ + "11111", "11011", + "\x00Е\n\xff СИУ\r\xffРЙНФЦКТЗЛВХЫПЯОБГ\xffМЬЖ\xff", + "\x003\n- '87\r\xff4Ю,Э:(5+)2Щ6019?Ш\xff./=\xff", +] +# Murray code ; NB: not all fractions are supported (source: https://en.wikipedia.org/wiki/Baudot_code) +MURRAY = [ + "00100", "11011", + " E\xffA\xffSIU\nDRJNFCKTZLWHYPQOBF\xffMXV\x7f", + "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,£)*", +] +# English Baudot ; NB: not all fractions are supported (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +UK = [ + "10000", "01000", + "\x00AE/YUIO\xffJGHBCFD -XZSTWV\x7fKMLRQNP", + "\x0012\u215f34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/£+", +] + + +def _bits_from_tape(tape, trans={'*': "1", ' ': "0"}): + """ Converts a tape-like string with the given translation for ones and zeros to a series of bits. """ + bits = "" + trans_rev = {v: k for k, v in trans.items()} + for i, line in enumerate(tape.splitlines()): + if i == 0: + if line != trans_rev['1'] * 3 + "." 
+ trans_rev['1'] * 2: + raise ValueError("Bad tape header '{}'".format(line)) + else: + line = line[:3] + line[4:] + if len(line) != 5: + raise ValueError("Bad tape line '{}'".format(line)) + bits += "".join(trans.get(c, "") for c in line) + return bits + + +def _bits_to_tape(bits, trans={'1': "*", '0': " "}): + """ Converts a series of bits to a tape-like string with the given translation for ones and zeros. """ + tape = [trans['1'] * 3 + "." + trans['1'] * 2] + for i in range(0, len(bits), 5): + group = "".join(trans[b] for b in bits[i:i+5]) + tape.append(group[:3] + "." + group[3:]) + return "\n".join(tape) + + +def _check_alphabet(alphabet): + """ Checks the length of letters and figures (must be 32 chars). """ + for chars in alphabet: + l = len(chars) + if l != 32: + raise ValueError("Bad length of alphabet (%d instead of 32)" % l) + + +def _handle_alphabet(alphabet): + """ Gets the given alphabet name and transforms it to its dictionary with letters and figures. """ + alphabet = (alphabet or "baudot").lower().replace("-", "_").strip("_") + if "_lsb" in alphabet: + alphabet = alphabet.replace("_lsb", "") + func = lambda x: x[::-1] + else: + alphabet = alphabet.replace("_msb", "") + func = lambda x: x + _ = globals()[alphabet.upper()] + st, a = _[:2], _[2:] + _check_alphabet(a) + alphabet = {n: {ch: bin(i)[2:].zfill(5) for i, ch in enumerate(alph) if ch != RES_CHR} for n, alph in \ + zip(["letters", "figures"], a)} + return alphabet, {'letters': st[0], 'figures': st[1]}, func + + +def baudot_encode(alphabet=None, spaced=False, tape=False): + ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "") + alphabet, states, func = _handle_alphabet(alphabet) + def encode(text, errors="strict"): + text = text.upper() + s, l, state, seen_states = "", len(b(text)), None, [] + for i, c in enumerate(text): + # if the state is undefined yet, find the relevant alphabet + if state is None: + bits= None + for st in states.keys(): + try: + bits = func(alphabet[st][c]) + state = st + if st not in seen_states: + seen_states.append(st) + break + except KeyError: + pass + if bits is None: + bits = handle_error(ename, errors, "?", 5)(c, i) + s += bits + # otherwise, handle state change (when the current alphabet does not contain the character to encode but the + # other alphabet does + else: + try: + s += func(alphabet[state][c]) + continue + except KeyError: + state = list(set(states.keys()) - {state})[0] + try: + s += func(states[state]) + func(alphabet[state][c]) + if state not in seen_states: + seen_states.append(state) + except KeyError as e: + state = list(set(states.keys()) - {state})[0] # reset the state + s += handle_error(ename, errors, "?", 5)(c, i) + # by default, if no state is specified, the encoded string is handled as letters ; so if figures are used only, + # it is necessary to include the groups of bits for figures at the beginning of the encoded string + s = (states['figures'] if seen_states == ["figures"] else "") + s + if spaced: + s = " ".join(s[i:i+5] for i in range(0, len(s), 5)) + elif tape: + s = _bits_to_tape(s) + return s, l + return encode + + +def baudot_decode(alphabet=None, spaced=False, tape=False): + ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "") + alphabet, states, func = _handle_alphabet(alphabet) + alphabet = {st: {v: k for k, v in alph.items()} for st, alph in alphabet.items()} + states = {v: k for k, v in states.items()} + def decode(text, errors="strict"): + s, l = "", len(b(text)) + if spaced: + text = text.replace(" ", "") + elif 
tape: + text = _bits_from_tape(text) + # infer the starting state by searching for the first encountered groups of bits indicating a valid state ; + # by default, we assume letters + state = "letters" + for i in range(0, len(text), 5): + bits = func(text[i:i+5]) + # the following code handles a possible ambiguity ; e.g. when letters have a group of bits matching + # a state change + if bits in states.keys(): + error = False + # so, when we see the bits of a state, we parse previous groups in order to determine if they are valid + # groups in the corresponding state, that is, if no error occurs ; if an error occurs, then it is a + # valid state change and not simply a character, and we can set it as the starting state + for j in range(i-5, 0, -5): + try: + alphabet[states[bits]][text[j:j+5]] + except KeyError: + error = True + break + if error: + state = list(set(states.values()) - {states[bits]})[0] + break + # now parse the input text + for i in range(0, len(text), 5): + bits = func(text[i:i+5]) + try: + s += alphabet[state][bits] + except KeyError: + if bits in states.keys() and states[bits] != state: + state = states[bits] + else: + s += handle_error(ename, errors, decode=True, item="group")(bits, i//5) + return s, l + return decode + + +add("baudot", baudot_encode, baudot_decode, PATTERN % r"", examples=__examples1__, guess=[x % "" for x in __guess__], + entropy=1., printables_rate=1.) + + +baudot_spaced_encode = lambda a: baudot_encode(a, spaced=True) +baudot_spaced_decode = lambda a: baudot_decode(a, spaced=True) +add("baudot-spaced", baudot_spaced_encode, baudot_spaced_decode, PATTERN % r"[-_]spaced", examples=__examples2__, + guess=[x % "-spaced" for x in __guess__], entropy=1.48, printables_rate=1.) + + +baudot_tape_encode = lambda a: baudot_encode(a, tape=True) +baudot_tape_decode = lambda a: baudot_decode(a, tape=True) +add("baudot-tape", baudot_tape_encode, baudot_tape_decode, PATTERN % r"[-_]tape", examples=__examples3__, + guess=[x % "-tape" for x in __guess__], entropy=1.86, printables_rate=1.) + diff --git a/src/codext/binary/rotate.py b/src/codext/binary/rotate.py index 944e2b2..fb0c697 100755 --- a/src/codext/binary/rotate.py +++ b/src/codext/binary/rotate.py @@ -1,52 +1,51 @@ -# -*- coding: UTF-8 -*- -"""Rotate-Bits Codec - rotate-N-bits content encoding. 
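The rotate codec treats each byte independently: rotate-N is a right rotation of the byte's 8 bits, and rotate-left-N is its inverse. One byte worked out by hand (standalone sketch, not codext code):

    c = ord("T")                              # 0b01010100
    r = ((c >> 1) | (c << 7)) & 0xff          # right-rotate the 8 bits by 1
    print(format(c, "08b"), "->", format(r, "08b"), chr(r))
    # 01010100 -> 00101010 *

which is why the rotate1 vector for 'This is a test' starts with '*'.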
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(rotate-0|rotate-8|rotate-left-8)': None, - 'enc(rotate1|rotate-right-1|rotate_1)': {'This is a test': "*4\xb4\xb9\x10\xb4\xb9\x10\xb0\x10:\xb2\xb9:"}, - 'enc(rotate-left-1|rotate_left_1)': {'This is a test': "¨ÐÒæ@Òæ@Â@èÊæè"}, -} -__guess__ = ["rotate-%d" % i for i in range(1, 8)] + ["rotate-left-%d" % i for i in range(1, 8)] - - -if PY3: - def _getn(i): - m = 1 - if str(i).startswith("left"): - i = i[4:].lstrip("-_") - m = -1 - return m * int(i) - - - def _rotaten(text, n=1): - r = "" - for c in ensure_str(text): - b = bin(ord(c))[2:].zfill(8) - r += chr(int(b[-n:] + b[:-n], 2)) - return r - - - def rotate_encode(i): - def encode(text, errors="strict"): - return _rotaten(text, _getn(i)), len(text) - return encode - - - def rotate_decode(i): - def decode(text, errors="strict"): - return _rotaten(text, -_getn(i)), len(text) - return decode - - - add("rotate", rotate_encode, rotate_decode, r"rotate(?:[-_]?bits)?[-_]?((?:(?:left|right)[-_]?)?[1-7])$", - transitive=True) - +# -*- coding: UTF-8 -*- +"""Rotate-Bits Codec - rotate-N-bits content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(rotate-0|rotate-8|rotate-left-8)': None, + 'enc(rotate1|rotate-right-1|rotate_1)': {'This is a test': "*4\xb4\xb9\x10\xb4\xb9\x10\xb0\x10:\xb2\xb9:"}, + 'enc(rotate-left-1|rotate_left_1)': {'This is a test': "¨ÐÒæ@Òæ@Â@èÊæè"}, +} +__guess__ = ["rotate-%d" % i for i in range(1, 8)] + ["rotate-left-%d" % i for i in range(1, 8)] + + +def _getn(i): + m = 1 + if str(i).startswith("left"): + i = i[4:].lstrip("-_") + m = -1 + return m * int(i) + + +def _rotaten(text, n=1): + r = "" + for c in ensure_str(text): + b = bin(ord(c))[2:].zfill(8) + r += chr(int(b[-n:] + b[:-n], 2)) + return r + + +def rotate_encode(i): + def encode(text, errors="strict"): + return _rotaten(text, _getn(i)), len(text) + return encode + + +def rotate_decode(i): + def decode(text, errors="strict"): + return _rotaten(text, -_getn(i)), len(text) + return decode + + +add("rotate", rotate_encode, rotate_decode, r"rotate(?:[-_]?bits)?[-_]?((?:(?:left|right)[-_]?)?[1-7])$", + transitive=True) + diff --git a/src/codext/common/cases.py b/src/codext/common/cases.py index 8aa87e4..2f91ada 100644 --- a/src/codext/common/cases.py +++ b/src/codext/common/cases.py @@ -27,11 +27,12 @@ add("lowercase", lowercase, uppercase, r"^lower(?:case)?$", penalty=.2) slugify = lambda i, e="strict", d="-": (re.sub(r"[^0-9a-z]+", d, i.lower()).strip(d), len(i)) -add("slugify", lambda i, e="strict": slugify(i, e), None, r"^(?:slug(?:ify)?|kebab(?:[-_]?case)?)$") +add("slugify", lambda i, e="strict": slugify(i, e), None, r"^(?:slug(?:ify)?|(?:dash|kebab)(?:[-_]?case)?)$") add("snakecase", lambda i, e="strict": slugify(i, e, "_"), None, r"^snake(?:[-_]?case)?$") +add("screamingsnakecase", lambda i, e="strict": slugify(i, e, "_").upper(), None, r"^screaming[-_]snake(?:[-_]?case)?$") swapcase = lambda i, e="strict": (i.swapcase(), len(i)) -add("swapcase", swapcase, swapcase, r"^(?:swap(?:[-_]?case)?|invert(?:case)?)$", penalty=.2) +add("swapcase", swapcase, swapcase, r"^(?:(?:flip|swap)(?:[-_]?case)?|invert(?:case)?)$", penalty=.2) 
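These case codecs are one-line wrappers around str methods or a single regex; e.g. the slugify family above replaces every non-alphanumeric run with a delimiter, and the new screaming-snake variant just upper-cases the snake form:

    import re
    slug = lambda s, d="-": re.sub(r"[^0-9a-z]+", d, s.lower()).strip(d)
    print(slug("This is a test!"))                 # this-is-a-test
    print(slug("This is a test!", "_").upper())    # THIS_IS_A_TEST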
title = lambda i, e="strict": (i.title(), len(i))
untitle = lambda i, e="strict": (" ".join(w[0].lower() + w[1:] if len(w) > 0 else "" for w in i.split()), len(i))
diff --git a/src/codext/compressions/pkzip.py b/src/codext/compressions/pkzip.py
index 47d9cd5..35ec94e 100755
--- a/src/codext/compressions/pkzip.py
+++ b/src/codext/compressions/pkzip.py
@@ -1,56 +1,55 @@
-# -*- coding: UTF-8 -*-
-"""Pkzip Codec - pkzip content compression.
-
-NB: Not an encoding properly speaking.
-
-This codec:
-- en/decodes strings from str to str
-- en/decodes strings from bytes to bytes
-- decodes file content to str (read)
-- encodes file content from str to bytes (write)
-"""
-import zipfile
-
-from ..__common__ import *
-
-
-_str = ["test", "This is a test", "@random{512,1024,2048}"]
-__examples1__ = {'enc-dec(pkzip-deflate|deflate)': _str}
-__examples2__ = {'enc-dec(pkzip_bz2|bzip2)': _str}
-__examples3__ = {'enc-dec(pkzip-lzma|lzma)': _str}
-
-
-if PY3:
-    NULL = {
-        8: b"\x03\x00",
-        12: b"BZh9\x17rE8P\x90\x00\x00\x00\x00",
-        14: b"\t\x04\x05\x00]\x00\x00\x80\x00\x00\x83\xff\xfb\xff\xff\xc0\x00\x00\x00",
-    }
-
-
-    def pkzip_encode(compression_type):
-        def _encode(text, errors="strict"):
-            c = zipfile._get_compressor(compression_type)
-            return c.compress(b(text)) + c.flush(), len(text)
-        return _encode
-
-
-    def pkzip_decode(compression_type, name):
-        def _decode(data, errors="strict"):
-            d = zipfile._get_decompressor(compression_type)
-            r = d.decompress(b(data))
-            if len(r) == 0 and b(data) != NULL[compression_type]:
-                return handle_error(name, errors, decode=True)(data[0], 0) if len(data) > 0 else "", len(data)
-            return r, len(r)
-        return _decode
-
-
-    add("pkzip_deflate", pkzip_encode(8), pkzip_decode(8, "deflate"), r"(?:(?:pk)?zip[-_])?deflate",
-        examples=__examples1__, guess=["deflate"])
-
-    add("pkzip_bzip2", pkzip_encode(12), pkzip_decode(12, "bzip2"), r"(?:(?:pk)?zip[-_])?bz(?:ip)?2",
-        examples=__examples2__, guess=["bz2"])
-
-    add("pkzip_lzma", pkzip_encode(14), pkzip_decode(14, "lzma"), r"(?:(?:pk)?zip[-_])?lzma",
-        examples=__examples3__, guess=["lzma"])
-
+# -*- coding: UTF-8 -*-
+"""Pkzip Codec - pkzip content compression.
+
+NB: Not an encoding, strictly speaking. 
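The three codecs in this rewritten module lean on zipfile's private _get_compressor/_get_decompressor helpers. For the deflate case, the public zlib API produces the same raw stream; a sketch assuming only the stdlib (wbits=-15 means raw DEFLATE with no zlib header, which is what pkzip stores):

    import zlib
    c = zlib.compressobj(9, zlib.DEFLATED, -15)    # raw DEFLATE, no header
    blob = c.compress(b"test") + c.flush()
    print(zlib.decompress(blob, -15))              # b'test'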
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import zipfile + +from ..__common__ import * + + +_str = ["test", "This is a test", "@random{512,1024,2048}"] +__examples1__ = {'enc-dec(pkzip-deflate|deflate)': _str} +__examples2__ = {'enc-dec(pkzip_bz2|bzip2)': _str} +__examples3__ = {'enc-dec(pkzip-lzma|lzma)': _str} + + +NULL = { + 8: b"\x03\x00", + 12: b"BZh9\x17rE8P\x90\x00\x00\x00\x00", + 14: b"\t\x04\x05\x00]\x00\x00\x80\x00\x00\x83\xff\xfb\xff\xff\xc0\x00\x00\x00", +} + + +def pkzip_encode(compression_type): + def _encode(text, errors="strict"): + c = zipfile._get_compressor(compression_type) + return c.compress(b(text)) + c.flush(), len(text) + return _encode + + +def pkzip_decode(compression_type, name): + def _decode(data, errors="strict"): + d = zipfile._get_decompressor(compression_type) + r = d.decompress(b(data)) + if len(r) == 0 and b(data) != NULL[compression_type]: + return handle_error(name, errors, decode=True)(data[0], 0) if len(data) > 0 else "", len(data) + return r, len(r) + return _decode + + +add("pkzip_deflate", pkzip_encode(8), pkzip_decode(8, "deflate"), r"(?:(?:pk)?zip[-_])?deflate", + examples=__examples1__, guess=["deflate"]) + +add("pkzip_bzip2", pkzip_encode(12), pkzip_decode(12, "bzip2"), r"(?:(?:pk)?zip[-_])?bz(?:ip)?2", + examples=__examples2__, guess=["bz2"]) + +add("pkzip_lzma", pkzip_encode(14), pkzip_decode(14, "lzma"), r"(?:(?:pk)?zip[-_])?lzma", + examples=__examples3__, guess=["lzma"]) + diff --git a/src/codext/crypto/railfence.py b/src/codext/crypto/railfence.py index 3d150c0..a25f27a 100644 --- a/src/codext/crypto/railfence.py +++ b/src/codext/crypto/railfence.py @@ -1,96 +1,96 @@ -# -*- coding: UTF-8 -*- -"""Rail Fence Cipher Codec - rail fence content encoding. 
-
-This codec:
-- en/decodes strings from str to str
-- en/decodes strings from bytes to bytes
-- decodes file content to str (read)
-- encodes file content from str to bytes (write)
-"""
-from ..__common__ import *
-
-
-__examples__ = {
-    'enc(rail_123|rail-2-123)': {'this is a test': None},
-    'enc(railfence|zigzag)': {'this is a test': "t ashsi  etist"},
-    'enc(rail-5|zigzag_5)': {'this is a test': "tah  istsiet s"},
-    'enc(rail_5-3|rail_5_3)': {'this is a test': "it sss etiath "},
-    'enc(rail-5-3-up|rail_5_3-up)': {'this is a test': "h tiats e ssit"},
-    'enc(rail-7-4|rail_7_4)': {'this is a test': "a  stiet shsti"},
-    'dec(zigzag)': {'': ""},
-}
-__guess__ = ["railfence-%d" % i for i in range(1, 11)] + ["railfence-%d-up" % i for i in range(1, 11)]
-
-
-def __build(text, rails, offset, up):
-    l, rail = len(text), offset
-    # set the starting rail and direction
-    if up:
-        dr = -1
-        rail = rails - offset - 1
-    else:
-        dr = 1
-    # create rails
-    f = [[None] * l for i in range(rails)]
-    # now zig-zag between rails
-    for x in range(l):
-        f[rail][x] = text[x]
-        if rail >= rails - 1:
-            dr = -1
-        elif rail <= 0:
-            dr = 1
-        rail += dr
-    return f
-
-
-def __check(length, rails, offset):
-    if rails > length:
-        raise ParameterError("Bad parameter for encoding 'railfence': rails=%d (should be >%d)" % (rails, length))
-    if offset > rails:
-        raise ParameterError("Bad parameter for encoding 'railfence': offset=%d (should be >%d)" % (offset, rails))
-
-
-def railfence_encode(rails, offset, up):
-    rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != ""
-    def encode(text, errors="strict"):
-        r, l = "", len(text)
-        __check(l, rails, offset)
-        f = __build(text, rails, offset, up)
-        for rail in range(rails):
-            for x in range(l):
-                if f[rail][x] is not None:
-                    r += f[rail][x]
-        return r, l
-    return encode
-
-
-def railfence_decode(rails, offset, up):
-    rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != ""
-    def decode(text, errors="strict"):
-        # this if block is particularly useful with Python2 ; see codecs.py at line 492 in comparison with codecs.py
-        # from Python3 at line 501: in Python2, a last block can be read while empty while in Python3 not
-        # as a consequence, in Python2, an error is triggered as an empty text cannot be decoded with Rail Fence with
-        # a rails parameter > 0 (see the __check(length, rails, offset)) function
-        if text == "":
-            return "", 0
-        r, i, l = "", 0, len(text)
-        __check(l, rails, offset)
-        f = __build("." * len(text), rails, offset, up)
-        # put the characters in the right place
-        for rail in range(rails):
-            for x in range(l):
-                if f[rail][x] == ".":
-                    f[rail][x] = text[i]
-                    i += 1
-        # read the characters in the right order
-        for x in range(l):
-            for rail in range(rails):
-                if f[rail][x] is not None:
-                    r += f[rail][x]
-        return r, len(text)
-    return decode
-
-
-add("railfence", railfence_encode, railfence_decode,
-    r"^(?:rail(?:[-_]?fence)?|zigzag)(?:[-_]([1-9]|[1-9]\d+)(?:[-_]([0-9]|[1-9]\d+))?(?:[-_](up))?)?$")
-
+# -*- coding: UTF-8 -*-
+"""Rail Fence Cipher Codec - rail fence content encoding. 
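For intuition before the implementation: the cipher writes the plaintext along a zig-zag over N rails and then reads the rails row by row. A compact sketch for the default 3 rails with no offset (standalone, not codext code):

    text, rails = "this is a test", 3
    fence, rail, step = [[] for _ in range(rails)], 0, 1
    for ch in text:
        fence[rail].append(ch)
        step = 1 if rail == 0 else -1 if rail == rails - 1 else step
        rail += step
    print("".join(ch for row in fence for ch in row))   # t ashsi  etist

matching the enc(railfence|zigzag) test vector (note the double space).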
+
+This codec:
+- en/decodes strings from str to str
+- en/decodes strings from bytes to bytes
+- decodes file content to str (read)
+- encodes file content from str to bytes (write)
+"""
+from ..__common__ import *
+
+
+__examples__ = {
+    'enc(rail_123|rail-2-123)': {'this is a test': None},
+    'enc(railfence|zigzag)': {'this is a test': "t ashsi  etist"},
+    'enc(rail-5|zigzag_5)': {'this is a test': "tah  istsiet s"},
+    'enc(rail_5-3|rail_5_3)': {'this is a test': "it sss etiath "},
+    'enc(rail-5-3-up|rail_5_3-up)': {'this is a test': "h tiats e ssit"},
+    'enc(rail-7-4|rail_7_4)': {'this is a test': "a  stiet shsti"},
+    'dec(zigzag)': {'': ""},
+}
+__guess__ = ["railfence-%d" % i for i in range(1, 11)] + ["railfence-%d-up" % i for i in range(1, 11)]
+
+
+def __build(text, rails, offset, up):
+    l, rail = len(text), offset
+    # set the starting rail and direction
+    if up:
+        dr = -1
+        rail = rails - offset - 1
+    else:
+        dr = 1
+    # create rails
+    f = [[None] * l for i in range(rails)]
+    # now zig-zag between rails
+    for x in range(l):
+        f[rail][x] = text[x]
+        if rail >= rails - 1:
+            dr = -1
+        elif rail <= 0:
+            dr = 1
+        rail += dr
+    return f
+
+
+def __check(length, rails, offset):
+    if rails > length:
+        raise ParameterError("Bad parameter for encoding 'railfence': rails=%d (should be <= %d)" % (rails, length))
+    if offset > rails:
+        raise ParameterError("Bad parameter for encoding 'railfence': offset=%d (should be <= %d)" % (offset, rails))
+
+
+def railfence_encode(rails, offset, up):
+    rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != ""
+    def encode(text, errors="strict"):
+        r, l = "", len(text)
+        __check(l, rails, offset)
+        f = __build(text, rails, offset, up)
+        for rail in range(rails):
+            for x in range(l):
+                if f[rail][x] is not None:
+                    r += f[rail][x]
+        return r, l
+    return encode
+
+
+def railfence_decode(rails, offset, up):
+    rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != ""
+    def decode(text, errors="strict"):
+        # this if block is particularly useful with Python2 ; see codecs.py at line 492 in comparison with codecs.py
+        # from Python3 at line 501: in Python2, a last block can be read while empty while in Python3 not
+        # as a consequence, in Python2, an error is triggered as an empty text cannot be decoded with Rail Fence with
+        # a rails parameter > 0 (see the __check(length, rails, offset)) function
+        if text == "":
+            return "", 0
+        r, i, l = "", 0, len(text)
+        __check(l, rails, offset)
+        f = __build("." 
* len(text), rails, offset, up)
+        # put the characters in the right place
+        for rail in range(rails):
+            for x in range(l):
+                if f[rail][x] == ".":
+                    f[rail][x] = text[i]
+                    i += 1
+        # read the characters in the right order
+        for x in range(l):
+            for rail in range(rails):
+                if f[rail][x] is not None:
+                    r += f[rail][x]
+        return r, len(text)
+    return decode
+
+
+add("railfence", railfence_encode, railfence_decode,
+    r"^(?:rail(?:[-_]?fence)?|zigzag)(?:[-_]([1-9]|[1-9]\d+)(?:[-_]([0-9]|[1-9]\d+))?(?:[-_](up))?)?$")
+
diff --git a/src/codext/hashing/blake.py b/src/codext/hashing/blake.py
index 2fad090..6656c46 100644
--- a/src/codext/hashing/blake.py
+++ b/src/codext/hashing/blake.py
@@ -8,20 +8,18 @@
 - transform strings from bytes to bytes
 - transform file content from str to bytes (write)
 """
-import hashlib
+from ..__common__ import *
 
-from ..__common__ import add, b, PY3
 
+def blake_hash(c):
+    def _hash_transform(l):
+        l = (l or ("64" if c == "b" else "32")).lstrip("_-")
+        def _encode(data, error="strict"):
+            return getattr(hashlib, "blake2%s" % c)(b(data), digest_size=int(l)).hexdigest(), len(data)
+        return _encode
+    return _hash_transform
 
-if PY3:
-    def blake_hash(c):
-        def _hash_transform(l):
-            l = (l or "64" if c == "b" else "32").lstrip("_-")
-            def _encode(data, error="strict"):
-                return getattr(hashlib, "blake2%s" % c)(b(data), digest_size=int(l)).hexdigest(), len(data)
-            return _encode
-        return _hash_transform
 
-    add("blake2b", blake_hash("b"), pattern=r"^blake2b(|[-_](?:[1-9]|[1-5]\d|6[0-4]))$", guess=None)
-    add("blake2s", blake_hash("s"), pattern=r"^blake2s(|[-_](?:[1-9]|[1-2]\d|3[0-2]))$", guess=None)
+add("blake2b", blake_hash("b"), pattern=r"^blake2b(|[-_](?:[1-9]|[1-5]\d|6[0-4]))$", guess=None)
+add("blake2s", blake_hash("s"), pattern=r"^blake2s(|[-_](?:[1-9]|[1-2]\d|3[0-2]))$", guess=None)
diff --git a/src/codext/hashing/crypt.py b/src/codext/hashing/crypt.py
index caf8290..0d44d8e 100644
--- a/src/codext/hashing/crypt.py
+++ b/src/codext/hashing/crypt.py
@@ -8,10 +8,10 @@
 - transform strings from bytes to bytes
 - transform file content from str to bytes (write)
 """
-from ..__common__ import add, ensure_str, PY3, UNIX
+from ..__common__ import add, ensure_str, UNIX
 
 
-if PY3 and UNIX:
+if UNIX:
     import crypt
     METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")]
diff --git a/src/codext/hashing/md.py b/src/codext/hashing/md.py
index 6463722..521a01c 100644
--- a/src/codext/hashing/md.py
+++ b/src/codext/hashing/md.py
@@ -8,9 +8,7 @@
 - transform strings from bytes to bytes
 - transform file content from str to bytes (write)
 """
-import hashlib
-
-from ..__common__ import add, b
+from ..__common__ import *
 
 
 MD2_TABLE = [41, 46, 67, 201, 162, 216, 124, 1, 61, 54, 84, 161, 236, 240, 6, 19, 98, 167, 5, 243, 192, 199, 115, 140,
diff --git a/src/codext/hashing/sha.py b/src/codext/hashing/sha.py
index dd94002..1351fe8 100644
--- a/src/codext/hashing/sha.py
+++ b/src/codext/hashing/sha.py
@@ -8,9 +8,7 @@
 - transform strings from bytes to bytes
 - transform file content from str to bytes (write)
 """
-import hashlib
-
-from ..__common__ import add, b, PY3
+from ..__common__ import *
 
 
 add("sha1", lambda s, error="strict": (hashlib.sha1(b(s)).hexdigest(), len(s)), guess=None)
@@ -18,15 +16,12 @@
 add("sha256", lambda s, error="strict": (hashlib.sha256(b(s)).hexdigest(), len(s)), guess=None)
 add("sha384", lambda s, error="strict": (hashlib.sha384(b(s)).hexdigest(), len(s)), guess=None)
 add("sha512", lambda s, error="strict": (hashlib.sha512(b(s)).hexdigest(), len(s)), guess=None)
- 
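These hash codecs are encode-only (guess=None, no decoder); each is a thin lambda over hashlib, so the sha3 registrations in this hunk behave exactly like:

    import hashlib
    print(hashlib.sha3_256(b"this is a test").hexdigest())

with the codec machinery only adding the name pattern and the (digest, length) return convention.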
- -if PY3: - add("sha3_224", lambda s, error="strict": (hashlib.sha3_224(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]224$", - guess=None) - add("sha3_256", lambda s, error="strict": (hashlib.sha3_256(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]256$", - guess=None) - add("sha3_384", lambda s, error="strict": (hashlib.sha3_384(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]384$", - guess=None) - add("sha3_512", lambda s, error="strict": (hashlib.sha3_512(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]512$", - guess=None) +add("sha3_224", lambda s, error="strict": (hashlib.sha3_224(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]224$", + guess=None) +add("sha3_256", lambda s, error="strict": (hashlib.sha3_256(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]256$", + guess=None) +add("sha3_384", lambda s, error="strict": (hashlib.sha3_384(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]384$", + guess=None) +add("sha3_512", lambda s, error="strict": (hashlib.sha3_512(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]512$", + guess=None) diff --git a/src/codext/hashing/shake.py b/src/codext/hashing/shake.py index af79dce..22c7b99 100644 --- a/src/codext/hashing/shake.py +++ b/src/codext/hashing/shake.py @@ -8,20 +8,18 @@ - transform strings from bytes to bytes - transform file content from str to bytes (write) """ -import hashlib +from ..__common__ import * -from ..__common__ import add, b, PY3 +def shake_hash(i): + def _hash_transform(l): + l = (l or str(i)).lstrip("_-") + def _encode(data, error="strict"): + return getattr(hashlib, "shake_%d" % i)(b(data)).hexdigest(int(l)), len(data) + return _encode + return _hash_transform -if PY3: - def shake_hash(i): - def _hash_transform(l): - l = (l or str(i)).lstrip("_-") - def _encode(data, error="strict"): - return getattr(hashlib, "shake_%d" % i)(b(data)).hexdigest(int(l)), len(data) - return _encode - return _hash_transform - add("shake_128", shake_hash(128), pattern=r"^shake[-_]?128(|[-_][1-9]\d*)$", guess=None) - add("shake_256", shake_hash(256), pattern=r"^shake[-_]?256(|[-_][1-9]\d*)$", guess=None) +add("shake_128", shake_hash(128), pattern=r"^shake[-_]?128(|[-_][1-9]\d*)$", guess=None) +add("shake_256", shake_hash(256), pattern=r"^shake[-_]?256(|[-_][1-9]\d*)$", guess=None) diff --git a/src/codext/languages/braille.py b/src/codext/languages/braille.py index b28c56e..775399c 100755 --- a/src/codext/languages/braille.py +++ b/src/codext/languages/braille.py @@ -1,34 +1,33 @@ -# -*- coding: UTF-8 -*- -"""Braille Codec - braille content encoding. 
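The braille codec below is a pure table substitution (ignore_case="encode" lower-cases input before the lookup); conceptually it is no more than this sketch over a subset of the ENCMAP defined below:

    SUBSET = {'t': '⠞', 'h': '⠓', 'i': '⠊', 's': '⠎', 'e': '⠑', 'a': '⠁', ' ': '⠀'}
    braille = lambda text: "".join(SUBSET.get(ch.lower(), ch) for ch in text)
    print(braille("This is a test"))   # ⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞

with add_map wiring up the reverse table for decoding.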
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(braille)': {'this is a test': "⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞"}, -} - - -ENCMAP = { - # digits - '0': '⠴', '1': '⠂', '2': '⠆', '3': '⠒', '4': '⠲', '5': '⠢', '6': '⠖', '7': '⠶', '8': '⠦', '9': '⠔', - # letters - 'a': '⠁', 'b': '⠃', 'c': '⠉', 'd': '⠙', 'e': '⠑', 'f': '⠋', 'g': '⠛', 'h': '⠓', 'i': '⠊', 'j': '⠚', 'k': '⠅', - 'l': '⠇', 'm': '⠍', 'n': '⠝', 'o': '⠕', 'p': '⠏', 'q': '⠟', 'r': '⠗', 's': '⠎', 't': '⠞', 'u': '⠥', 'v': '⠧', - 'w': '⠺', 'x': '⠭', 'y': '⠽', 'z': '⠵', - # punctuation - ' ': '⠀', '!': '⠮', '"': '⠐', '#': '⠼', '$': '⠫', '%': '⠩', '&': '⠯', ':': '⠱', ';': '⠰', '<': '⠣', '=': '⠿', - '>': '⠜', '?': '⠹', '@': '⠈', "'": '⠄', '(': '⠷', ')': '⠾', '*': '⠡', '+': '⠬', ',': '⠠', '-': '⠤', '.': '⠨', - '/': '⠌', '[': '⠪', '\\': '⠳', ']': '⠻', '^': '⠘', '_': '⠸', -} - - -if PY3: - add_map("braille", ENCMAP, ignore_case="encode") - +# -*- coding: UTF-8 -*- +"""Braille Codec - braille content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(braille)': {'this is a test': "⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞"}, +} + + +ENCMAP = { + # digits + '0': '⠴', '1': '⠂', '2': '⠆', '3': '⠒', '4': '⠲', '5': '⠢', '6': '⠖', '7': '⠶', '8': '⠦', '9': '⠔', + # letters + 'a': '⠁', 'b': '⠃', 'c': '⠉', 'd': '⠙', 'e': '⠑', 'f': '⠋', 'g': '⠛', 'h': '⠓', 'i': '⠊', 'j': '⠚', 'k': '⠅', + 'l': '⠇', 'm': '⠍', 'n': '⠝', 'o': '⠕', 'p': '⠏', 'q': '⠟', 'r': '⠗', 's': '⠎', 't': '⠞', 'u': '⠥', 'v': '⠧', + 'w': '⠺', 'x': '⠭', 'y': '⠽', 'z': '⠵', + # punctuation + ' ': '⠀', '!': '⠮', '"': '⠐', '#': '⠼', '$': '⠫', '%': '⠩', '&': '⠯', ':': '⠱', ';': '⠰', '<': '⠣', '=': '⠿', + '>': '⠜', '?': '⠹', '@': '⠈', "'": '⠄', '(': '⠷', ')': '⠾', '*': '⠡', '+': '⠬', ',': '⠠', '-': '⠤', '.': '⠨', + '/': '⠌', '[': '⠪', '\\': '⠳', ']': '⠻', '^': '⠘', '_': '⠸', +} + + +add_map("braille", ENCMAP, ignore_case="encode") + diff --git a/src/codext/languages/galactic.py b/src/codext/languages/galactic.py index e77cb3a..26544b5 100644 --- a/src/codext/languages/galactic.py +++ b/src/codext/languages/galactic.py @@ -29,7 +29,6 @@ } -if PY3: - add_map("galactic", ENCMAP, ignore_case="encode", printables_rate=0., - pattern=r"^(?:galactic(?:[-_]alphabet)?|minecraft(?:[-_](?:enchantment|enchanting[-_]language))?)$") +add_map("galactic", ENCMAP, ignore_case="encode", printables_rate=0., + pattern=r"^(?:galactic(?:[-_]alphabet)?|minecraft(?:[-_](?:enchantment|enchanting[-_]language))?)$") diff --git a/src/codext/languages/tap.py b/src/codext/languages/tap.py index efd551d..ec7c15b 100644 --- a/src/codext/languages/tap.py +++ b/src/codext/languages/tap.py @@ -1,39 +1,38 @@ -# -*- coding: UTF-8 -*- -"""Tap code - Tap/knock code encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(tap|knock-code|tap_code)': {'this is a test' : ".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. ." - "⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... 
...."}, -} -__guess__ = ["tap", "tap-inv"] - - -def __build_encmap(a): - d, i = {}, 0 - for x in range(1,6): - for y in range(1,6): - d[a[i]] = x * "." + " " + y * "." - i += 1 - d['k'], d[' '] = d['c'], " " - return d - - - -ENCMAP = { - '': __build_encmap("abcdefghijlmnopqrstuvwxyz"), - 'inv': __build_encmap("abcdefghijlmnopqrstuvwxyz"[::-1]), -} - - -if PY3: - add_map("tap", ENCMAP, ignore_case="both", sep="⠀", pattern=r"^(?:tap|knock)(?:[-_]code)?(|inv)$") - +# -*- coding: UTF-8 -*- +"""Tap code - Tap/knock code encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(tap|knock-code|tap_code)': {'this is a test' : ".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. ." + "⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ...."}, +} +__guess__ = ["tap", "tap-inv"] + + +def __build_encmap(a): + d, i = {}, 0 + for x in range(1,6): + for y in range(1,6): + d[a[i]] = x * "." + " " + y * "." + i += 1 + d['k'], d[' '] = d['c'], " " + return d + + + +ENCMAP = { + '': __build_encmap("abcdefghijlmnopqrstuvwxyz"), + 'inv': __build_encmap("abcdefghijlmnopqrstuvwxyz"[::-1]), +} + + +add_map("tap", ENCMAP, ignore_case="both", sep="⠀", pattern=r"^(?:tap|knock)(?:[-_]code)?(|inv)$") + diff --git a/src/codext/others/uuencode.py b/src/codext/others/uuencode.py index a2f2fb6..f1ecfc3 100644 --- a/src/codext/others/uuencode.py +++ b/src/codext/others/uuencode.py @@ -17,7 +17,7 @@ 'dec(uu-encode)': {'.=&AI': "<This is a test>"}, - 'dec(html|html_entity)': {'&DoesNotExist;': None}, - 'dec(html_entities|html-entity)': { - '<This is a test>': "", - '<This is a test>': "", - }, -} -if PY3: - __examples__['enc(html)'] = {'\u1234': "&1234;"} - - -# source: https://dev.w3.org/html5/html-author/charref -ENCMAP = { - '\t': " ", '\n': " ", '!': "!", '"': """, '#': "#", '$': "$", '%': "%", - '&': "&", '\'': "'", '(': "(", ')': ")", '*': "*", '+': "+", ',': ",", - '.': ".", '/': "/", ':': ":", ';': ";", '<': "<", '=': "=", '>': ">", - '?': "?", '@': "@", '[': "[", '\\': "\", ']': "]", '^': "^", '_': "_", - '`': "`", '{': "{", '|': "|", '}': "}", '¡': "¡", '¢': "¢", - '£': "£", '¤': "¤", '¥': "¥", '¦': "¦", '§': "§", '¨': "¨", '©': "©", - 'ª': "ª", '«': "«", '¬': "¬", '­': "­", '®': "®", '¯': "¯", '°': "°", - '±': "±", '²': "²", '³': "³", '´': "´", 'µ': "µ", '¶': "¶", '·': "·", - '¸': "¸", '¹': "¹", 'º': "º", '»': "»", '¼': "¼", '½': "½", '¾': "¾", - '¿': "¿", 'À': "À", 'Á': "Á", 'Â': "Â", 'Ã': "Ã", 'Ä': "Ä", 'Å': "Å", - 'Æ': "Æ", 'Ç': "Ç", 'È': "È", 'É': "É", 'Ê': "Ê", 'Ë': "Ë", 'Ì': "Ì", - 'Í': "Í", 'Î': "Î", 'Ï': "Ï", 'Ð': "Ð", 'Ñ': "Ñ", 'Ò': "Ò", 'Ó': "Ó", - 'Ô': "Ô", 'Õ': "Õ", 'Ö': "Ö", '×': "×", 'Ø': "Ø", 'Ù': "Ù", 'Ú': "Ú", - 'Û': "Û", 'Ü': "Ü", 'Ý': "Ý", 'Þ': "Þ", 'ß': "ß", 'à': "à", 'á': "á", - 'â': "â", 'ã': "ã", 'ä': "ä", 'å': "å", 'æ': "æ", 'ç': "ç", 'è': "è", - 'é': "é", 'ê': "ê", 'ë': "ë", 'ì': "ì", 'í': "í", 'î': "î", 'ï': "ï", - 'ð': "ð", 'ñ': "ñ", 'ò': "ò", 'ó': "ó", 'ô': "ô", 'õ': "õ", 'ö': "ö", - '÷': "÷", 'ø': "ø", 'ù': "ù", 'ú': "ú", 'û': "û", 'ü': "ü", 'ý': "ý", - 'þ': "þ", 'ÿ': "ÿ", 'Ā': "Ā", 'ā': "ā", 'Ă': "Ă", 'ă': "ă", 'Ą': "Ą", - 'ą': "ą", 'Ć': "Ć", 'ć': "ć", 'Ĉ': "Ĉ", 'ĉ': "ĉ", 'Ċ': "Ċ", 'ċ': "ċ", - 'Č': "Č", 'č': "č", 'Ď': "Ď", 'ď': "ď", 'Đ': "Đ", 'đ': "đ", - 'Ē': "Ē", 'ē': "ē", 'Ė': "Ė", 'ė': "ė", 'Ę': "Ę", 'ę': "ę", 'Ě': "Ě", - 'ě': "ě", 'Ĝ': "Ĝ", 'ĝ': "ĝ", 'Ğ': "Ğ", 'ğ': "ğ", 'Ġ': "Ġ", 'ġ': "ġ", - 'Ģ': 
"Ģ", 'Ĥ': "Ĥ", 'ĥ': "ĥ", 'Ħ': "Ħ", 'ħ': "ħ", 'Ĩ': "Ĩ", - 'ĩ': "ĩ", 'Ī': "Ī", 'ī': "ī", 'Į': "Į", 'į': "į", 'İ': "İ", 'ı': "ı", - 'IJ': "IJ", 'ij': "ij", 'Ĵ': "Ĵ", 'ĵ': "ĵ", 'Ķ': "Ķ", 'ķ': "ķ", 'ĸ': "ĸ", - 'Ĺ': "Ĺ", 'ĺ': "ĺ", 'Ļ': "Ļ", 'ļ': "ļ", 'Ľ': "Ľ", 'ľ': "ľ", - 'Ŀ': "Ŀ", 'ŀ': "ŀ", 'Ł': "Ł", 'ł': "ł", 'Ń': "Ń", 'ń': "ń", - 'Ņ': "Ņ", 'ņ': "ņ", 'Ň': "Ň", 'ň': "ň", 'ʼn': "ʼn", 'Ŋ': "Ŋ", 'ŋ': "ŋ", - 'Ō': "Ō", 'ō': "ō", 'Ő': "Ő", 'ő': "ő", 'Œ': "Œ", 'œ': "œ", 'Ŕ': "Ŕ", - 'ŕ': "ŕ", 'Ŗ': "Ŗ", 'ŗ': "ŗ", 'Ř': "Ř", 'ř': "ř", 'Ś': "Ś", - 'ś': "ś", 'Ŝ': "Ŝ", 'ŝ': "ŝ", 'Ş': "Ş", 'ş': "ş", 'Š': "Š", - 'š': "š", 'Ţ': "Ţ", 'ţ': "ţ", 'Ť': "Ť", 'ť': "ť", 'Ŧ': "Ŧ", - 'ŧ': "ŧ", 'Ũ': "Ũ", 'ũ': "ũ", 'Ū': "Ū", 'ū': "ū", 'Ŭ': "Ŭ", - 'ŭ': "ŭ", 'Ů': "Ů", 'ů': "ů", 'Ű': "Ű", 'ű': "ű", 'Ų': "Ų", 'ų': "ų", - 'Ŵ': "Ŵ", 'ŵ': "ŵ", 'Ŷ': "Ŷ", 'ŷ': "ŷ", 'Ÿ': "Ÿ", 'Ź': "Ź", 'ź': "ź", - 'Ż': "Ż", 'ż': "ż", 'Ž': "Ž", 'ž': "ž", 'ƒ': "ƒ", 'Ƶ': "Ƶ", 'ǵ': "ǵ", - 'ȷ': "ȷ", 'ˆ': "ˆ", 'ˇ': "ˇ", '˘': "˘", '˙': "˙", '˚': "˚", '˛': "˛", - '˜': "˜", '˝': "˝", '̑': "̑", '̲': "_", 'Α': "Α", 'Β': "Β", - 'Γ': "Γ", 'Δ': "Δ", 'Ε': "Ε", 'Ζ': "Ζ", 'Η': "Η", 'Θ': "Θ", 'Ι': "Ι", - 'Κ': "Κ", 'Λ': "Λ", 'Μ': "Μ", 'Ν': "Ν", 'Ξ': "Ξ", 'Ο': "Ο", 'Π': "Π", - 'Ρ': "Ρ", 'Σ': "Σ", 'Τ': "Τ", 'Υ': "Υ", 'Φ': "Φ", 'Χ': "Χ", 'Ψ': "Ψ", - 'Ω': "Ω", 'α': "α", 'β': "β", 'γ': "γ", 'δ': "δ", 'ε': "ϵ", 'ζ': "ζ", - 'η': "η", 'θ': "θ", 'ι': "ι", 'κ': "κ", 'λ': "λ", 'μ': "μ", 'ν': "ν", - 'ξ': "ξ", 'ο': "ο", 'π': "π", 'ρ': "ρ", 'ς': "ς", 'σ': "σ", 'τ': "τ", - 'υ': "υ", 'φ': "φ", 'χ': "χ", 'ψ': "ψ", 'ω': "ω", 'ϑ': "ϑ", 'ϒ': "ϒ", - 'ϕ': "ϕ", 'ϖ': "ϖ", 'Ϝ': "Ϝ", 'ϝ': "ϝ", 'ϰ': "ϰ", 'ϱ': "ϱ", - 'ϵ': "ε", '϶': "϶", 'Ё': "Ё", 'Ђ': "Ђ", 'Ѓ': "Ѓ", 'Є': "Є", 'Ѕ': "Ѕ", - 'І': "І", 'Ї': "Ї", 'Ј': "Ј", 'Љ': "Љ", 'Њ': "Њ", 'Ћ': "Ћ", 'Ќ': "Ќ", - 'Ў': "Ў", 'Џ': "Џ", 'А': "А", 'Б': "Б", 'В': "В", 'Г': "Г", 'Д': "Д", 'Е': "Е", - 'Ж': "Ж", 'З': "З", 'И': "И", 'Й': "Й", 'К': "К", 'Л': "Л", 'М': "М", 'Н': "Н", - 'О': "О", 'П': "П", 'Р': "Р", 'С': "С", 'Т': "Т", 'У': "У", 'Ф': "Ф", 'Х': "Х", - 'Ц': "Ц", 'Ч': "Ч", 'Ш': "Ш", 'Щ': "Щ", 'Ъ': "Ъ", 'Ы': "Ы", 'Ь': "Ь", - 'Э': "Э", 'Ю': "Ю", 'Я': "Я", 'а': "а", 'б': "б", 'в': "в", 'г': "г", 'д': "д", - 'е': "е", 'ж': "ж", 'з': "з", 'и': "и", 'й': "й", 'к': "к", 'л': "л", 'м': "м", - 'н': "н", 'о': "о", 'п': "п", 'р': "р", 'с': "с", 'т': "т", 'у': "у", 'ф': "ф", - 'х': "х", 'ц': "ц", 'ч': "ч", 'ш': "ш", 'щ': "щ", 'ъ': "ъ", 'ы': "ы", - 'ь': "ь", 'э': "э", 'ю': "ю", 'я': "я", 'ё': "ё", 'ђ': "ђ", 'ѓ': "ѓ", - 'є': "є", 'ѕ': "ѕ", 'і': "і", 'ї': "ї", 'ј': "ј", 'љ': "љ", 'њ': "њ", - 'ћ': "ћ", 'ќ': "ќ", 'ў': "ў", 'џ': "џ", '\u2002': " ", '\u2003': " ", - '\u2004': " ", '\u2005': " ", '\u2007': " ", '\u2008': " ", '\u2009': " ", - '\u200a': " ", '​\u200b': "​", '\u200c': "‌", '\u200d': "‍", '\u200e': "‎", - '\u200f': "‏", '‐': "‐", '–': "–", '—': "—", - '―': "―", '‖': "‖", '‘': "‘", '’': "’", '‚': "‚", '“': "“", '”': "”", - '„': "„", '†': "†", '‡': "‡", '•': "•", '‥': "‥", '…': "…", '‰': "‰", - '‱': "‱", '′': "′", '″': "″", '‴': "‴", '‵': "‵", '‹': "‹", - '›': "›", '‾': "‾", '⁁': "⁁", '⁃': "⁃", '⁄': "⁄", '⁏': "⁏", '⁗': "⁗", - '\u205f': " ", '⁠': "⁠", '⁡': "⁡", '⁢': "⁢", '⁣': "⁣", - '€': "€", '⃛': "⃛", '⃜': "⃜", 'ℂ': "ℂ", '℅': "℅", 'ℊ': "ℊ", 'ℋ': "ℋ", - 'ℌ': "ℌ", 'ℍ': "ℍ", 'ℎ': "ℎ", 'ℏ': "ℏ", 'ℐ': "ℐ", 'ℑ': "ℑ", - 'ℒ': "ℒ", 'ℓ': "ℓ", 'ℕ': "ℕ", '№': "№", '℗': "℗", '℘': "℘", 'ℙ': "ℙ", - 'ℚ': "ℚ", 'ℛ': "ℛ", 'ℜ': "ℜ", 'ℝ': "ℝ", '℞': "℞", '™': "™", 'ℤ': "ℤ", - 'Ω': "Ω", '℧': "℧", 'ℨ': "ℨ", '℩': "℩", 'Å': "Å", 'ℬ': "ℬ", 'ℭ': "ℭ", - 'ℯ': "ℯ", 'ℰ': "ℰ", 'ℱ': "ℱ", 
'ℳ': "ℳ", 'ℴ': "ℴ", 'ℵ': "ℵ", 'ℶ': "ℶ", - 'ℷ': "ℷ", 'ℸ': "ℸ", 'ⅅ': "ⅅ", 'ⅆ': "ⅆ", 'ⅇ': "ⅇ", - 'ⅈ': "ⅈ", '⅓': "⅓", '⅔': "⅔", '⅕': "⅕", '⅖': "⅖", '⅗': "⅗", - '⅘': "⅘", '⅙': "⅙", '⅚': "⅚", '⅛': "⅛", '⅜': "⅜", '⅝': "⅝", - '⅞': "⅞", '←': "←", '↑': "↑", '→': "→", '↓': "↓", '↔': "↔", '↕': "↕", - '↖': "↖", '↗': "↗", '↘': "↘", '↙': "↙", '↚': "↚", '↛': "↛", '↝': "↝", - '↞': "↞", '↟': "↟", '↠': "↠", '↡': "↡", '↢': "↢", '↣': "↣", - '↤': "↤", '↥': "↥", '↦': "↦", '↧': "↧", '↩': "↩", '↪': "↪", - '↫': "↫", '↬': "↬", '↭': "↭", '↮': "↮", '↰': "↰", '↱': "↱", '↲': "↲", - '↳': "↳", '↵': "↵", '↶': "↶", '↷': "↷", '↺': "↺", '↻': "↻", '↼': "↼", - '↽': "↽", '↾': "↾", '↿': "↿", '⇀': "⇀", '⇁': "⇁", '⇂': "⇂", '⇃': "⇃", - '⇄': "⇄", '⇅': "⇅", '⇆': "⇆", '⇇': "⇇", '⇈': "⇈", '⇉': "⇉", '⇊': "⇊", - '⇋': "⇋", '⇌': "⇌", '⇍': "⇍", '⇎': "⇎", '⇏': "⇏", '⇐': "⇐", '⇑': "⇑", - '⇒': "⇒", '⇓': "⇓", '⇔': "⇔", '⇕': "⇕", '⇖': "⇖", '⇗': "⇗", '⇘': "⇘", - '⇙': "⇙", '⇚': "⇚", '⇛': "⇛", '⇝': "⇝", '⇤': "⇤", '⇥': "⇥", '⇵': "⇵", - '⇽': "⇽", '⇾': "⇾", '⇿': "⇿", '∀': "∀", '∁': "∁", '∂': "∂", '∃': "∃", - '∄': "∄", '∅': "∅", '∇': "∇", '∈': "∈", '∉': "∉", '∋': "∋", '∌': "∌", - '∏': "∏", '∐': "∐", '∑': "∑", '−': "−", '∓': "∓", '∔': "∔", '∖': "∖", - '∗': "∗", '∘': "∘", '√': "√", '∝': "∝", '∞': "∞", '∟': "∟", '∠': "∠", - '∡': "∡", '∢': "∢", '∣': "∣", '∤': "∤", '∥': "∥", '∦': "∦", '∧': "∧", - '∨': "∨", '∩': "∩", '∪': "∪", '∫': "∫", '∬': "∬", '∭': "∭", '∮': "∮", - '∯': "∯", '∰': "∰", '∱': "∱", '∲': "∲", '∳': "∳", '∴': "∴", - '∵': "∵", '∶': "∶", '∷': "∷", '∸': "∸", '∺': "∺", '∻': "∻", '∼': "∼", - '∽': "∽", '∾': "∾", '∿': "∿", '≀': "≀", '≁': "≁", '≂': "≂", '≃': "≃", - '≄': "≄", '≅': "≅", '≆': "≆", '≇': "≇", '≈': "≈", '≉': "≉", '≊': "≊", - '≋': "≋", '≌': "≌", '≍': "≍", '≎': "≎", '≏': "≏", '≐': "≐", '≑': "≑", - '≒': "≒", '≓': "≓", '≔': "≔", '≕': "≕", '≖': "≖", '≗': "≗", '≙': "≙", - '≚': "≚", '≜': "≜", '≟': "≟", '≠': "≠", '≡': "≡", '≢': "≢", '≤': "≤", - '≥': "≥", '≦': "≦", '≧': "≧", '≨': "≨", '≩': "≩", '≪': "≪", '≫': "≫", '≬': "≬", - '≭': "≭", '≮': "≮", '≯': "≯", '≰': "≰", '≱': "≱", '≲': "≲", '≳': "≳", - '≴': "≴", '≵': "≵", '≶': "≶", '≷': "≷", '≸': "≸", '≹': "≹", '≺': "≺", '≻': "≻", - '≼': "≼", '≽': "≽", '≾': "≾", '≿': "≿", '⊀': "⊀", '⊁': "⊁", '⊂': "⊂", - '⊃': "⊃", '⊄': "⊄", '⊅': "⊅", '⊆': "⊆", '⊇': "⊇", '⊈': "⊈", '⊉': "⊉", - '⊊': "⊊", '⊋': "⊋", '⊍': "⊍", '⊎': "⊎", '⊏': "⊏", '⊐': "⊐", '⊑': "⊑", - '⊒': "⊒", '⊓': "⊓", '⊔': "⊔", '⊕': "⊕", '⊖': "⊖", '⊗': "⊗", '⊘': "⊘", - '⊙': "⊙", '⊚': "⊚", '⊛': "⊛", '⊝': "⊝", '⊞': "⊞", '⊟': "⊟", '⊠': "⊠", - '⊡': "⊡", '⊢': "⊢", '⊣': "⊣", '⊤': "⊤", '⊥': "⊥", '⊧': "⊧", '⊨': "⊨", - '⊩': "⊩", '⊪': "⊪", '⊫': "⊫", '⊬': "⊬", '⊭': "⊭", '⊮': "⊮", - '⊯': "⊯", '⊰': "⊰", '⊲': "⊲", '⊳': "⊳", '⊴': "⊴", '⊵': "⊵", '⊶': "⊶", - '⊷': "⊷", '⊸': "⊸", '⊹': "⊹", '⊺': "⊺", '⊻': "⊻", '⊽': "⊽", - '⊾': "⊾", '⊿': "⊿", '⋀': "⋀", '⋁': "⋁", '⋂': "⋂", '⋃': "⋃", '⋄': "⋄", - '⋅': "⋅", '⋆': "⋆", '⋇': "⋇", '⋈': "⋈", '⋉': "⋉", '⋊': "⋊", - '⋋': "⋋", '⋌': "⋌", '⋍': "⋍", '⋎': "⋎", '⋏': "⋏", '⋐': "⋐", '⋑': "⋑", - '⋒': "⋒", '⋓': "⋓", '⋔': "⋔", '⋕': "⋕", '⋖': "⋖", '⋗': "⋗", '⋘': "⋘", '⋙': "⋙", - '⋚': "⋚", '⋛': "⋛", '⋞': "⋞", '⋟': "⋟", '⋠': "⋠", '⋡': "⋡", '⋢': "⋢", - '⋣': "⋣", '⋦': "⋦", '⋧': "⋧", '⋨': "⋨", '⋩': "⋩", '⋪': "⋪", '⋫': "⋫", - '⋬': "⋬", '⋭': "⋭", '⋮': "⋮", '⋯': "⋯", '⋰': "⋰", '⋱': "⋱", '⋲': "⋲", - '⋳': "⋳", '⋴': "⋴", '⋵': "⋵", '⋶': "⋶", '⋷': "⋷", '⋹': "⋹", - '⋺': "⋺", '⋻': "⋻", '⋼': "⋼", '⋽': "⋽", '⋾': "⋾", '⌅': "⌅", '⌆': "⌆", - '⌈': "⌈", '⌉': "⌉", '⌊': "⌊", '⌋': "⌋", '⌌': "⌌", '⌍': "⌍", - '⌎': "⌎", '⌏': "⌏", '⌐': "⌐", '⌒': "⌒", '⌓': "⌓", '⌕': "⌕", - '⌖': "⌖", 
'⌜': "⌜", '⌝': "⌝", '⌞': "⌞", '⌟': "⌟", '⌢': "⌢", - '⌣': "⌣", '⌭': "⌭", '⌮': "⌮", '⌶': "⌶", '⌽': "⌽", '⌿': "⌿", - '⍼': "⍼", '⎰': "⎰", '⎱': "⎱", '⎴': "⎴", '⎵': "⎵", '⎶': "⎶", - '⏜': "⏜", '⏝': "⏝", '⏞': "⏞", '⏟': "⏟", '⏢': "⏢", - '⏧': "⏧", '␣': "␣", 'Ⓢ': "Ⓢ", '─': "─", '│': "│", '┌': "┌", '┐': "┐", - '└': "└", '┘': "┘", '├': "├", '┤': "┤", '┬': "┬", '┴': "┴", '┼': "┼", - '═': "═", '║': "║", '╒': "╒", '╓': "╓", '╔': "╔", '╕': "╕", '╖': "╖", - '╗': "╗", '╘': "╘", '╙': "╙", '╚': "╚", '╛': "╛", '╜': "╜", '╝': "╝", - '╞': "╞", '╟': "╟", '╠': "╠", '╡': "╡", '╢': "╢", '╣': "╣", '╤': "╤", - '╥': "╥", '╦': "╦", '╧': "╧", '╨': "╨", '╩': "╩", '╪': "╪", '╫': "╫", - '╬': "╬", '▀': "▀", '▄': "▄", '█': "█", '░': "░", '▒': "▒", '▓': "▓", - '□': "□", '▪': "▪", '▫': "▫", '▭': "▭", '▮': "▮", '▱': "▱", - '△': "△", '▴': "▴", '▵': "▵", '▸': "▸", '▹': "▹", '▽': "▽", '▾': "▾", - '▿': "▿", '◂': "◂", '◃': "◃", '◊': "◊", '○': "○", '◬': "◬", '◯': "◯", - '◸': "◸", '◹': "◹", '◺': "◺", '◻': "◻", '◼': "◼", - '★': "★", '☆': "☆", '☎': "☎", '♀': "♀", '♂': "♂", '♠': "♠", '♣': "♣", - '♥': "♥", '♦': "♦", '♪': "♪", '♭': "♭", '♮': "♮", '♯': "♯", '✓': "✓", - '✗': "✗", '✠': "✠", '✶': "✶", '❘': "❘", '❲': "❲", '❳': "❳", - '⟦': "⟦", '⟧': "⟧", '⟨': "⟨", '⟩': "⟩", '⟪': "⟪", '⟫': "⟫", '⟬': "⟬", - '⟭': "⟭", '⟵': "⟵", '⟶': "⟶", '⟷': "⟷", '⟸': "⟸", '⟹': "⟹", '⟺': "⟺", - '⟼': "⟼", '⟿': "⟿", '⤂': "⤂", '⤃': "⤃", '⤄': "⤄", '⤅': "⤅", '⤌': "⤌", - '⤍': "⤍", '⤎': "⤎", '⤏': "⤏", '⤐': "⤐", '⤑': "⤑", '⤒': "⤒", - '⤓': "⤓", '⤖': "⤖", '⤙': "⤙", '⤚': "⤚", '⤛': "⤛", '⤜': "⤜", - '⤝': "⤝", '⤞': "⤞", '⤟': "⤟", '⤠': "⤠", '⤣': "⤣", '⤤': "⤤", - '⤥': "⤥", '⤦': "⤦", '⤧': "⤧", '⤨': "⤨", '⤩': "⤩", '⤪': "⤪", - '⤳': "⤳", '⤵': "⤵", '⤶': "⤶", '⤷': "⤷", '⤸': "⤸", '⤹': "⤹", - '⤼': "⤼", '⤽': "⤽", '⥅': "⥅", '⥈': "⥈", '⥉': "⥉", '⥊': "⥊", - '⥋': "⥋", '⥎': "⥎", '⥏': "⥏", '⥐': "⥐", - '⥑': "⥑", '⥒': "⥒", '⥓': "⥓", '⥔': "⥔", - '⥕': "⥕", '⥖': "⥖", '⥗': "⥗", '⥘': "⥘", - '⥙': "⥙", '⥚': "⥚", '⥛': "⥛", '⥜': "⥜", - '⥝': "⥝", '⥞': "⥞", '⥟': "⥟", '⥠': "⥠", - '⥡': "⥡", '⥢': "⥢", '⥣': "⥣", '⥤': "⥤", '⥥': "⥥", '⥦': "⥦", - '⥧': "⥧", '⥨': "⥨", '⥩': "⥩", '⥪': "⥪", '⥫': "⥫", '⥬': "⥬", - '⥭': "⥭", '⥮': "⥮", '⥯': "⥯", '⥰': "⥰", '⥱': "⥱", '⥲': "⥲", - '⥳': "⥳", '⥴': "⥴", '⥵': "⥵", '⥶': "⥶", '⥸': "⥸", '⥹': "⥹", - '⥻': "⥻", '⥼': "⥼", '⥽': "⥽", '⥾': "⥾", '⥿': "⥿", '⦅': "⦅", - '⦆': "⦆", '⦋': "⦋", '⦌': "⦌", '⦍': "⦍", '⦎': "⦎", '⦏': "⦏", - '⦐': "⦐", '⦑': "⦑", '⦒': "⦒", '⦓': "⦓", '⦔': "⦔", '⦕': "⦕", - '⦖': "⦖", '⦚': "⦚", '⦜': "⦜", '⦝': "⦝", '⦤': "⦤", '⦥': "⦥", - '⦦': "⦦", '⦧': "⦧", '⦨': "⦨", '⦩': "⦩", '⦪': "⦪", '⦫': "⦫", - '⦬': "⦬", '⦭': "⦭", '⦮': "⦮", '⦯': "⦯", '⦰': "⦰", '⦱': "⦱", - '⦲': "⦲", '⦳': "⦳", '⦴': "⦴", '⦵': "⦵", '⦶': "⦶", '⦷': "⦷", - '⦹': "⦹", '⦻': "⦻", '⦼': "⦼", '⦾': "⦾", '⦿': "⦿", '⧀': "⧀", '⧁': "⧁", - '⧂': "⧂", '⧃': "⧃", '⧄': "⧄", '⧅': "⧅", '⧉': "⧉", '⧍': "⧍", '⧎': "⧎", - '⧏': "⧏", '⧐': "⧐", '⧚': "∽̱", '⧜': "⧜", '⧝': "⧝", - '⧞': "⧞", '⧣': "⧣", '⧤': "⧤", '⧥': "⧥", '⧫': "⧫", '⧴': "⧴", - '⧶': "⧶", '⨀': "⨀", '⨁': "⨁", '⨂': "⨂", '⨄': "⨄", '⨆': "⨆", '⨌': "⨌", - '⨍': "⨍", '⨐': "⨐", '⨑': "⨑", '⨒': "⨒", '⨓': "⨓", '⨔': "⨔", - '⨕': "⨕", '⨖': "⨖", '⨗': "⨗", '⨢': "⨢", '⨣': "⨣", '⨤': "⨤", - '⨥': "⨥", '⨦': "⨦", '⨧': "⨧", '⨩': "⨩", '⨪': "⨪", '⨭': "⨭", - '⨮': "⨮", '⨯': "⨯", '⨰': "⨰", '⨱': "⨱", '⨳': "⨳", '⨴': "⨴", - '⨵': "⨵", '⨶': "⨶", '⨷': "⨷", '⨸': "⨸", '⨹': "⨹", '⨺': "⨺", - '⨻': "⨻", '⨼': "⨼", '⨿': "⨿", '⩀': "⩀", '⩂': "⩂", '⩃': "⩃", '⩄': "⩄", - '⩅': "⩅", '⩆': "⩆", '⩇': "⩇", '⩈': "⩈", '⩉': "⩉", '⩊': "⩊", - '⩋': "⩋", '⩌': "⩌", '⩍': "⩍", '⩐': "⩐", '⩓': "⩓", '⩔': "⩔", '⩕': "⩕", - '⩖': "⩖", '⩗': "⩗", '⩘': "⩘", 
'⩚': "⩚", '⩛': "⩛", '⩜': "⩜", '⩝': "⩝", - '⩟': "⩟", '⩦': "⩦", '⩪': "⩪", '⩭': "⩭", '⩮': "⩮", '⩯': "⩯", '⩰': "⩰", - '⩱': "⩱", '⩲': "⩲", '⩳': "⩳", '⩴': "⩴", '⩵': "⩵", '⩷': "⩷", '⩸': "⩸", - '⩹': "⩹", '⩺': "⩺", '⩻': "⩻", '⩼': "⩼", '⩽': "⩽", '⩾': "⩾", '⩿': "⩿", - '⪀': "⪀", '⪁': "⪁", '⪂': "⪂", '⪃': "⪃", '⪄': "⪄", '⪅': "⪅", - '⪆': "⪆", '⪇': "⪇", '⪈': "⪈", '⪉': "⪉", '⪊': "⪊", '⪋': "⪋", '⪌': "⪌", '⪍': "⪍", - '⪎': "⪎", '⪏': "⪏", '⪐': "⪐", '⪑': "⪑", '⪒': "⪒", '⪓': "⪓", '⪔': "⪔", - '⪕': "⪕", '⪖': "⪖", '⪗': "⪗", '⪘': "⪘", '⪙': "⪙", '⪚': "⪚", '⪝': "⪝", - '⪞': "⪞", '⪟': "⪟", '⪠': "⪠", '⪡': "⪡", '⪢': "⪢", '⪤': "⪤", - '⪥': "⪥", '⪦': "⪦", '⪧': "⪧", '⪨': "⪨", '⪩': "⪩", '⪪': "⪪", '⪫': "⪫", - '⪬': "⪬", '⪭': "⪭", '⪮': "⪮", '⪯': "⪯", '⪰': "⪰", '⪳': "⪳", '⪴': "⪴", - '⪵': "⪵", '⪶': "⪶", '⪷': "⪷", '⪸': "⪸", '⪹': "⪹", '⪺': "⪺", '⪻': "⪻", - '⪼': "⪼", '⪽': "⪽", '⪾': "⪾", '⪿': "⪿", '⫀': "⫀", '⫁': "⫁", - '⫂': "⫂", '⫃': "⫃", '⫄': "⫄", '⫅': "⫅", '⫆': "⫆", '⫇': "⫇", - '⫈': "⫈", '⫋': "⫋", '⫌': "⫌", '⫏': "⫏", '⫐': "⫐", '⫑': "⫑", '⫒': "⫒", - '⫓': "⫓", '⫔': "⫔", '⫕': "⫕", '⫖': "⫖", '⫗': "⫗", '⫘': "⫘", - '⫙': "⫙", '⫚': "⫚", '⫛': "⫛", '⫤': "⫤", '⫦': "⫦", '⫧': "⫧", '⫨': "⫨", - '⫩': "⫩", '⫫': "⫫", '⫬': "⫬", '⫭': "⫭", '⫮': "⫮", '⫯': "⫯", '⫰': "⫰", - '⫱': "⫱", '⫲': "⫲", '⫳': "⫳", '⫽': "⫽", 'ff': "ff", 'fi': "fi", 'fl': "fl", - 'ffi': "ffi", 'ffl': "ffl", '𝒜': "𝒜", '𝒞': "𝒞", '𝒟': "𝒟", '𝒢': "𝒢", '𝒥': "𝒥", - '𝒦': "𝒦", '𝒩': "𝒩", '𝒪': "𝒪", '𝒫': "𝒫", '𝒬': "𝒬", '𝒮': "𝒮", '𝒯': "𝒯", - '𝒰': "𝒰", '𝒱': "𝒱", '𝒲': "𝒲", '𝒳': "𝒳", '𝒴': "𝒴", '𝒵': "𝒵", '𝒶': "𝒶", - '𝒷': "𝒷", '𝒸': "𝒸", '𝒹': "𝒹", '𝒻': "𝒻", '𝒽': "𝒽", '𝒾': "𝒾", '𝒿': "𝒿", - '𝓀': "𝓀", '𝓁': "𝓁", '𝓂': "𝓂", '𝓃': "𝓃", '𝓅': "𝓅", '𝓆': "𝓆", '𝓇': "𝓇", - '𝓈': "𝓈", '𝓉': "𝓉", '𝓊': "𝓊", '𝓋': "𝓋", '𝓌': "𝓌", '𝓍': "𝓍", '𝓎': "𝓎", - '𝓏': "𝓏", '𝔄': "𝔄", '𝔅': "𝔅", '𝔇': "𝔇", '𝔈': "𝔈", '𝔉': "𝔉", '𝔊': "𝔊", '𝔍': "𝔍", - '𝔎': "𝔎", '𝔏': "𝔏", '𝔐': "𝔐", '𝔑': "𝔑", '𝔒': "𝔒", '𝔓': "𝔓", '𝔔': "𝔔", '𝔖': "𝔖", - '𝔗': "𝔗", '𝔘': "𝔘", '𝔙': "𝔙", '𝔚': "𝔚", '𝔛': "𝔛", '𝔜': "𝔜", '𝔞': "𝔞", '𝔟': "𝔟", - '𝔠': "𝔠", '𝔡': "𝔡", '𝔢': "𝔢", '𝔣': "𝔣", '𝔤': "𝔤", '𝔥': "𝔥", '𝔦': "𝔦", '𝔧': "𝔧", - '𝔨': "𝔨", '𝔩': "𝔩", '𝔪': "𝔪", '𝔫': "𝔫", '𝔬': "𝔬", '𝔭': "𝔭", '𝔮': "𝔮", '𝔯': "𝔯", - '𝔰': "𝔰", '𝔱': "𝔱", '𝔲': "𝔲", '𝔳': "𝔳", '𝔴': "𝔴", '𝔵': "𝔵", '𝔶': "𝔶", '𝔷': "𝔷", - '𝔸': "𝔸", '𝔹': "𝔹", '𝔻': "𝔻", '𝔼': "𝔼", '𝔽': "𝔽", '𝔾': "𝔾", '𝕀': "𝕀", - '𝕁': "𝕁", '𝕂': "𝕂", '𝕃': "𝕃", '𝕄': "𝕄", '𝕆': "𝕆", '𝕊': "𝕊", '𝕋': "𝕋", - '𝕌': "𝕌", '𝕍': "𝕍", '𝕎': "𝕎", '𝕏': "𝕏", '𝕐': "𝕐", '𝕒': "𝕒", '𝕓': "𝕓", - '𝕔': "𝕔", '𝕕': "𝕕", '𝕖': "𝕖", '𝕗': "𝕗", '𝕘': "𝕘", '𝕙': "𝕙", '𝕚': "𝕚", - '𝕛': "𝕛", '𝕜': "𝕜", '𝕝': "𝕝", '𝕞': "𝕞", '𝕟': "𝕟", '𝕠': "𝕠", '𝕡': "𝕡", - '𝕢': "𝕢", '𝕣': "𝕣", '𝕤': "𝕤", '𝕥': "𝕥", '𝕦': "𝕦", '𝕧': "𝕧", '𝕨': "𝕨", - '𝕩': "𝕩", '𝕪': "𝕪", '𝕫': "𝕫", -} -DECMAP = {v: k for k, v in ENCMAP.items()} - - -class HtmlEntityDecodeError(ValueError): - pass - - -def htmlentity_encode(text, errors="strict"): - s = "" - for c in text: - try: - s += ENCMAP[c] - except KeyError: - i = ord(c) - s += "&" + hex(i)[2:].zfill(0) + ";" if i > 0xff else c - return s, len(text) - - -def htmlentity_decode(text, errors="strict"): - s = "" - i = 0 - while i < len(text): - m = re.match(r"&(?:(?:[A-Za-z][A-Za-z0-9]{1,6}){1,4}|[0-9]{4});", text[i:i+30]) - if m: - entity = m.group() - c = unichr(int(entity[1:5], 16)) if entity[1:5].isdigit() and len(entity) == 6 else \ - " " if entity == " " else None - if c: - s += c - else: - try: - s += DECMAP[entity] - except KeyError: - s += handle_error("html-entity", errors, HtmlEntityDecodeError, decode=True)(text[i], i) - i += len(entity) - else: - s += text[i] - i += 1 - 
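# (editor's note) The decoder above makes a single left-to-right pass: each
# offset is probed with a regex for a candidate entity (up to four alphanumeric
# chunks, or exactly four digits parsed as a hex code point), which is resolved
# through DECMAP or unichr() -- chr() in the Python 3 rewrite below -- while
# unknown entities are delegated to handle_error() per the standard codecs
# error modes, e.g. (sketch, assuming codext is installed):
#
#     import codext
#     codext.decode("&DoesNotExist;", "html", "ignore")  # unknown entity is dropped
#     codext.decode("&DoesNotExist;", "html")            # strict: raises HtmlEntityDecodeError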
return s, len(text) - - -add("html", htmlentity_encode, htmlentity_decode, r"^html(?:[-_]?entit(?:y|ies))?$", - extra_exceptions=["HtmlEntityDecodeError"]) - +# -*- coding: UTF-8 -*- +"""HTML entity Codec - html entity content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(html_entities|html-entity)': {'': "<This is a test>"}, + 'enc(html)': {'\u1234': "&1234;"}, + 'dec(html|html_entity)': {'&DoesNotExist;': None}, + 'dec(html_entities|html-entity)': { + '<This is a test>': "", + '<This is a test>': "", + }, +} + + +# source: https://dev.w3.org/html5/html-author/charref +ENCMAP = { + '\t': " ", '\n': " ", '!': "!", '"': """, '#': "#", '$': "$", '%': "%", + '&': "&", '\'': "'", '(': "(", ')': ")", '*': "*", '+': "+", ',': ",", + '.': ".", '/': "/", ':': ":", ';': ";", '<': "<", '=': "=", '>': ">", + '?': "?", '@': "@", '[': "[", '\\': "\", ']': "]", '^': "^", '_': "_", + '`': "`", '{': "{", '|': "|", '}': "}", '¡': "¡", '¢': "¢", + '£': "£", '¤': "¤", '¥': "¥", '¦': "¦", '§': "§", '¨': "¨", '©': "©", + 'ª': "ª", '«': "«", '¬': "¬", '­': "­", '®': "®", '¯': "¯", '°': "°", + '±': "±", '²': "²", '³': "³", '´': "´", 'µ': "µ", '¶': "¶", '·': "·", + '¸': "¸", '¹': "¹", 'º': "º", '»': "»", '¼': "¼", '½': "½", '¾': "¾", + '¿': "¿", 'À': "À", 'Á': "Á", 'Â': "Â", 'Ã': "Ã", 'Ä': "Ä", 'Å': "Å", + 'Æ': "Æ", 'Ç': "Ç", 'È': "È", 'É': "É", 'Ê': "Ê", 'Ë': "Ë", 'Ì': "Ì", + 'Í': "Í", 'Î': "Î", 'Ï': "Ï", 'Ð': "Ð", 'Ñ': "Ñ", 'Ò': "Ò", 'Ó': "Ó", + 'Ô': "Ô", 'Õ': "Õ", 'Ö': "Ö", '×': "×", 'Ø': "Ø", 'Ù': "Ù", 'Ú': "Ú", + 'Û': "Û", 'Ü': "Ü", 'Ý': "Ý", 'Þ': "Þ", 'ß': "ß", 'à': "à", 'á': "á", + 'â': "â", 'ã': "ã", 'ä': "ä", 'å': "å", 'æ': "æ", 'ç': "ç", 'è': "è", + 'é': "é", 'ê': "ê", 'ë': "ë", 'ì': "ì", 'í': "í", 'î': "î", 'ï': "ï", + 'ð': "ð", 'ñ': "ñ", 'ò': "ò", 'ó': "ó", 'ô': "ô", 'õ': "õ", 'ö': "ö", + '÷': "÷", 'ø': "ø", 'ù': "ù", 'ú': "ú", 'û': "û", 'ü': "ü", 'ý': "ý", + 'þ': "þ", 'ÿ': "ÿ", 'Ā': "Ā", 'ā': "ā", 'Ă': "Ă", 'ă': "ă", 'Ą': "Ą", + 'ą': "ą", 'Ć': "Ć", 'ć': "ć", 'Ĉ': "Ĉ", 'ĉ': "ĉ", 'Ċ': "Ċ", 'ċ': "ċ", + 'Č': "Č", 'č': "č", 'Ď': "Ď", 'ď': "ď", 'Đ': "Đ", 'đ': "đ", + 'Ē': "Ē", 'ē': "ē", 'Ė': "Ė", 'ė': "ė", 'Ę': "Ę", 'ę': "ę", 'Ě': "Ě", + 'ě': "ě", 'Ĝ': "Ĝ", 'ĝ': "ĝ", 'Ğ': "Ğ", 'ğ': "ğ", 'Ġ': "Ġ", 'ġ': "ġ", + 'Ģ': "Ģ", 'Ĥ': "Ĥ", 'ĥ': "ĥ", 'Ħ': "Ħ", 'ħ': "ħ", 'Ĩ': "Ĩ", + 'ĩ': "ĩ", 'Ī': "Ī", 'ī': "ī", 'Į': "Į", 'į': "į", 'İ': "İ", 'ı': "ı", + 'IJ': "IJ", 'ij': "ij", 'Ĵ': "Ĵ", 'ĵ': "ĵ", 'Ķ': "Ķ", 'ķ': "ķ", 'ĸ': "ĸ", + 'Ĺ': "Ĺ", 'ĺ': "ĺ", 'Ļ': "Ļ", 'ļ': "ļ", 'Ľ': "Ľ", 'ľ': "ľ", + 'Ŀ': "Ŀ", 'ŀ': "ŀ", 'Ł': "Ł", 'ł': "ł", 'Ń': "Ń", 'ń': "ń", + 'Ņ': "Ņ", 'ņ': "ņ", 'Ň': "Ň", 'ň': "ň", 'ʼn': "ʼn", 'Ŋ': "Ŋ", 'ŋ': "ŋ", + 'Ō': "Ō", 'ō': "ō", 'Ő': "Ő", 'ő': "ő", 'Œ': "Œ", 'œ': "œ", 'Ŕ': "Ŕ", + 'ŕ': "ŕ", 'Ŗ': "Ŗ", 'ŗ': "ŗ", 'Ř': "Ř", 'ř': "ř", 'Ś': "Ś", + 'ś': "ś", 'Ŝ': "Ŝ", 'ŝ': "ŝ", 'Ş': "Ş", 'ş': "ş", 'Š': "Š", + 'š': "š", 'Ţ': "Ţ", 'ţ': "ţ", 'Ť': "Ť", 'ť': "ť", 'Ŧ': "Ŧ", + 'ŧ': "ŧ", 'Ũ': "Ũ", 'ũ': "ũ", 'Ū': "Ū", 'ū': "ū", 'Ŭ': "Ŭ", + 'ŭ': "ŭ", 'Ů': "Ů", 'ů': "ů", 'Ű': "Ű", 'ű': "ű", 'Ų': "Ų", 'ų': "ų", + 'Ŵ': "Ŵ", 'ŵ': "ŵ", 'Ŷ': "Ŷ", 'ŷ': "ŷ", 'Ÿ': "Ÿ", 'Ź': "Ź", 'ź': "ź", + 'Ż': "Ż", 'ż': "ż", 'Ž': "Ž", 'ž': "ž", 'ƒ': "ƒ", 'Ƶ': "Ƶ", 'ǵ': "ǵ", + 'ȷ': "ȷ", 'ˆ': "ˆ", 'ˇ': "ˇ", '˘': "˘", '˙': "˙", '˚': "˚", '˛': "˛", + '˜': "˜", '˝': "˝", '̑': "̑", '̲': "_", 'Α': "Α", 'Β': "Β", + 'Γ': "Γ", 'Δ': "Δ", 'Ε': "Ε", 'Ζ': 
"Ζ", 'Η': "Η", 'Θ': "Θ", 'Ι': "Ι", + 'Κ': "Κ", 'Λ': "Λ", 'Μ': "Μ", 'Ν': "Ν", 'Ξ': "Ξ", 'Ο': "Ο", 'Π': "Π", + 'Ρ': "Ρ", 'Σ': "Σ", 'Τ': "Τ", 'Υ': "Υ", 'Φ': "Φ", 'Χ': "Χ", 'Ψ': "Ψ", + 'Ω': "Ω", 'α': "α", 'β': "β", 'γ': "γ", 'δ': "δ", 'ε': "ϵ", 'ζ': "ζ", + 'η': "η", 'θ': "θ", 'ι': "ι", 'κ': "κ", 'λ': "λ", 'μ': "μ", 'ν': "ν", + 'ξ': "ξ", 'ο': "ο", 'π': "π", 'ρ': "ρ", 'ς': "ς", 'σ': "σ", 'τ': "τ", + 'υ': "υ", 'φ': "φ", 'χ': "χ", 'ψ': "ψ", 'ω': "ω", 'ϑ': "ϑ", 'ϒ': "ϒ", + 'ϕ': "ϕ", 'ϖ': "ϖ", 'Ϝ': "Ϝ", 'ϝ': "ϝ", 'ϰ': "ϰ", 'ϱ': "ϱ", + 'ϵ': "ε", '϶': "϶", 'Ё': "Ё", 'Ђ': "Ђ", 'Ѓ': "Ѓ", 'Є': "Є", 'Ѕ': "Ѕ", + 'І': "І", 'Ї': "Ї", 'Ј': "Ј", 'Љ': "Љ", 'Њ': "Њ", 'Ћ': "Ћ", 'Ќ': "Ќ", + 'Ў': "Ў", 'Џ': "Џ", 'А': "А", 'Б': "Б", 'В': "В", 'Г': "Г", 'Д': "Д", 'Е': "Е", + 'Ж': "Ж", 'З': "З", 'И': "И", 'Й': "Й", 'К': "К", 'Л': "Л", 'М': "М", 'Н': "Н", + 'О': "О", 'П': "П", 'Р': "Р", 'С': "С", 'Т': "Т", 'У': "У", 'Ф': "Ф", 'Х': "Х", + 'Ц': "Ц", 'Ч': "Ч", 'Ш': "Ш", 'Щ': "Щ", 'Ъ': "Ъ", 'Ы': "Ы", 'Ь': "Ь", + 'Э': "Э", 'Ю': "Ю", 'Я': "Я", 'а': "а", 'б': "б", 'в': "в", 'г': "г", 'д': "д", + 'е': "е", 'ж': "ж", 'з': "з", 'и': "и", 'й': "й", 'к': "к", 'л': "л", 'м': "м", + 'н': "н", 'о': "о", 'п': "п", 'р': "р", 'с': "с", 'т': "т", 'у': "у", 'ф': "ф", + 'х': "х", 'ц': "ц", 'ч': "ч", 'ш': "ш", 'щ': "щ", 'ъ': "ъ", 'ы': "ы", + 'ь': "ь", 'э': "э", 'ю': "ю", 'я': "я", 'ё': "ё", 'ђ': "ђ", 'ѓ': "ѓ", + 'є': "є", 'ѕ': "ѕ", 'і': "і", 'ї': "ї", 'ј': "ј", 'љ': "љ", 'њ': "њ", + 'ћ': "ћ", 'ќ': "ќ", 'ў': "ў", 'џ': "џ", '\u2002': " ", '\u2003': " ", + '\u2004': " ", '\u2005': " ", '\u2007': " ", '\u2008': " ", '\u2009': " ", + '\u200a': " ", '​\u200b': "​", '\u200c': "‌", '\u200d': "‍", '\u200e': "‎", + '\u200f': "‏", '‐': "‐", '–': "–", '—': "—", + '―': "―", '‖': "‖", '‘': "‘", '’': "’", '‚': "‚", '“': "“", '”': "”", + '„': "„", '†': "†", '‡': "‡", '•': "•", '‥': "‥", '…': "…", '‰': "‰", + '‱': "‱", '′': "′", '″': "″", '‴': "‴", '‵': "‵", '‹': "‹", + '›': "›", '‾': "‾", '⁁': "⁁", '⁃': "⁃", '⁄': "⁄", '⁏': "⁏", '⁗': "⁗", + '\u205f': " ", '⁠': "⁠", '⁡': "⁡", '⁢': "⁢", '⁣': "⁣", + '€': "€", '⃛': "⃛", '⃜': "⃜", 'ℂ': "ℂ", '℅': "℅", 'ℊ': "ℊ", 'ℋ': "ℋ", + 'ℌ': "ℌ", 'ℍ': "ℍ", 'ℎ': "ℎ", 'ℏ': "ℏ", 'ℐ': "ℐ", 'ℑ': "ℑ", + 'ℒ': "ℒ", 'ℓ': "ℓ", 'ℕ': "ℕ", '№': "№", '℗': "℗", '℘': "℘", 'ℙ': "ℙ", + 'ℚ': "ℚ", 'ℛ': "ℛ", 'ℜ': "ℜ", 'ℝ': "ℝ", '℞': "℞", '™': "™", 'ℤ': "ℤ", + 'Ω': "Ω", '℧': "℧", 'ℨ': "ℨ", '℩': "℩", 'Å': "Å", 'ℬ': "ℬ", 'ℭ': "ℭ", + 'ℯ': "ℯ", 'ℰ': "ℰ", 'ℱ': "ℱ", 'ℳ': "ℳ", 'ℴ': "ℴ", 'ℵ': "ℵ", 'ℶ': "ℶ", + 'ℷ': "ℷ", 'ℸ': "ℸ", 'ⅅ': "ⅅ", 'ⅆ': "ⅆ", 'ⅇ': "ⅇ", + 'ⅈ': "ⅈ", '⅓': "⅓", '⅔': "⅔", '⅕': "⅕", '⅖': "⅖", '⅗': "⅗", + '⅘': "⅘", '⅙': "⅙", '⅚': "⅚", '⅛': "⅛", '⅜': "⅜", '⅝': "⅝", + '⅞': "⅞", '←': "←", '↑': "↑", '→': "→", '↓': "↓", '↔': "↔", '↕': "↕", + '↖': "↖", '↗': "↗", '↘': "↘", '↙': "↙", '↚': "↚", '↛': "↛", '↝': "↝", + '↞': "↞", '↟': "↟", '↠': "↠", '↡': "↡", '↢': "↢", '↣': "↣", + '↤': "↤", '↥': "↥", '↦': "↦", '↧': "↧", '↩': "↩", '↪': "↪", + '↫': "↫", '↬': "↬", '↭': "↭", '↮': "↮", '↰': "↰", '↱': "↱", '↲': "↲", + '↳': "↳", '↵': "↵", '↶': "↶", '↷': "↷", '↺': "↺", '↻': "↻", '↼': "↼", + '↽': "↽", '↾': "↾", '↿': "↿", '⇀': "⇀", '⇁': "⇁", '⇂': "⇂", '⇃': "⇃", + '⇄': "⇄", '⇅': "⇅", '⇆': "⇆", '⇇': "⇇", '⇈': "⇈", '⇉': "⇉", '⇊': "⇊", + '⇋': "⇋", '⇌': "⇌", '⇍': "⇍", '⇎': "⇎", '⇏': "⇏", '⇐': "⇐", '⇑': "⇑", + '⇒': "⇒", '⇓': "⇓", '⇔': "⇔", '⇕': "⇕", '⇖': "⇖", '⇗': "⇗", '⇘': "⇘", + '⇙': "⇙", '⇚': "⇚", '⇛': "⇛", '⇝': "⇝", '⇤': "⇤", '⇥': "⇥", '⇵': "⇵", + '⇽': "⇽", '⇾': "⇾", '⇿': "⇿", '∀': "∀", '∁': "∁", '∂': "∂", '∃': "∃", + '∄': "∄", '∅': "∅", '∇': "∇", '∈': "∈", '∉': 
"∉", '∋': "∋", '∌': "∌", + '∏': "∏", '∐': "∐", '∑': "∑", '−': "−", '∓': "∓", '∔': "∔", '∖': "∖", + '∗': "∗", '∘': "∘", '√': "√", '∝': "∝", '∞': "∞", '∟': "∟", '∠': "∠", + '∡': "∡", '∢': "∢", '∣': "∣", '∤': "∤", '∥': "∥", '∦': "∦", '∧': "∧", + '∨': "∨", '∩': "∩", '∪': "∪", '∫': "∫", '∬': "∬", '∭': "∭", '∮': "∮", + '∯': "∯", '∰': "∰", '∱': "∱", '∲': "∲", '∳': "∳", '∴': "∴", + '∵': "∵", '∶': "∶", '∷': "∷", '∸': "∸", '∺': "∺", '∻': "∻", '∼': "∼", + '∽': "∽", '∾': "∾", '∿': "∿", '≀': "≀", '≁': "≁", '≂': "≂", '≃': "≃", + '≄': "≄", '≅': "≅", '≆': "≆", '≇': "≇", '≈': "≈", '≉': "≉", '≊': "≊", + '≋': "≋", '≌': "≌", '≍': "≍", '≎': "≎", '≏': "≏", '≐': "≐", '≑': "≑", + '≒': "≒", '≓': "≓", '≔': "≔", '≕': "≕", '≖': "≖", '≗': "≗", '≙': "≙", + '≚': "≚", '≜': "≜", '≟': "≟", '≠': "≠", '≡': "≡", '≢': "≢", '≤': "≤", + '≥': "≥", '≦': "≦", '≧': "≧", '≨': "≨", '≩': "≩", '≪': "≪", '≫': "≫", '≬': "≬", + '≭': "≭", '≮': "≮", '≯': "≯", '≰': "≰", '≱': "≱", '≲': "≲", '≳': "≳", + '≴': "≴", '≵': "≵", '≶': "≶", '≷': "≷", '≸': "≸", '≹': "≹", '≺': "≺", '≻': "≻", + '≼': "≼", '≽': "≽", '≾': "≾", '≿': "≿", '⊀': "⊀", '⊁': "⊁", '⊂': "⊂", + '⊃': "⊃", '⊄': "⊄", '⊅': "⊅", '⊆': "⊆", '⊇': "⊇", '⊈': "⊈", '⊉': "⊉", + '⊊': "⊊", '⊋': "⊋", '⊍': "⊍", '⊎': "⊎", '⊏': "⊏", '⊐': "⊐", '⊑': "⊑", + '⊒': "⊒", '⊓': "⊓", '⊔': "⊔", '⊕': "⊕", '⊖': "⊖", '⊗': "⊗", '⊘': "⊘", + '⊙': "⊙", '⊚': "⊚", '⊛': "⊛", '⊝': "⊝", '⊞': "⊞", '⊟': "⊟", '⊠': "⊠", + '⊡': "⊡", '⊢': "⊢", '⊣': "⊣", '⊤': "⊤", '⊥': "⊥", '⊧': "⊧", '⊨': "⊨", + '⊩': "⊩", '⊪': "⊪", '⊫': "⊫", '⊬': "⊬", '⊭': "⊭", '⊮': "⊮", + '⊯': "⊯", '⊰': "⊰", '⊲': "⊲", '⊳': "⊳", '⊴': "⊴", '⊵': "⊵", '⊶': "⊶", + '⊷': "⊷", '⊸': "⊸", '⊹': "⊹", '⊺': "⊺", '⊻': "⊻", '⊽': "⊽", + '⊾': "⊾", '⊿': "⊿", '⋀': "⋀", '⋁': "⋁", '⋂': "⋂", '⋃': "⋃", '⋄': "⋄", + '⋅': "⋅", '⋆': "⋆", '⋇': "⋇", '⋈': "⋈", '⋉': "⋉", '⋊': "⋊", + '⋋': "⋋", '⋌': "⋌", '⋍': "⋍", '⋎': "⋎", '⋏': "⋏", '⋐': "⋐", '⋑': "⋑", + '⋒': "⋒", '⋓': "⋓", '⋔': "⋔", '⋕': "⋕", '⋖': "⋖", '⋗': "⋗", '⋘': "⋘", '⋙': "⋙", + '⋚': "⋚", '⋛': "⋛", '⋞': "⋞", '⋟': "⋟", '⋠': "⋠", '⋡': "⋡", '⋢': "⋢", + '⋣': "⋣", '⋦': "⋦", '⋧': "⋧", '⋨': "⋨", '⋩': "⋩", '⋪': "⋪", '⋫': "⋫", + '⋬': "⋬", '⋭': "⋭", '⋮': "⋮", '⋯': "⋯", '⋰': "⋰", '⋱': "⋱", '⋲': "⋲", + '⋳': "⋳", '⋴': "⋴", '⋵': "⋵", '⋶': "⋶", '⋷': "⋷", '⋹': "⋹", + '⋺': "⋺", '⋻': "⋻", '⋼': "⋼", '⋽': "⋽", '⋾': "⋾", '⌅': "⌅", '⌆': "⌆", + '⌈': "⌈", '⌉': "⌉", '⌊': "⌊", '⌋': "⌋", '⌌': "⌌", '⌍': "⌍", + '⌎': "⌎", '⌏': "⌏", '⌐': "⌐", '⌒': "⌒", '⌓': "⌓", '⌕': "⌕", + '⌖': "⌖", '⌜': "⌜", '⌝': "⌝", '⌞': "⌞", '⌟': "⌟", '⌢': "⌢", + '⌣': "⌣", '⌭': "⌭", '⌮': "⌮", '⌶': "⌶", '⌽': "⌽", '⌿': "⌿", + '⍼': "⍼", '⎰': "⎰", '⎱': "⎱", '⎴': "⎴", '⎵': "⎵", '⎶': "⎶", + '⏜': "⏜", '⏝': "⏝", '⏞': "⏞", '⏟': "⏟", '⏢': "⏢", + '⏧': "⏧", '␣': "␣", 'Ⓢ': "Ⓢ", '─': "─", '│': "│", '┌': "┌", '┐': "┐", + '└': "└", '┘': "┘", '├': "├", '┤': "┤", '┬': "┬", '┴': "┴", '┼': "┼", + '═': "═", '║': "║", '╒': "╒", '╓': "╓", '╔': "╔", '╕': "╕", '╖': "╖", + '╗': "╗", '╘': "╘", '╙': "╙", '╚': "╚", '╛': "╛", '╜': "╜", '╝': "╝", + '╞': "╞", '╟': "╟", '╠': "╠", '╡': "╡", '╢': "╢", '╣': "╣", '╤': "╤", + '╥': "╥", '╦': "╦", '╧': "╧", '╨': "╨", '╩': "╩", '╪': "╪", '╫': "╫", + '╬': "╬", '▀': "▀", '▄': "▄", '█': "█", '░': "░", '▒': "▒", '▓': "▓", + '□': "□", '▪': "▪", '▫': "▫", '▭': "▭", '▮': "▮", '▱': "▱", + '△': "△", '▴': "▴", '▵': "▵", '▸': "▸", '▹': "▹", '▽': "▽", '▾': "▾", + '▿': "▿", '◂': "◂", '◃': "◃", '◊': "◊", '○': "○", '◬': "◬", '◯': "◯", + '◸': "◸", '◹': "◹", '◺': "◺", '◻': "◻", '◼': "◼", + '★': "★", '☆': "☆", '☎': "☎", '♀': "♀", '♂': "♂", '♠': "♠", '♣': "♣", + '♥': "♥", '♦': "♦", '♪': "♪", '♭': "♭", '♮': 
"♮", '♯': "♯", '✓': "✓", + '✗': "✗", '✠': "✠", '✶': "✶", '❘': "❘", '❲': "❲", '❳': "❳", + '⟦': "⟦", '⟧': "⟧", '⟨': "⟨", '⟩': "⟩", '⟪': "⟪", '⟫': "⟫", '⟬': "⟬", + '⟭': "⟭", '⟵': "⟵", '⟶': "⟶", '⟷': "⟷", '⟸': "⟸", '⟹': "⟹", '⟺': "⟺", + '⟼': "⟼", '⟿': "⟿", '⤂': "⤂", '⤃': "⤃", '⤄': "⤄", '⤅': "⤅", '⤌': "⤌", + '⤍': "⤍", '⤎': "⤎", '⤏': "⤏", '⤐': "⤐", '⤑': "⤑", '⤒': "⤒", + '⤓': "⤓", '⤖': "⤖", '⤙': "⤙", '⤚': "⤚", '⤛': "⤛", '⤜': "⤜", + '⤝': "⤝", '⤞': "⤞", '⤟': "⤟", '⤠': "⤠", '⤣': "⤣", '⤤': "⤤", + '⤥': "⤥", '⤦': "⤦", '⤧': "⤧", '⤨': "⤨", '⤩': "⤩", '⤪': "⤪", + '⤳': "⤳", '⤵': "⤵", '⤶': "⤶", '⤷': "⤷", '⤸': "⤸", '⤹': "⤹", + '⤼': "⤼", '⤽': "⤽", '⥅': "⥅", '⥈': "⥈", '⥉': "⥉", '⥊': "⥊", + '⥋': "⥋", '⥎': "⥎", '⥏': "⥏", '⥐': "⥐", + '⥑': "⥑", '⥒': "⥒", '⥓': "⥓", '⥔': "⥔", + '⥕': "⥕", '⥖': "⥖", '⥗': "⥗", '⥘': "⥘", + '⥙': "⥙", '⥚': "⥚", '⥛': "⥛", '⥜': "⥜", + '⥝': "⥝", '⥞': "⥞", '⥟': "⥟", '⥠': "⥠", + '⥡': "⥡", '⥢': "⥢", '⥣': "⥣", '⥤': "⥤", '⥥': "⥥", '⥦': "⥦", + '⥧': "⥧", '⥨': "⥨", '⥩': "⥩", '⥪': "⥪", '⥫': "⥫", '⥬': "⥬", + '⥭': "⥭", '⥮': "⥮", '⥯': "⥯", '⥰': "⥰", '⥱': "⥱", '⥲': "⥲", + '⥳': "⥳", '⥴': "⥴", '⥵': "⥵", '⥶': "⥶", '⥸': "⥸", '⥹': "⥹", + '⥻': "⥻", '⥼': "⥼", '⥽': "⥽", '⥾': "⥾", '⥿': "⥿", '⦅': "⦅", + '⦆': "⦆", '⦋': "⦋", '⦌': "⦌", '⦍': "⦍", '⦎': "⦎", '⦏': "⦏", + '⦐': "⦐", '⦑': "⦑", '⦒': "⦒", '⦓': "⦓", '⦔': "⦔", '⦕': "⦕", + '⦖': "⦖", '⦚': "⦚", '⦜': "⦜", '⦝': "⦝", '⦤': "⦤", '⦥': "⦥", + '⦦': "⦦", '⦧': "⦧", '⦨': "⦨", '⦩': "⦩", '⦪': "⦪", '⦫': "⦫", + '⦬': "⦬", '⦭': "⦭", '⦮': "⦮", '⦯': "⦯", '⦰': "⦰", '⦱': "⦱", + '⦲': "⦲", '⦳': "⦳", '⦴': "⦴", '⦵': "⦵", '⦶': "⦶", '⦷': "⦷", + '⦹': "⦹", '⦻': "⦻", '⦼': "⦼", '⦾': "⦾", '⦿': "⦿", '⧀': "⧀", '⧁': "⧁", + '⧂': "⧂", '⧃': "⧃", '⧄': "⧄", '⧅': "⧅", '⧉': "⧉", '⧍': "⧍", '⧎': "⧎", + '⧏': "⧏", '⧐': "⧐", '⧚': "∽̱", '⧜': "⧜", '⧝': "⧝", + '⧞': "⧞", '⧣': "⧣", '⧤': "⧤", '⧥': "⧥", '⧫': "⧫", '⧴': "⧴", + '⧶': "⧶", '⨀': "⨀", '⨁': "⨁", '⨂': "⨂", '⨄': "⨄", '⨆': "⨆", '⨌': "⨌", + '⨍': "⨍", '⨐': "⨐", '⨑': "⨑", '⨒': "⨒", '⨓': "⨓", '⨔': "⨔", + '⨕': "⨕", '⨖': "⨖", '⨗': "⨗", '⨢': "⨢", '⨣': "⨣", '⨤': "⨤", + '⨥': "⨥", '⨦': "⨦", '⨧': "⨧", '⨩': "⨩", '⨪': "⨪", '⨭': "⨭", + '⨮': "⨮", '⨯': "⨯", '⨰': "⨰", '⨱': "⨱", '⨳': "⨳", '⨴': "⨴", + '⨵': "⨵", '⨶': "⨶", '⨷': "⨷", '⨸': "⨸", '⨹': "⨹", '⨺': "⨺", + '⨻': "⨻", '⨼': "⨼", '⨿': "⨿", '⩀': "⩀", '⩂': "⩂", '⩃': "⩃", '⩄': "⩄", + '⩅': "⩅", '⩆': "⩆", '⩇': "⩇", '⩈': "⩈", '⩉': "⩉", '⩊': "⩊", + '⩋': "⩋", '⩌': "⩌", '⩍': "⩍", '⩐': "⩐", '⩓': "⩓", '⩔': "⩔", '⩕': "⩕", + '⩖': "⩖", '⩗': "⩗", '⩘': "⩘", '⩚': "⩚", '⩛': "⩛", '⩜': "⩜", '⩝': "⩝", + '⩟': "⩟", '⩦': "⩦", '⩪': "⩪", '⩭': "⩭", '⩮': "⩮", '⩯': "⩯", '⩰': "⩰", + '⩱': "⩱", '⩲': "⩲", '⩳': "⩳", '⩴': "⩴", '⩵': "⩵", '⩷': "⩷", '⩸': "⩸", + '⩹': "⩹", '⩺': "⩺", '⩻': "⩻", '⩼': "⩼", '⩽': "⩽", '⩾': "⩾", '⩿': "⩿", + '⪀': "⪀", '⪁': "⪁", '⪂': "⪂", '⪃': "⪃", '⪄': "⪄", '⪅': "⪅", + '⪆': "⪆", '⪇': "⪇", '⪈': "⪈", '⪉': "⪉", '⪊': "⪊", '⪋': "⪋", '⪌': "⪌", '⪍': "⪍", + '⪎': "⪎", '⪏': "⪏", '⪐': "⪐", '⪑': "⪑", '⪒': "⪒", '⪓': "⪓", '⪔': "⪔", + '⪕': "⪕", '⪖': "⪖", '⪗': "⪗", '⪘': "⪘", '⪙': "⪙", '⪚': "⪚", '⪝': "⪝", + '⪞': "⪞", '⪟': "⪟", '⪠': "⪠", '⪡': "⪡", '⪢': "⪢", '⪤': "⪤", + '⪥': "⪥", '⪦': "⪦", '⪧': "⪧", '⪨': "⪨", '⪩': "⪩", '⪪': "⪪", '⪫': "⪫", + '⪬': "⪬", '⪭': "⪭", '⪮': "⪮", '⪯': "⪯", '⪰': "⪰", '⪳': "⪳", '⪴': "⪴", + '⪵': "⪵", '⪶': "⪶", '⪷': "⪷", '⪸': "⪸", '⪹': "⪹", '⪺': "⪺", '⪻': "⪻", + '⪼': "⪼", '⪽': "⪽", '⪾': "⪾", '⪿': "⪿", '⫀': "⫀", '⫁': "⫁", + '⫂': "⫂", '⫃': "⫃", '⫄': "⫄", '⫅': "⫅", '⫆': "⫆", '⫇': "⫇", + '⫈': "⫈", '⫋': "⫋", '⫌': "⫌", '⫏': "⫏", '⫐': "⫐", '⫑': "⫑", '⫒': "⫒", + '⫓': "⫓", '⫔': "⫔", '⫕': "⫕", '⫖': "⫖", '⫗': "⫗", '⫘': "⫘", + '⫙': "⫙", '⫚': "⫚", '⫛': 
"⫛", '⫤': "⫤", '⫦': "⫦", '⫧': "⫧", '⫨': "⫨", + '⫩': "⫩", '⫫': "⫫", '⫬': "⫬", '⫭': "⫭", '⫮': "⫮", '⫯': "⫯", '⫰': "⫰", + '⫱': "⫱", '⫲': "⫲", '⫳': "⫳", '⫽': "⫽", 'ff': "ff", 'fi': "fi", 'fl': "fl", + 'ffi': "ffi", 'ffl': "ffl", '𝒜': "𝒜", '𝒞': "𝒞", '𝒟': "𝒟", '𝒢': "𝒢", '𝒥': "𝒥", + '𝒦': "𝒦", '𝒩': "𝒩", '𝒪': "𝒪", '𝒫': "𝒫", '𝒬': "𝒬", '𝒮': "𝒮", '𝒯': "𝒯", + '𝒰': "𝒰", '𝒱': "𝒱", '𝒲': "𝒲", '𝒳': "𝒳", '𝒴': "𝒴", '𝒵': "𝒵", '𝒶': "𝒶", + '𝒷': "𝒷", '𝒸': "𝒸", '𝒹': "𝒹", '𝒻': "𝒻", '𝒽': "𝒽", '𝒾': "𝒾", '𝒿': "𝒿", + '𝓀': "𝓀", '𝓁': "𝓁", '𝓂': "𝓂", '𝓃': "𝓃", '𝓅': "𝓅", '𝓆': "𝓆", '𝓇': "𝓇", + '𝓈': "𝓈", '𝓉': "𝓉", '𝓊': "𝓊", '𝓋': "𝓋", '𝓌': "𝓌", '𝓍': "𝓍", '𝓎': "𝓎", + '𝓏': "𝓏", '𝔄': "𝔄", '𝔅': "𝔅", '𝔇': "𝔇", '𝔈': "𝔈", '𝔉': "𝔉", '𝔊': "𝔊", '𝔍': "𝔍", + '𝔎': "𝔎", '𝔏': "𝔏", '𝔐': "𝔐", '𝔑': "𝔑", '𝔒': "𝔒", '𝔓': "𝔓", '𝔔': "𝔔", '𝔖': "𝔖", + '𝔗': "𝔗", '𝔘': "𝔘", '𝔙': "𝔙", '𝔚': "𝔚", '𝔛': "𝔛", '𝔜': "𝔜", '𝔞': "𝔞", '𝔟': "𝔟", + '𝔠': "𝔠", '𝔡': "𝔡", '𝔢': "𝔢", '𝔣': "𝔣", '𝔤': "𝔤", '𝔥': "𝔥", '𝔦': "𝔦", '𝔧': "𝔧", + '𝔨': "𝔨", '𝔩': "𝔩", '𝔪': "𝔪", '𝔫': "𝔫", '𝔬': "𝔬", '𝔭': "𝔭", '𝔮': "𝔮", '𝔯': "𝔯", + '𝔰': "𝔰", '𝔱': "𝔱", '𝔲': "𝔲", '𝔳': "𝔳", '𝔴': "𝔴", '𝔵': "𝔵", '𝔶': "𝔶", '𝔷': "𝔷", + '𝔸': "𝔸", '𝔹': "𝔹", '𝔻': "𝔻", '𝔼': "𝔼", '𝔽': "𝔽", '𝔾': "𝔾", '𝕀': "𝕀", + '𝕁': "𝕁", '𝕂': "𝕂", '𝕃': "𝕃", '𝕄': "𝕄", '𝕆': "𝕆", '𝕊': "𝕊", '𝕋': "𝕋", + '𝕌': "𝕌", '𝕍': "𝕍", '𝕎': "𝕎", '𝕏': "𝕏", '𝕐': "𝕐", '𝕒': "𝕒", '𝕓': "𝕓", + '𝕔': "𝕔", '𝕕': "𝕕", '𝕖': "𝕖", '𝕗': "𝕗", '𝕘': "𝕘", '𝕙': "𝕙", '𝕚': "𝕚", + '𝕛': "𝕛", '𝕜': "𝕜", '𝕝': "𝕝", '𝕞': "𝕞", '𝕟': "𝕟", '𝕠': "𝕠", '𝕡': "𝕡", + '𝕢': "𝕢", '𝕣': "𝕣", '𝕤': "𝕤", '𝕥': "𝕥", '𝕦': "𝕦", '𝕧': "𝕧", '𝕨': "𝕨", + '𝕩': "𝕩", '𝕪': "𝕪", '𝕫': "𝕫", +} +DECMAP = {v: k for k, v in ENCMAP.items()} + + +class HtmlEntityDecodeError(ValueError): + pass + + +def htmlentity_encode(text, errors="strict"): + s = "" + for c in text: + try: + s += ENCMAP[c] + except KeyError: + i = ord(c) + s += "&" + hex(i)[2:].zfill(0) + ";" if i > 0xff else c + return s, len(text) + + +def htmlentity_decode(text, errors="strict"): + s = "" + i = 0 + while i < len(text): + m = re.match(r"&(?:(?:[A-Za-z][A-Za-z0-9]{1,6}){1,4}|[0-9]{4});", text[i:i+30]) + if m: + entity = m.group() + c = chr(int(entity[1:5], 16)) if entity[1:5].isdigit() and len(entity) == 6 else \ + " " if entity == " " else None + if c: + s += c + else: + try: + s += DECMAP[entity] + except KeyError: + s += handle_error("html-entity", errors, HtmlEntityDecodeError, decode=True)(text[i], i) + i += len(entity) + else: + s += text[i] + i += 1 + return s, len(text) + + +add("html", htmlentity_encode, htmlentity_decode, r"^html(?:[-_]?entit(?:y|ies))?$", + extra_exceptions=["HtmlEntityDecodeError"]) + diff --git a/tests/test_base.py b/tests/test_base.py index 7b3dae0..a37d1a6 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -1,236 +1,235 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -"""Base codecs tests. 
- -""" -import os -import sys -from unittest import TestCase - -from codext.__common__ import * -from codext.base._base import _generate_charset -from codext.base.baseN import base, main2, main32, main64url - - -class TestCodecsBase(TestCase): - def setUp(self): - global STR - STR = "this is a test" - - def test_new_base_codec(self): - for i in [0, 1, 256]: - self.assertRaises(ValueError, _generate_charset, i) - b10 = lambda *a: "0123456789" - base(b10, "base10") - B10 = "2361031878030638688519054699098996" - self.assertEqual(codecs.encode(STR, "base10"), B10) - self.assertEqual(codecs.encode(b(STR), "base10"), b(B10)) - self.assertEqual(codecs.decode(B10, "base10"), STR) - self.assertEqual(codecs.decode(b(B10), "base10"), b(STR)) - self.assertRaises(ValueError, base, 1, "test") - b11 = "0123456789a" - base(b11, "base11") - B11 = "113342054335735319526632a26972419" - self.assertEqual(codecs.encode(STR, "base11"), B11) - self.assertEqual(codecs.decode(B11, "base11"), STR) - self.assertRaises(ValueError, base, object(), "test") - self.assertIsNone(base({'': "01234"}, r"^base5(test)?$")) - self.assertIsNotNone(codecs.encode(STR, "base5test")) - self.assertRaises(ValueError, base, {'': "01234"}, "base5-test", pow2=True) - self.assertEqual("", codecs.decode("", "base5test")) - - def test_codec_base1(self): - C = "A" - for i in range(3): - self.assertIsNotNone(codecs.encode(i * C, "base1")) - self.assertRaises(ValueError, codecs.encode, 4 * C, "unary") - self.assertEqual(codecs.decode("AAAAA", "base1"), "\x05") - - def test_codec_base2(self): - STR = "test" - B2 = "01110100011001010111001101110100" - self.assertEqual(codecs.encode(STR, "base2"), B2) - self.assertEqual(codecs.encode(b(STR), "base2"), b(B2)) - self.assertEqual(codecs.decode(B2, "base2"), STR) - self.assertEqual(codecs.decode(b(B2), "base2"), b(STR)) - B2 = "10001011100110101000110010001011" - self.assertEqual(codecs.encode(STR, "base2-inv"), B2) - self.assertEqual(codecs.decode(B2, "base2-inv"), STR) - B2 = "abbbabaaabbaabababbbaabbabbbabaa" - self.assertEqual(codecs.encode(STR, "base2-ab"), B2) - self.assertEqual(codecs.decode(B2, "base2-ab"), STR) - B2 = "CDDDCDCCCDDCCDCDCDDDCCDDCDDDCDCC" - self.assertEqual(codecs.encode(STR, "base2-CD"), B2) - self.assertEqual(codecs.decode(B2, "base2-CD"), STR) - B2 = "34443433344334343444334434443433" - self.assertEqual(codecs.encode(STR, "base2-34"), B2) - self.assertEqual(codecs.decode(B2, "base2-34"), STR) - - def test_codec_base3(self): - STR = "test" - B3 = "23112113223321323322" - self.assertEqual(codecs.encode(STR, "base3"), B3) - self.assertEqual(codecs.encode(b(STR), "base3"), b(B3)) - self.assertEqual(codecs.decode(B3, "base3"), STR) - self.assertEqual(codecs.decode(b(B3), "base3"), b(STR)) - B3 = "21332331221123121122" - self.assertEqual(codecs.encode(STR, "base3-inv"), B3) - self.assertEqual(codecs.decode(B3, "base3-inv"), STR) - B3 = "bcaabaacbbccbacbccbb" - self.assertEqual(codecs.encode(STR, "base3-abc"), B3) - self.assertEqual(codecs.decode(B3, "base3-abc"), STR) - self.assertRaises(LookupError, codecs.encode, "test", "base3-ab") - self.assertRaises(LookupError, codecs.encode, "test", "base3-abcd") - - def test_codec_base4(self): - STR = "test" - B4 = "2421232224142421" - self.assertEqual(codecs.encode(STR, "base4"), B4) - self.assertEqual(codecs.encode(b(STR), "base4"), b(B4)) - self.assertEqual(codecs.decode(B4, "base4"), STR) - self.assertEqual(codecs.decode(b(B4), "base4"), b(STR)) - B4 = "3134323331413134" - self.assertEqual(codecs.encode(STR, "base4-inv"), B4) - 
self.assertEqual(codecs.decode(B4, "base4-inv"), STR) - B4 = "bdbabcbbbdadbdba" - self.assertEqual(codecs.encode(STR, "base4-abcd"), B4) - self.assertEqual(codecs.decode(B4, "base4-abcd"), STR) - self.assertRaises(LookupError, codecs.encode, "test", "base4-abc") - self.assertRaises(LookupError, codecs.encode, "test", "base4-abcde") - - def test_codec_base8(self): - STR = "test" - B8 = "dfagcfgddfa=====" - self.assertEqual(codecs.encode(STR, "base8"), B8) - self.assertEqual(codecs.encode(b(STR), "base8"), b(B8)) - self.assertEqual(codecs.decode(B8, "base8"), STR) - self.assertEqual(codecs.decode(b(B8), "base8"), b(STR)) - B8 = "echbfcbeech=====" - self.assertEqual(codecs.encode(STR, "base8-inv"), B8) - self.assertEqual(codecs.decode(B8, "base8-inv"), STR) - B8 = "35062563350=====" - self.assertEqual(codecs.encode(STR, "base8-01234567"), B8) - self.assertEqual(codecs.decode(B8, "base8-01234567"), STR) - self.assertRaises(LookupError, codecs.encode, "test", "base8-0123456") - self.assertRaises(LookupError, codecs.encode, "test", "base8-012345678") - - def test_codec_base16(self): - B16 = "7468697320697320612074657374" - self.assertEqual(codecs.encode(STR, "base16"), B16) - self.assertEqual(codecs.encode(b(STR), "base16"), b(B16)) - self.assertEqual(codecs.decode(B16, "base16"), STR) - self.assertEqual(codecs.decode(b(B16), "base16"), b(STR)) - B16 += "?" - self.assertRaises(ValueError, codecs.decode, B16, "base16") - self.assertEqual(codecs.decode(B16, "base16", "ignore"), STR) - self.assertEqual(codecs.decode(B16, "base16", "replace"), STR + "\x00") - self.assertRaises(ValueError, codecs.decode, B16, "base16", "BAD") - STR2 = "=:;" - B16_1 = "3d3a3b" - B16_2 = "3D3A3B" - B16_3 = "3D3a3B" # mixed case: should fail - self.assertEqual(codecs.encode(STR2, "hex"), B16_2) - self.assertEqual(codecs.decode(B16_1, "hex"), STR2) - self.assertEqual(codecs.decode(B16_2, "hex"), STR2) - self.assertRaises(ValueError, codecs.decode, B16_3, "hex") - - def test_codec_base32(self): - for b32, enc in zip(["ORUGS4ZANFZSAYJAORSXG5A=", "qtwg1h3ypf31yajyqt1zg7y=", "EHK6ISP0D5PI0O90EHIN6T0=", - "fjn6kwt0e5tk0s90fjkr6x0=", "EHM6JWS0D5SJ0R90EHJQ6X0="], - ["base32", "zbase32", "base32-hex", "geohash", "crockford"]): - self.assertEqual(codecs.encode(STR, enc), b32) - self.assertEqual(codecs.encode(b(STR), enc), b(b32)) - self.assertEqual(codecs.decode(b32, enc), STR) - self.assertEqual(codecs.decode(b(b32), enc), b(STR)) - self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc) - self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc, "BAD") - - def test_codec_base36(self): - B36 = "4WMHTK6UZL044O91NKCEB8" - self.assertEqual(codecs.encode(STR, "base36"), B36) - self.assertEqual(codecs.encode(b(STR), "base36"), b(B36)) - self.assertEqual(codecs.decode(B36, "base36"), STR) - self.assertEqual(codecs.decode(b(B36), "base36"), b(STR)) - B36 = "E6WR3UG49VAEEYJBXUMOLI" - self.assertEqual(codecs.encode(STR, "base36-inv"), B36) - self.assertEqual(codecs.decode(B36, "base36-inv"), STR) - self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36-inv") - self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36", "BAD") - self.assertEqual(codecs.decode(B36 + "?", "base36-inv", "ignore"), STR) - - def test_codec_base58(self): - B58 = "jo91waLQA1NNeBmZKUF" - self.assertEqual(codecs.encode(STR, "base58"), B58) - self.assertEqual(codecs.encode(b(STR), "base58"), b(B58)) - self.assertEqual(codecs.decode(B58, "base58"), STR) - self.assertEqual(codecs.decode(b(B58), "base58"), b(STR)) - B58 = 
"jo9rA2LQwr44eBmZK7E" - self.assertEqual(codecs.encode(STR, "base58-ripple"), B58) - self.assertEqual(codecs.decode(B58, "base58-rp"), STR) - B58 = "JN91Wzkpa1nnDbLyjtf" - self.assertEqual(codecs.encode(STR, "base58-flickr"), B58) - self.assertEqual(codecs.encode(STR, "base58-shorturl"), B58) - self.assertEqual(codecs.decode(B58, "base58-fl"), STR) - self.assertEqual(codecs.encode(STR, "base58-short-url"), B58) - self.assertEqual(codecs.encode(STR, "base58-url"), B58) - - def test_codec_base62(self): - for b62, enc in zip(["CsoB4HQ5gmgMyCenF7E", "M2yLERaFqwqW8MoxPHO"], ["base62", "base62-inv"]): - self.assertEqual(codecs.encode(STR, enc), b62) - self.assertEqual(codecs.encode(b(STR), enc), b(b62)) - self.assertEqual(codecs.decode(b62, enc), STR) - self.assertEqual(codecs.decode(b(b62), enc), b(STR)) - - def test_codec_base64(self): - for b64, enc in zip(["dGhpcyBpcyBhIHRlc3Q=", "T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]): - self.assertEqual(codecs.encode(STR, enc), b64) - self.assertEqual(codecs.encode(b(STR), enc), b(b64)) - self.assertEqual(codecs.decode(b64, enc), STR) - self.assertEqual(codecs.decode(b(b64), enc), b(STR)) - - def test_codec_base91(self): - for b91, enc in zip([",X,<:WRT%yxth90oZB", ",N,<:MHJ%onjXzqeP1", "Jx&[jv4S3Wg>,71@Jk", "yJy^\\IDFsdc?Tof:L#"], - ["base91", "base91-inv", "base91-alt", "base91-alt-inv"]): - self.assertEqual(codecs.encode(STR, enc), b91) - self.assertEqual(codecs.encode(b(STR), enc), b(b91)) - self.assertEqual(codecs.decode(b91, enc), STR) - self.assertEqual(codecs.decode(b(b91), enc), b(STR)) - self.assertIsNotNone(codecs.encode("\x00\x00", "base91")) - self.assertIsNotNone(codecs.decode("abc", "base91")) - self.assertIsNotNone(codecs.decode("AD", "base91")) - self.assertRaises(ValueError, codecs.decode, "\xff", "base91") - self.assertRaises(ValueError, codecs.decode, "a\xff", "base91") - self.assertIsNotNone(codecs.encode("\x00\x00", "base91-alt")) - - def test_codec_base100(self): - if PY3: - B100 = "\U0001f46b\U0001f45f\U0001f460\U0001f46a\U0001f417\U0001f460\U0001f46a\U0001f417\U0001f458" \ - "\U0001f417\U0001f46b\U0001f45c\U0001f46a\U0001f46b" - self.assertEqual(codecs.encode(STR, "base100"), B100) - self.assertEqual(codecs.encode(b(STR), "base100"), b(B100)) - self.assertEqual(codecs.decode(B100, "base100"), STR) - self.assertEqual(codecs.decode(b(B100), "base100"), b(STR)) - self.assertRaises(ValueError, codecs.decode, b(B100)[1:], "base100") - - def test_codec_base_generic(self): - for n in range(2, 255): - bn = "base{}_generic".format(n) - self.assertEqual(codecs.decode(codecs.encode(STR, bn), bn), STR) - self.assertRaises(LookupError, codecs.decode, "test", "base0-generic") - self.assertRaises(LookupError, codecs.decode, "test", "base1-generic") - self.assertRaises(LookupError, codecs.decode, "test", "base256-generic") - - def test_base_main(self): - tmp = sys.argv[:] - tfile = "test-base-main.txt" - with open(tfile, 'w') as f: - f.write("This is a long test string for the sake of causing line wrapping based on default parameters.") - for swap_arg in [[], ["-s"]]: - sys.argv = [tmp[0], tfile] + swap_arg - for m in main32, main64url: - self.assertEqual(m(), 0) - sys.argv = [tmp[0], tfile, "-d"] + swap_arg - self.assertEqual(main2(), 1) - os.remove(tfile) - sys.argv[:] = tmp - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Base codecs tests. 
+ +""" +import sys +from unittest import TestCase + +from codext.__common__ import * +from codext.base._base import _generate_charset +from codext.base.baseN import base, main2, main32, main64url + + +class TestCodecsBase(TestCase): + def setUp(self): + global STR + STR = "this is a test" + + def test_new_base_codec(self): + for i in [0, 1, 256]: + self.assertRaises(ValueError, _generate_charset, i) + b10 = lambda *a: "0123456789" + base(b10, "base10") + B10 = "2361031878030638688519054699098996" + self.assertEqual(codecs.encode(STR, "base10"), B10) + self.assertEqual(codecs.encode(b(STR), "base10"), b(B10)) + self.assertEqual(codecs.decode(B10, "base10"), STR) + self.assertEqual(codecs.decode(b(B10), "base10"), b(STR)) + self.assertRaises(ValueError, base, 1, "test") + b11 = "0123456789a" + base(b11, "base11") + B11 = "113342054335735319526632a26972419" + self.assertEqual(codecs.encode(STR, "base11"), B11) + self.assertEqual(codecs.decode(B11, "base11"), STR) + self.assertRaises(ValueError, base, object(), "test") + self.assertIsNone(base({'': "01234"}, r"^base5(test)?$")) + self.assertIsNotNone(codecs.encode(STR, "base5test")) + self.assertRaises(ValueError, base, {'': "01234"}, "base5-test", pow2=True) + self.assertEqual("", codecs.decode("", "base5test")) + + def test_codec_base1(self): + C = "A" + for i in range(3): + self.assertIsNotNone(codecs.encode(i * C, "base1")) + self.assertRaises(ValueError, codecs.encode, 4 * C, "unary") + self.assertEqual(codecs.decode("AAAAA", "base1"), "\x05") + + def test_codec_base2(self): + STR = "test" + B2 = "01110100011001010111001101110100" + self.assertEqual(codecs.encode(STR, "base2"), B2) + self.assertEqual(codecs.encode(b(STR), "base2"), b(B2)) + self.assertEqual(codecs.decode(B2, "base2"), STR) + self.assertEqual(codecs.decode(b(B2), "base2"), b(STR)) + B2 = "10001011100110101000110010001011" + self.assertEqual(codecs.encode(STR, "base2-inv"), B2) + self.assertEqual(codecs.decode(B2, "base2-inv"), STR) + B2 = "abbbabaaabbaabababbbaabbabbbabaa" + self.assertEqual(codecs.encode(STR, "base2-ab"), B2) + self.assertEqual(codecs.decode(B2, "base2-ab"), STR) + B2 = "CDDDCDCCCDDCCDCDCDDDCCDDCDDDCDCC" + self.assertEqual(codecs.encode(STR, "base2-CD"), B2) + self.assertEqual(codecs.decode(B2, "base2-CD"), STR) + B2 = "34443433344334343444334434443433" + self.assertEqual(codecs.encode(STR, "base2-34"), B2) + self.assertEqual(codecs.decode(B2, "base2-34"), STR) + + def test_codec_base3(self): + STR = "test" + B3 = "23112113223321323322" + self.assertEqual(codecs.encode(STR, "base3"), B3) + self.assertEqual(codecs.encode(b(STR), "base3"), b(B3)) + self.assertEqual(codecs.decode(B3, "base3"), STR) + self.assertEqual(codecs.decode(b(B3), "base3"), b(STR)) + B3 = "21332331221123121122" + self.assertEqual(codecs.encode(STR, "base3-inv"), B3) + self.assertEqual(codecs.decode(B3, "base3-inv"), STR) + B3 = "bcaabaacbbccbacbccbb" + self.assertEqual(codecs.encode(STR, "base3-abc"), B3) + self.assertEqual(codecs.decode(B3, "base3-abc"), STR) + self.assertRaises(LookupError, codecs.encode, "test", "base3-ab") + self.assertRaises(LookupError, codecs.encode, "test", "base3-abcd") + + def test_codec_base4(self): + STR = "test" + B4 = "2421232224142421" + self.assertEqual(codecs.encode(STR, "base4"), B4) + self.assertEqual(codecs.encode(b(STR), "base4"), b(B4)) + self.assertEqual(codecs.decode(B4, "base4"), STR) + self.assertEqual(codecs.decode(b(B4), "base4"), b(STR)) + B4 = "3134323331413134" + self.assertEqual(codecs.encode(STR, "base4-inv"), B4) + 
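# (editor's note) In the baseN variants tested here, the suffix after "baseN-"
# is a codec parameter: "inv" inverts the default alphabet, while a literal
# string such as "abcd" is taken as the alphabet itself and must contain
# exactly N characters, hence the LookupError assertions for "base4-abc" and
# "base4-abcde".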
self.assertEqual(codecs.decode(B4, "base4-inv"), STR) + B4 = "bdbabcbbbdadbdba" + self.assertEqual(codecs.encode(STR, "base4-abcd"), B4) + self.assertEqual(codecs.decode(B4, "base4-abcd"), STR) + self.assertRaises(LookupError, codecs.encode, "test", "base4-abc") + self.assertRaises(LookupError, codecs.encode, "test", "base4-abcde") + + def test_codec_base8(self): + STR = "test" + B8 = "dfagcfgddfa=====" + self.assertEqual(codecs.encode(STR, "base8"), B8) + self.assertEqual(codecs.encode(b(STR), "base8"), b(B8)) + self.assertEqual(codecs.decode(B8, "base8"), STR) + self.assertEqual(codecs.decode(b(B8), "base8"), b(STR)) + B8 = "echbfcbeech=====" + self.assertEqual(codecs.encode(STR, "base8-inv"), B8) + self.assertEqual(codecs.decode(B8, "base8-inv"), STR) + B8 = "35062563350=====" + self.assertEqual(codecs.encode(STR, "base8-01234567"), B8) + self.assertEqual(codecs.decode(B8, "base8-01234567"), STR) + self.assertRaises(LookupError, codecs.encode, "test", "base8-0123456") + self.assertRaises(LookupError, codecs.encode, "test", "base8-012345678") + + def test_codec_base16(self): + B16 = "7468697320697320612074657374" + self.assertEqual(codecs.encode(STR, "base16"), B16) + self.assertEqual(codecs.encode(b(STR), "base16"), b(B16)) + self.assertEqual(codecs.decode(B16, "base16"), STR) + self.assertEqual(codecs.decode(b(B16), "base16"), b(STR)) + B16 += "?" + self.assertRaises(ValueError, codecs.decode, B16, "base16") + self.assertEqual(codecs.decode(B16, "base16", "ignore"), STR) + self.assertEqual(codecs.decode(B16, "base16", "replace"), STR + "\x00") + self.assertRaises(ValueError, codecs.decode, B16, "base16", "BAD") + STR2 = "=:;" + B16_1 = "3d3a3b" + B16_2 = "3D3A3B" + B16_3 = "3D3a3B" # mixed case: should fail + self.assertEqual(codecs.encode(STR2, "hex"), B16_2) + self.assertEqual(codecs.decode(B16_1, "hex"), STR2) + self.assertEqual(codecs.decode(B16_2, "hex"), STR2) + self.assertRaises(ValueError, codecs.decode, B16_3, "hex") + + def test_codec_base32(self): + for b32, enc in zip(["ORUGS4ZANFZSAYJAORSXG5A=", "qtwg1h3ypf31yajyqt1zg7y=", "EHK6ISP0D5PI0O90EHIN6T0=", + "fjn6kwt0e5tk0s90fjkr6x0=", "EHM6JWS0D5SJ0R90EHJQ6X0="], + ["base32", "zbase32", "base32-hex", "geohash", "crockford"]): + self.assertEqual(codecs.encode(STR, enc), b32) + self.assertEqual(codecs.encode(b(STR), enc), b(b32)) + self.assertEqual(codecs.decode(b32, enc), STR) + self.assertEqual(codecs.decode(b(b32), enc), b(STR)) + self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc) + self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc, "BAD") + + def test_codec_base36(self): + B36 = "4WMHTK6UZL044O91NKCEB8" + self.assertEqual(codecs.encode(STR, "base36"), B36) + self.assertEqual(codecs.encode(b(STR), "base36"), b(B36)) + self.assertEqual(codecs.decode(B36, "base36"), STR) + self.assertEqual(codecs.decode(b(B36), "base36"), b(STR)) + B36 = "E6WR3UG49VAEEYJBXUMOLI" + self.assertEqual(codecs.encode(STR, "base36-inv"), B36) + self.assertEqual(codecs.decode(B36, "base36-inv"), STR) + self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36-inv") + self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36", "BAD") + self.assertEqual(codecs.decode(B36 + "?", "base36-inv", "ignore"), STR) + + def test_codec_base58(self): + B58 = "jo91waLQA1NNeBmZKUF" + self.assertEqual(codecs.encode(STR, "base58"), B58) + self.assertEqual(codecs.encode(b(STR), "base58"), b(B58)) + self.assertEqual(codecs.decode(B58, "base58"), STR) + self.assertEqual(codecs.decode(b(B58), "base58"), b(STR)) + B58 = 
"jo9rA2LQwr44eBmZK7E" + self.assertEqual(codecs.encode(STR, "base58-ripple"), B58) + self.assertEqual(codecs.decode(B58, "base58-rp"), STR) + B58 = "JN91Wzkpa1nnDbLyjtf" + self.assertEqual(codecs.encode(STR, "base58-flickr"), B58) + self.assertEqual(codecs.encode(STR, "base58-shorturl"), B58) + self.assertEqual(codecs.decode(B58, "base58-fl"), STR) + self.assertEqual(codecs.encode(STR, "base58-short-url"), B58) + self.assertEqual(codecs.encode(STR, "base58-url"), B58) + + def test_codec_base62(self): + for b62, enc in zip(["CsoB4HQ5gmgMyCenF7E", "M2yLERaFqwqW8MoxPHO"], ["base62", "base62-inv"]): + self.assertEqual(codecs.encode(STR, enc), b62) + self.assertEqual(codecs.encode(b(STR), enc), b(b62)) + self.assertEqual(codecs.decode(b62, enc), STR) + self.assertEqual(codecs.decode(b(b62), enc), b(STR)) + + def test_codec_base64(self): + for b64, enc in zip(["dGhpcyBpcyBhIHRlc3Q=", "T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]): + self.assertEqual(codecs.encode(STR, enc), b64) + self.assertEqual(codecs.encode(b(STR), enc), b(b64)) + self.assertEqual(codecs.decode(b64, enc), STR) + self.assertEqual(codecs.decode(b(b64), enc), b(STR)) + + def test_codec_base91(self): + for b91, enc in zip([",X,<:WRT%yxth90oZB", ",N,<:MHJ%onjXzqeP1", "Jx&[jv4S3Wg>,71@Jk", "yJy^\\IDFsdc?Tof:L#"], + ["base91", "base91-inv", "base91-alt", "base91-alt-inv"]): + self.assertEqual(codecs.encode(STR, enc), b91) + self.assertEqual(codecs.encode(b(STR), enc), b(b91)) + self.assertEqual(codecs.decode(b91, enc), STR) + self.assertEqual(codecs.decode(b(b91), enc), b(STR)) + self.assertIsNotNone(codecs.encode("\x00\x00", "base91")) + self.assertIsNotNone(codecs.decode("abc", "base91")) + self.assertIsNotNone(codecs.decode("AD", "base91")) + self.assertRaises(ValueError, codecs.decode, "\xff", "base91") + self.assertRaises(ValueError, codecs.decode, "a\xff", "base91") + self.assertIsNotNone(codecs.encode("\x00\x00", "base91-alt")) + + def test_codec_base100(self): + B100 = "\U0001f46b\U0001f45f\U0001f460\U0001f46a\U0001f417\U0001f460\U0001f46a\U0001f417\U0001f458\U0001f417" \ + "\U0001f46b\U0001f45c\U0001f46a\U0001f46b" + self.assertEqual(codecs.encode(STR, "base100"), B100) + self.assertEqual(codecs.encode(b(STR), "base100"), b(B100)) + self.assertEqual(codecs.decode(B100, "base100"), STR) + self.assertEqual(codecs.decode(b(B100), "base100"), b(STR)) + self.assertRaises(ValueError, codecs.decode, b(B100)[1:], "base100") + self.assertIsNotNone(codecs.decode(b(B100) + b"\n", "base100", "ignore")) + + def test_codec_base_generic(self): + for n in range(2, 255): + bn = "base{}_generic".format(n) + self.assertEqual(codecs.decode(codecs.encode(STR, bn), bn), STR) + self.assertRaises(LookupError, codecs.decode, "test", "base0-generic") + self.assertRaises(LookupError, codecs.decode, "test", "base1-generic") + self.assertRaises(LookupError, codecs.decode, "test", "base256-generic") + + def test_base_main(self): + tmp = sys.argv[:] + tfile = "test-base-main.txt" + with open(tfile, 'w') as f: + f.write("This is a long test string for the sake of causing line wrapping based on default parameters.") + for swap_arg in [[], ["-s"]]: + sys.argv = [tmp[0], tfile] + swap_arg + for m in main32, main64url: + self.assertEqual(m(), 0) + sys.argv = [tmp[0], tfile, "-d"] + swap_arg + self.assertEqual(main2(), 1) + os.remove(tfile) + sys.argv[:] = tmp + diff --git a/tests/test_common.py b/tests/test_common.py index 8bbf410..407997c 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -1,256 +1,237 @@ -#!/usr/bin/env python -# -*- 
coding: UTF-8 -*- -"""Codecs added assets' tests. - -""" -import codecs -import codext -import json -import random -import sys -from codext.__common__ import CODECS_OVERWRITTEN, PERS_MACROS, PERS_MACROS_FILE -from six import b, binary_type, text_type -from unittest import TestCase - - -PY3 = sys.version[0] == "3" - - -def dummy_encode(input, errors="strict"): - return input, len(input) - - -def dummy_decode(input, errors="strict"): - return input, len(input) - - -def dummy_errored_decode(useless): - raise AttributeError - def decode(input, errors="strict"): - return input, len(input) - return decode - - -def ensure_str(s, encoding='utf-8', errors='strict'): - """ Similar to six.ensure_str. Adapted here to avoid messing up with six version errors. """ - if not PY3 and isinstance(s, text_type): - return s.encode(encoding, errors) - elif PY3 and isinstance(s, binary_type): - try: - return s.decode(encoding, errors) - except: - return s.decode("latin-1") - return s - - -def getregentry(encoding): - if encoding == "dummy3": - return codecs.CodecInfo(name="dummy3", encode=dummy_encode, decode=dummy_decode) - - -class TestCommon(TestCase): - def setUp(self): - codext.reset() - - def test_add_codec(self): - self.assertRaises(ValueError, codext.add, "test") - self.assertRaises(ValueError, codext.add, "test", "BAD") - self.assertRaises(ValueError, codext.add, "test", lambda: None, "BAD") - self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) - self.assertEqual(codext.encode("test", "dummy"), "test") - ci = codext.lookup("dummy") - for k in ["add_to_codecs", "category", "examples", "name", "pattern", "text"]: - self.assertIn(k, ci.parameters.keys()) - self.assertIsNotNone(codext.add("dummy_errored", None, dummy_errored_decode, r"dummy_errored(\d+)$")) - self.assertRaises(AttributeError, codext.lookup, "dummy_errored1") - - def test_add_map_codec(self): - ENCMAP = [{'a': "A", 'b': "B", 'c': "C"}, {'d': "D", 'e': "E", 'f': "F"}, {'g': "G", 'h': "H", 'i': "I"}] - self.assertIsNotNone(codext.add_map("dummy2", ENCMAP, pattern=r"^dummy2(?:[-_]?(\d))?$")) - self.assertRaises(ValueError, codext.add_map, "dummy2", "BAD_ENCMAP") - self.assertEqual(codext.encode("abc", "dummy2"), "ABC") - self.assertEqual(codext.encode("abc", "dummy2-1"), "ABC") - self.assertEqual(codext.encode("def", "dummy2-2"), "DEF") - self.assertEqual(codext.encode("ghi", "dummy2-3"), "GHI") - self.assertRaises(LookupError, codext.encode, "test", "dummy2-4") - ENCMAP = {'': {'a': "A", 'b': "B"}, r'bad': {'a': "B", 'b': "A"}} - self.assertIsNotNone(codext.add_map("dummy3", ENCMAP, pattern=r"^dummy3([-_]inverted)?$")) - self.assertRaises(LookupError, codext.encode, "test", "dummy3_inverted") - self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, ignore_case="BAD") - self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, intype="BAD") - self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, outype="BAD") - ci = codext.lookup("dummy2") - for k in ["category", "encmap", "ignore_case", "intype", "no_error", "outype", "repl_char", "sep", "text"]: - self.assertIn(k, ci.parameters.keys()) - - def test_list_codecs(self): - self.assertTrue(len(codext.list()) > 0) - self.assertTrue(len(codext.list("other")) > 0) - self.assertTrue(len(codext.list("native")) > 0) - self.assertTrue(len(codext.list("non-native")) > 0) - self.assertTrue(len(codext.list("native", "non-native", "crypto", "base")) > 0) - self.assertTrue(len(codext.list("native", "language", "crypto")) > 0) - 
self.assertTrue(len(codext.list("~crypto")) > 0) - self.assertEqual(set(codext.list("~native")), set(codext.list("non-native"))) - self.assertEqual(set(codext.list()), set(codext.list("native") + codext.list("non-native"))) - self.assertRaises(ValueError, codext.list, "BAD_CATEGORY") - self.assertTrue(codext.is_native("base64_codec")) - self.assertFalse(codext.is_native("base64")) - - def test_remove_codec(self): - self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) - self.assertEqual(codext.encode("test", "dummy"), "test") - self.assertIsNone(codext.remove("dummy")) - self.assertRaises(LookupError, codext.encode, "test", "dummy") - # special case, when adding a new codec also to the native codecs registry, then it won't be possible to remove - # it afterwards - self.assertIsNotNone(codecs.add("dummy2", dummy_encode, dummy_decode)) - self.assertEqual(codecs.encode("test", "dummy2"), "test") - self.assertIsNone(codecs.remove("dummy2")) - self.assertEqual(codecs.encode("test", "dummy2"), "test") - self.assertIsNone(codecs.register(getregentry)) - self.assertEqual(codecs.encode("test", "dummy3"), "test") - self.assertIsNone(codecs.remove("dummy3")) - self.assertEqual(codecs.encode("test", "dummy3"), "test") - - def test_clear_codecs(self): - self.assertIsNotNone(codecs.encode("test", "morse")) - self.assertIsNone(codecs.clear()) - self.assertRaises(LookupError, codecs.encode, "test", "morse") - - def test_reset_codecs(self): - self.assertIsNone(codext.reset()) - self.assertIsNotNone(codext.encode("test", "morse")) - self.assertRaises(LookupError, codext.encode, "test", "dummy") - self.assertTrue(len(CODECS_OVERWRITTEN) > 0) - self.assertIsNotNone(str(CODECS_OVERWRITTEN[0])) - - def test_search_codecs(self): - self.assertIsNotNone(codext.search("morse")) - self.assertIsNotNone(codext.search("geohash")) - self.assertIsNotNone(codext.examples("morse")) - self.assertIsNotNone(codext.examples("cp")) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[ab]{1,3}"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=ab)cd"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=-)\w+"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"([^\s])\1"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^\\]"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^a]"))) - - def test_encode_multiple_rounds(self): - if PY3: - self.assertRaises(TypeError, codext.encode, b"test", "utf-8[2]") - s = "test" - for i in range(3): - s = codext.encode(s, "morse") - self.assertEqual(s, codext.encode("test", "morse[3]")) - self.assertIsNotNone(codext.encode("test", "base64[10]")) - - def test_guess_decode(self): - self.assertIsNone(codext.stopfunc._reload_lang()) - self.assertIsNotNone(codext.stopfunc._validate("flag")) - _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None - codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "^test(?:_codec)?$", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) - self.assertIn("test-codec", codext.list_encodings("test")) - self.assertEqual(codext.decode("TEST=", "test"), "TEST") - self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include="test", max_depth=2, - scoring_heuristic=False).items())[0][1], "TEST") - self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include=["test", "base"], - max_depth=2).items())[0][1], "TEST") - STR = "This is a test" - 
self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", max_depth=1))) - self.assertEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "a test", found=["base62"]))) - self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, "base", scoring_heuristic=True, - exclude=["base100"]))) - self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, ["base", "crypto"]))) - self.assertEqual(len(codext.guess("NOT THE ENCODED TEST STRING", "a test", max_depth=1, exclude=None)), 0) - self.assertIn("F1@9", _l(codext.guess("VGVzdCBGMUA5ICE=", codext.stopfunc.flag, max_depth=1, stop=False, - show=True))) - self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", - exclude=("base64", "base64-url"))), 0) - self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", - scoring_heuristic=True, exclude=("base64", "base64-url", "atbash"))), 0) - self.assertRaises(ValueError, codext.guess, STR, max_depth=0) - self.assertRaises(ValueError, codext.guess, STR, exclude=42) - for c in ["base", "language", "native", "stegano"]: - e = codext.list(c) - random.shuffle(e) - for ename in e[:10]: - for encoding in codext.lookup(ename).parameters.get('guess', [ename])[:10]: - try: - enc = codext.encode(STR, encoding) - except (NotImplementedError, ValueError): - continue - except TypeError: - enc = codext.encode(b(STR), encoding) - if codext.decode(enc, encoding) == STR: - continue - for found_encodings, found_dec in codext.guess(enc, "a test", 0, 1, [c], - scoring_heuristic=True, debug=True).items(): - self.assertEqual(ensure_str(STR).lower(), ensure_str(found_dec).lower()) - if c != "base": - # do not check for base as the guessed encoding name can be different, e.g.: - # actual: base2 - # guessed: base2-generic - if "-icase" in encoding: - self.assertEqual(encoding.lower(), found_encodings[0].lower()) - else: - self.assertEqual(encoding, found_encodings[0]) - txt = "".join(chr(i) for i in range(256)) - b64 = codext.encode(txt, "base64") - self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True, include="base"))) - self.assertRaises(ValueError, codext.stopfunc._reload_lang, "DOES_NOT_EXIST") - - def test_rank_input(self): - codext.remove("test_codec") - self.assertRaises(LookupError, codext.encode, "TEST", "test") - codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "^test(?:_codec)?$", padding_char="=", no_error=True, penalty=1.) - STR = "This is a test string !" 
- ENC = codext.encode(STR, "base64") - self.assertTrue(len(codext.rank(ENC)) > 20) - self.assertEqual(len(codext.rank(ENC, limit=20)), 20) - self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url", "base64-inv"]) - self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR) - self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR) - self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR) - self.assertIsNotNone(codext.rank("TEST=", include=["test", "base"])[0][0][1], "TEST") - - def test_handle_macros(self): - MACRO = "test-macro-f2ca1bb6c7e907d06dafe4687e579fce76b37e4e93b7605022da52e6ccc26fd2" - STR = "this is a test" - ENC = "H4sIAMrbkmEC/0txzyhIrnQC4QxPj6CcZONAWwAMIDOIFAAAAA==" - codext.remove(MACRO) - l = codext.list_macros() - self.assertTrue(len(l) > 0) - cm = codext.lookup("example-macro") - self.assertIsNotNone(cm) - self.assertRaises(LookupError, codext.lookup, "example-macro", False) - self.assertRaises(ValueError, codext.add_macro, "example-macro", "base64") - self.assertRaises(ValueError, codext.add_macro, "base64", "base91") - self.assertIsNotNone(repr(cm)) - self.assertTrue(hasattr(cm, "parameters")) - self.assertRaises(LookupError, codext.lookup, MACRO) - self.assertIsNone(codext.add_macro(MACRO, "base64", "gzip", "base64")) - self.assertIn(MACRO, codext.list_macros()) - self.assertIsNotNone(codext.encode(STR, MACRO)) - self.assertEqual(codext.decode(ENC, MACRO), STR) - # insert a bad entry for the list of encodings in the JSON file - PERS_MACROS[MACRO] = "not a list or tuple..." - with open(PERS_MACROS_FILE, 'w') as f: - json.dump(PERS_MACROS, f) - codext.reset() - self.assertRaises(ValueError, codext.lookup, MACRO) - self.assertIsNone(codext.remove(MACRO)) - self.assertRaises(LookupError, codext.lookup, MACRO) - self.assertNotIn(MACRO, codext.list_macros()) - self.assertIsNone(codext.remove("THIS-MACRO-DOES-NOT-EXIST")) - self.assertIsNone(codext.remove("VALID-MACRO")) - self.assertIsNone(codext.add_macro("VALID-MACRO", "gzip", "base64")) - self.assertIsNone(codext.remove("VALID-MACRO")) - if PY3: - self.assertIsNone(codext.add_macro("VALID-MACRO", "lzma", "base64")) - self.assertIsNone(codext.remove("VALID-MACRO")) - self.assertRaises(ValueError, codext.add_macro, "SHALL-FAIL", "base26", "sms", "letter-indices") - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Codecs added assets' tests. 
+ +""" +import codext +import json +import random +import sys +from codext.__common__ import * +from codext.__common__ import CODECS_OVERWRITTEN, PERS_MACROS, PERS_MACROS_FILE +from unittest import TestCase + + +def dummy_encode(input, errors="strict"): + return input, len(input) + + +def dummy_decode(input, errors="strict"): + return input, len(input) + + +def dummy_errored_decode(useless): + raise AttributeError + def decode(input, errors="strict"): + return input, len(input) + return decode + + +def getregentry(encoding): + if encoding == "dummy3": + return codecs.CodecInfo(name="dummy3", encode=dummy_encode, decode=dummy_decode) + + +class TestCommon(TestCase): + def setUp(self): + codext.reset() + + def test_add_codec(self): + self.assertRaises(ValueError, codext.add, "test") + self.assertRaises(ValueError, codext.add, "test", "BAD") + self.assertRaises(ValueError, codext.add, "test", lambda: None, "BAD") + self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) + self.assertEqual(codext.encode("test", "dummy"), "test") + ci = codext.lookup("dummy") + for k in ["add_to_codecs", "category", "examples", "name", "pattern", "text"]: + self.assertIn(k, ci.parameters.keys()) + self.assertIsNotNone(codext.add("dummy_errored", None, dummy_errored_decode, r"dummy_errored(\d+)$")) + self.assertRaises(AttributeError, codext.lookup, "dummy_errored1") + + def test_add_map_codec(self): + ENCMAP = [{'a': "A", 'b': "B", 'c': "C"}, {'d': "D", 'e': "E", 'f': "F"}, {'g': "G", 'h': "H", 'i': "I"}] + self.assertIsNotNone(codext.add_map("dummy2", ENCMAP, pattern=r"^dummy2(?:[-_]?(\d))?$")) + self.assertRaises(ValueError, codext.add_map, "dummy2", "BAD_ENCMAP") + self.assertEqual(codext.encode("abc", "dummy2"), "ABC") + self.assertEqual(codext.encode("abc", "dummy2-1"), "ABC") + self.assertEqual(codext.encode("def", "dummy2-2"), "DEF") + self.assertEqual(codext.encode("ghi", "dummy2-3"), "GHI") + self.assertRaises(LookupError, codext.encode, "test", "dummy2-4") + ENCMAP = {'': {'a': "A", 'b': "B"}, r'bad': {'a': "B", 'b': "A"}} + self.assertIsNotNone(codext.add_map("dummy3", ENCMAP, pattern=r"^dummy3([-_]inverted)?$")) + self.assertRaises(LookupError, codext.encode, "test", "dummy3_inverted") + self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, ignore_case="BAD") + self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, intype="BAD") + self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, outype="BAD") + ci = codext.lookup("dummy2") + for k in ["category", "encmap", "ignore_case", "intype", "no_error", "outype", "repl_char", "sep", "text"]: + self.assertIn(k, ci.parameters.keys()) + + def test_list_codecs(self): + self.assertTrue(len(codext.list()) > 0) + self.assertTrue(len(codext.list("other")) > 0) + self.assertTrue(len(codext.list("native")) > 0) + self.assertTrue(len(codext.list("non-native")) > 0) + self.assertTrue(len(codext.list("native", "non-native", "crypto", "base")) > 0) + self.assertTrue(len(codext.list("native", "language", "crypto")) > 0) + self.assertTrue(len(codext.list("~crypto")) > 0) + self.assertEqual(set(codext.list("~native")), set(codext.list("non-native"))) + self.assertEqual(set(codext.list()), set(codext.list("native") + codext.list("non-native"))) + self.assertRaises(ValueError, codext.list, "BAD_CATEGORY") + self.assertTrue(codext.is_native("base64_codec")) + self.assertFalse(codext.is_native("base64")) + + def test_remove_codec(self): + self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) + 
self.assertEqual(codext.encode("test", "dummy"), "test") + self.assertIsNone(codext.remove("dummy")) + self.assertRaises(LookupError, codext.encode, "test", "dummy") + # special case, when adding a new codec also to the native codecs registry, then it won't be possible to remove + # it afterwards + self.assertIsNotNone(codecs.add("dummy2", dummy_encode, dummy_decode)) + self.assertEqual(codecs.encode("test", "dummy2"), "test") + self.assertIsNone(codecs.remove("dummy2")) + self.assertEqual(codecs.encode("test", "dummy2"), "test") + self.assertIsNone(codecs.register(getregentry)) + self.assertEqual(codecs.encode("test", "dummy3"), "test") + self.assertIsNone(codecs.remove("dummy3")) + self.assertEqual(codecs.encode("test", "dummy3"), "test") + + def test_clear_codecs(self): + self.assertIsNotNone(codecs.encode("test", "morse")) + self.assertIsNone(codecs.clear()) + self.assertRaises(LookupError, codecs.encode, "test", "morse") + + def test_reset_codecs(self): + self.assertIsNone(codext.reset()) + self.assertIsNotNone(codext.encode("test", "morse")) + self.assertRaises(LookupError, codext.encode, "test", "dummy") + self.assertTrue(len(CODECS_OVERWRITTEN) > 0) + self.assertIsNotNone(str(CODECS_OVERWRITTEN[0])) + + def test_search_codecs(self): + self.assertIsNotNone(codext.search("morse")) + self.assertIsNotNone(codext.search("geohash")) + self.assertIsNotNone(codext.examples("morse")) + self.assertIsNotNone(codext.examples("cp")) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[ab]{1,3}"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=ab)cd"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=-)\w+"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"([^\s])\1"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^\\]"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^a]"))) + + def test_encode_multiple_rounds(self): + s = "test" + for i in range(3): + s = codext.encode(s, "morse") + self.assertEqual(s, codext.encode("test", "morse[3]")) + self.assertIsNotNone(codext.encode("test", "base64[10]")) + + def test_guess_decode(self): + self.assertIsNone(codext.stopfunc._reload_lang()) + self.assertIsNotNone(codext.stopfunc._validate("flag")) + _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None + codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), + "^test(?:_codec)?$", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) + self.assertIn("test-codec", codext.list_encodings("test")) + self.assertEqual(codext.decode("TEST=", "test"), "TEST") + self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include="test", max_depth=2, + scoring_heuristic=False).items())[0][1], "TEST") + self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include=["test", "base"], + max_depth=2).items())[0][1], "TEST") + STR = "This is a test" + self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", max_depth=1))) + self.assertEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "a test", found=["base62"]))) + self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, "base", scoring_heuristic=True, + exclude=["base100"]))) + self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, ["base", "crypto"]))) + self.assertEqual(len(codext.guess("NOT THE ENCODED TEST STRING", "a test", max_depth=1, exclude=None)), 0) + self.assertIn("F1@9", 
_l(codext.guess("VGVzdCBGMUA5ICE=", codext.stopfunc.flag, max_depth=1, stop=False, + show=True))) + self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", + exclude=("base64", "base64-url"))), 0) + self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", + scoring_heuristic=True, exclude=("base64", "base64-url", "atbash"))), 0) + self.assertRaises(ValueError, codext.guess, STR, max_depth=0) + self.assertRaises(ValueError, codext.guess, STR, exclude=42) + for c in ["base", "language", "native", "stegano"]: + e = codext.list(c) + random.shuffle(e) + for ename in e[:10]: + for encoding in codext.lookup(ename).parameters.get('guess', [ename])[:10]: + try: + enc = codext.encode(STR, encoding) + except (NotImplementedError, ValueError): + continue + except TypeError: + enc = codext.encode(b(STR), encoding) + if codext.decode(enc, encoding) == STR: + continue + for found_encodings, found_dec in codext.guess(enc, "a test", 0, 1, [c], + scoring_heuristic=True, debug=True).items(): + self.assertEqual(ensure_str(STR).lower(), ensure_str(found_dec).lower()) + if c != "base": + # do not check for base as the guessed encoding name can be different, e.g.: + # actual: base2 + # guessed: base2-generic + if "-icase" in encoding: + self.assertEqual(encoding.lower(), found_encodings[0].lower()) + else: + self.assertEqual(encoding, found_encodings[0]) + txt = "".join(chr(i) for i in range(256)) + b64 = codext.encode(txt, "base64") + self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True, include="base"))) + self.assertRaises(ValueError, codext.stopfunc._reload_lang, "DOES_NOT_EXIST") + + def test_rank_input(self): + codext.remove("test_codec") + self.assertRaises(LookupError, codext.encode, "TEST", "test") + codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), + "^test(?:_codec)?$", padding_char="=", no_error=True, penalty=1.) + STR = "This is a test string !" 
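+ # editor's sketch (inferred from the assertions below, not a documented contract): rank()
+ # appears to return a scored list of ((score, decoded), encoding) pairs, so that:
+ #     best = codext.rank(ENC)[0]
+ #     best[1]     # -> guessed encoding name, e.g. "base64"
+ #     best[0][1]  # -> the corresponding decoded string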
+ ENC = codext.encode(STR, "base64") + self.assertTrue(len(codext.rank(ENC)) > 20) + self.assertEqual(len(codext.rank(ENC, limit=20)), 20) + self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url", "base64-inv"]) + self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR) + self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR) + self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR) + self.assertIsNotNone(codext.rank("TEST=", include=["test", "base"])[0][0][1], "TEST") + + def test_handle_macros(self): + MACRO = "test-macro-f2ca1bb6c7e907d06dafe4687e579fce76b37e4e93b7605022da52e6ccc26fd2" + STR = "this is a test" + ENC = "H4sIAMrbkmEC/0txzyhIrnQC4QxPj6CcZONAWwAMIDOIFAAAAA==" + codext.remove(MACRO) + l = codext.list_macros() + self.assertTrue(len(l) > 0) + cm = codext.lookup("example-macro") + self.assertIsNotNone(cm) + self.assertRaises(LookupError, codext.lookup, "example-macro", False) + self.assertRaises(ValueError, codext.add_macro, "example-macro", "base64") + self.assertRaises(ValueError, codext.add_macro, "base64", "base91") + self.assertIsNotNone(repr(cm)) + self.assertTrue(hasattr(cm, "parameters")) + self.assertRaises(LookupError, codext.lookup, MACRO) + self.assertIsNone(codext.add_macro(MACRO, "base64", "gzip", "base64")) + self.assertIn(MACRO, codext.list_macros()) + self.assertIsNotNone(codext.encode(STR, MACRO)) + self.assertEqual(codext.decode(ENC, MACRO), STR) + # insert a bad entry for the list of encodings in the JSON file + PERS_MACROS[MACRO] = "not a list or tuple..." + with open(PERS_MACROS_FILE, 'w') as f: + json.dump(PERS_MACROS, f) + codext.reset() + self.assertRaises(ValueError, codext.lookup, MACRO) + self.assertIsNone(codext.remove(MACRO)) + self.assertRaises(LookupError, codext.lookup, MACRO) + self.assertNotIn(MACRO, codext.list_macros()) + self.assertIsNone(codext.remove("THIS-MACRO-DOES-NOT-EXIST")) + self.assertIsNone(codext.remove("VALID-MACRO")) + self.assertIsNone(codext.add_macro("VALID-MACRO", "gzip", "base64")) + self.assertIsNone(codext.remove("VALID-MACRO")) + self.assertIsNone(codext.add_macro("VALID-MACRO", "lzma", "base64")) + self.assertIsNone(codext.remove("VALID-MACRO")) + self.assertRaises(ValueError, codext.add_macro, "SHALL-FAIL", "base26", "sms", "letter-indices") + diff --git a/tests/test_generated.py b/tests/test_generated.py index 614562f..e8eaf10 100644 --- a/tests/test_generated.py +++ b/tests/test_generated.py @@ -1,139 +1,158 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -"""Automatically generated codec tests. - -""" -import os -import re -from itertools import chain -from random import randint -from string import printable -from unittest import TestCase - -from codext.__common__ import * - - -def make_test(**params): - """ Test factory function for auto-creating tests for encodings having __examples__ defined. """ - def _template(self): - tfile = "test-codec-%s.txt" % params['name'] - icase = params.get('ignore_case') - icdec = lambda s: s.lower() if icase in ["decode", "both"] else s - icenc = lambda s: s.lower() if icase in ["encode", "both"] else s - # first, define if only encode is used ; if so, decoding must occur right after encode tests, otherwise just - # execute the defined decode tests - dec = True - for k in params['examples'].keys(): - if k.startswith("dec"): - dec = False - # now execute tests relying on the given examples - for k, examples in params['examples'].items(): - # multiple encoding names can be given, e.g. 
'enc(morse|morse-AB|...)' - m = re.match(r"(?:dec|enc|enc-dec)\((.*?)(?:\|(.*?))*\)", k) - if m: - f1 = getattr(codecs, ["decode", "encode"][k.startswith("enc")]) - f2 = getattr(codecs, ["encode", "decode"][k.startswith("enc")]) - for ename in m.groups(): - if ename is None: - continue - # buggy generated encoding names - try: - lookup(ename) - except LookupError: - continue - # erroneous encoding name test - if examples is None: - self.assertRaises(LookupError, f1, "test", ename) - continue - # unhandled character error tests - encmap = params.get('encmap') - if encmap and params['intype'] not in ["bin", "ord"] and not params['no_error']: - if not isinstance(encmap, list): - encmap = [encmap] - for em in encmap: - if k.startswith("dec"): - em = {v: k for k, v in em.items()} - # find one handled character and one unhandled - c1, c2 = None, None - p = list(map(ord, printable)) - for i in chain(p, set(range(256)) - set(p)): - if chr(i) in em.keys(): - c1 = chr(i) - break - for i in chain(set(range(256)) - set(p), p): - if chr(i) not in em.keys(): - c2 = chr(i) - break - # now check that it raises the right error or not given the selected errors handling - if c1 and c2: - sep = params['sep'][0] if len(params['sep']) > 0 else "" - self.assertRaises(ValueError, f1, c2, ename) - self.assertRaises(ValueError, f1, c2, ename, "BAD_ERRORS") - if not k.startswith("enc-dec"): - self.assertEqual(f1(c1 + c2, ename, "ignore"), f1(c1, ename)) - self.assertEqual(f1(c1 + c2, ename, "leave"), f1(c1, ename) + sep + c2) - self.assertEqual(f1(c1 + c2, ename, "replace"), f1(c1, ename) + sep + \ - params.get('repl_minlen', 1) * params['repl_char']) - # examples validation tests - if k.startswith("enc-dec") and isinstance(examples, list): - for e in examples[:]: - rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) - if rd: - examples.remove(e) - for n in (rd.group(2) or "512").split(","): - s = "".join(chr(randint(0, 255)) for i in range(int(n))) - examples.append(s.lower() if rd.group(1) else s) - for s in [""] + examples: - self.assertEqual(icdec(f2(icenc(f1(s, ename)), ename)), icdec(s)) - self.assertEqual(icdec(f2(icenc(f1(b(s), ename)), ename)), b(icdec(s))) - # file tests - with codecs.open(tfile, 'wb', encoding=ename) as f: - f.write(b(s)) - with codecs.open(tfile, 'rb', encoding=ename) as f: - s2 = f.read() if PY3 else f.read().rstrip("\x00") - self.assertEqual(b(icdec(s2)), b(icdec(s))) - os.remove(tfile) - else: - for s1, s2 in examples.items(): - # willingly erroneous tests - if s2 is None: - self.assertRaises((ValueError, NotImplementedError), f1, s1, ename) - continue - # raw text tests - self.assertEqual(icenc(f1(s1, ename)), icenc(s2)) - self.assertEqual(b(icenc(f1(s1, ename))), b(icenc(s2))) - self.assertIsNotNone(f1(s1, ename, "replace")) - self.assertIsNotNone(f1(s1, ename, "ignore")) - if dec: - self.assertEqual(icdec(f2(s2, ename)), icdec(s1)) - self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s1))) - self.assertIsNotNone(f2(s2, ename, "replace")) - self.assertIsNotNone(f2(s2, ename, "ignore")) - if k.startswith("enc"): - # file tests - with codecs.open(tfile, 'wb', encoding=ename) as f: - f.write(b(s1)) - with codecs.open(tfile, 'rb', encoding=ename) as f: - s = f.read() - if not PY3 and re.search("[^\x00]\x00$", s): - s = s[:-1] - self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s))) - os.remove(tfile) - return _template - - -class GeneratedTestCase(TestCase): - pass - - -for encoding in list_encodings(): - try: - ci = lookup(encoding) - except LookupError: - continue - # only 
consider codecs with __examples__ defined in their globals for dynamic tests generation - if ci.parameters.get('examples') is not None: - f = make_test(**ci.parameters) - f.__name__ = n = "test_" + encoding.replace("-", "_") - setattr(GeneratedTestCase, n, f) - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Automatically generated codec tests. + +""" +from itertools import chain +from random import randint +from string import printable +from unittest import TestCase + +from codext.__common__ import * + + +def make_test(**params): + """ Test factory function for auto-creating tests for encodings having __examples__ defined. """ + def _template(self): + tfile = "test-codec-%s.txt" % params['name'] + icase = params.get('ignore_case') + icdec = lambda s: s.lower() if icase in ["decode", "both"] else s + icenc = lambda s: s.lower() if icase in ["encode", "both"] else s + # first, define if only encode is used ; if so, decoding must occur right after encode tests, otherwise just + # execute the defined decode tests + dec = True + for k in params['examples'].keys(): + if k.startswith("dec"): + dec = False + # now execute tests relying on the given examples + for k, examples in params['examples'].items(): + # multiple encoding names can be given, e.g. 'enc(morse|morse-AB|...)' + m = re.match(r"(?:dec|enc|enc-dec)\((.*?)(?:\|(.*?))*\)(\*)?", k) + if m: + f1 = getattr(codecs, ["decode", "encode"][k.startswith("enc")]) + f2 = getattr(codecs, ["encode", "decode"][k.startswith("enc")]) + for ename in m.groups(): + #FIXME + if ename == "*": + # ignore mode only + continue + if ename is None: + continue + # buggy generated encoding names + try: + lookup(ename) + except LookupError: + continue + # erroneous encoding name test + if examples is None: + self.assertRaises(LookupError, f1, "test", ename) + continue + # unhandled character error tests + encmap = params.get('encmap') + if encmap and params['intype'] not in ["bin", "ord"] and not params['no_error']: + if not isinstance(encmap, list): + encmap = [encmap] + for em in encmap: + if k.startswith("dec"): + em = {v: k for k, v in em.items()} + # find one handled character and one unhandled + c1, c2 = None, None + p = list(map(ord, printable)) + for i in chain(p, set(range(256)) - set(p)): + if chr(i) in em.keys(): + c1 = chr(i) + break + for i in chain(set(range(256)) - set(p), p): + if chr(i) not in em.keys(): + c2 = chr(i) + break + # now check that it raises the right error or not given the selected errors handling + if c1 and c2: + sep = params['sep'][0] if len(params['sep']) > 0 else "" + self.assertRaises(ValueError, f1, c2, ename) + self.assertRaises(ValueError, f1, c2, ename, "BAD_ERRORS") + if not k.startswith("enc-dec"): + self.assertEqual(f1(c1 + c2, ename, "ignore"), f1(c1, ename)) + self.assertEqual(f1(c1 + c2, ename, "leave"), f1(c1, ename) + sep + c2) + self.assertEqual(f1(c1 + c2, ename, "replace"), f1(c1, ename) + sep + \ + params.get('repl_minlen', 1) * params['repl_char']) + # examples validation tests + incr_f1 = codecs.getincrementalencoder(ename)().encode + incr_f2 = codecs.getincrementaldecoder(ename)().decode + # - "enc-dec" tests (uses a list of values that shall remain the same after encoding and decoding, + # no matter what the encoded value is + if k.startswith("enc-dec") and isinstance(examples, list): + for e in examples[:]: + rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) + if rd: + examples.remove(e) + for n in (rd.group(2) or "512").split(","): + s = "".join(chr(randint(0, 255)) for i in range(int(n))) 
+ examples.append(s.lower() if rd.group(1) else s) + for s in [""] + examples: + self.assertEqual(icdec(f2(icenc(f1(s, ename)), ename)), icdec(s)) + self.assertEqual(icdec(f2(icenc(f1(b(s), ename)), ename)), b(icdec(s))) + # important note: with respect to the original design, + # IncrementalEncoder(...).encode(...) gives bytes + # IncrementalDecoder(...).encode(...) gives str + self.assertEqual(icdec(incr_f2(icenc(incr_f1(s, ename)), ename)), icdec(s)) + self.assertEqual(icdec(incr_f2(icenc(incr_f1(b(s), ename)), ename)), icdec(s)) + # file tests + with codecs.open(tfile, 'wb', encoding=ename) as f: + f.write(b(s)) + with codecs.open(tfile, 'rb', encoding=ename) as f: + s2 = f.read() + self.assertEqual(b(icdec(s2)), b(icdec(s))) + os.remove(tfile) + # - "enc" and "dec" tests (uses a dictionary with the value to be encoded and the expected encoded + # value) + else: + for s1, s2 in examples.items(): + # willingly erroneous tests + if s2 is None: + self.assertRaises((ValueError, NotImplementedError), f1, s1, ename) + continue + # raw text tests + self.assertEqual(icenc(f1(s1, ename)), icenc(s2)) + self.assertEqual(b(icenc(f1(s1, ename))), b(icenc(s2))) + # important note: with respect to the original design, + # IncrementalEncoder(...).encode(...) gives bytes + #self.assertEqual(icenc(incr_f1(s1, ename)), b(icenc(s2))) + #self.assertEqual(icenc(incr_f1(b(s1), ename)), b(icenc(s2))) + self.assertIsNotNone(f1(s1, ename, "replace")) + self.assertIsNotNone(f1(s1, ename, "ignore")) + if dec: + self.assertEqual(icdec(f2(s2, ename)), icdec(s1)) + self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s1))) + # important note: with respect to the original design, + # IncrementalDecoder(...).encode(...) gives str + #self.assertEqual(icdec(incr_f2(s2, ename)), icdec(s1)) + #self.assertEqual(icdec(incr_f2(b(s2), ename)), icdec(s1)) + self.assertIsNotNone(f2(s2, ename, "replace")) + self.assertIsNotNone(f2(s2, ename, "ignore")) + if k.startswith("enc"): + # file tests + with codecs.open(tfile, 'wb', encoding=ename) as f: + f.write(b(s1)) + with codecs.open(tfile, 'rb', encoding=ename) as f: + s = f.read() + self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s))) + os.remove(tfile) + return _template + + +class GeneratedTestCase(TestCase): + pass + + +for encoding in list_encodings(): + try: + ci = lookup(encoding) + except LookupError: + continue + # only consider codecs with __examples__ defined in their globals for dynamic tests generation + if ci.parameters.get('examples') is not None: + f = make_test(**ci.parameters) + f.__name__ = n = "test_" + encoding.replace("-", "_") + setattr(GeneratedTestCase, n, f) + diff --git a/tests/test_manual.py b/tests/test_manual.py index 6a1d09f..bed4884 100644 --- a/tests/test_manual.py +++ b/tests/test_manual.py @@ -1,172 +1,168 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -"""Manual codec tests. 
- -""" -import hashlib -import os -import random -from six import binary_type, string_types -from unittest import TestCase - -from codext.__common__ import * -from codext.binary.baudot import _check_alphabet -from codext.hashing.checksums import CRC - - -class ComplementaryTestCase(TestCase): - def test_codec_baudot(self): - self.assertRaises(ValueError, _check_alphabet, ["BAD_ALPHABET"]) - - def test_codec_dna(self): - self.assertEqual(codecs.decode("ABC", "dna-1", errors="ignore"), "\x02") - self.assertEqual(codecs.decode("ABC", "dna-2", errors="replace"), "[00??01]") - - def test_codec_morse(self): - self.assertRaises(LookupError, codecs.encode, "test", "morse-AAB") - - def test_codec_sms(self): - self.assertEqual(codecs.decode("A-B-222-3-4-5", "sms", "leave"), "ABcdgj") - - -class ManualTestCase(TestCase): - def test_codec_affine(self): - STR = "this is a test" - AFF1 = "vjkubkubcbvguv" - self.assertRaises(LookupError, codecs.encode, STR, "affine-BAD") - self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u-BAD") - # uses by default an alphabet with lowercase, uppercase, whitespace and parameters a=1 and b=2 - self.assertEqual(codecs.encode(STR, "affine"), codecs.encode(STR, "affine-?l?u?s-1,2")) - self.assertEqual(codecs.encode(STR, "affine"), AFF1) - self.assertEqual(codecs.encode(b(STR), "affine"), b(AFF1)) - self.assertEqual(codecs.decode(AFF1, "affine"), STR) - self.assertEqual(codecs.decode(b(AFF1), "affine"), b(STR)) - AFF2 = "ORWJdWJdidOCJO" - self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-5,8"), AFF2) - self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-5,8"), b(AFF2)) - self.assertEqual(codecs.decode(AFF2, "affine-?l?u?d?s-5,8"), STR) - self.assertEqual(codecs.decode(b(AFF2), "affine-?l?u?d?s-5,8"), b(STR)) - AFF3 = "QsuOcuOcecQmOQ" - self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-2,4"), AFF3) - self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-2,4"), b(AFF3)) - self.assertEqual(codecs.decode(AFF3, "affine-?l?u?d?s-2,4"), STR) - self.assertEqual(codecs.decode(b(AFF3), "affine-?l?u?d?s-2,4"), b(STR)) - self.assertRaises(ValueError, codecs.decode, ".BAD.", "affine-?l?u?d?s-2,4") - self.assertIsNotNone(codecs.encode("TEST", "affine_?u-1,2")) - # example of parameters that cause mapping collisions - self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u?d?s-6,8") - - def test_codec_atbash(self): - STR = "This is a test" - ATB1 = "Gsrh rh z gvhg" - self.assertIsNotNone(codecs.encode("test", "atbash-whatevers")) - # uses by default an alphabet with lowercase and uppercase - self.assertEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-?l?u")) - self.assertNotEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-[?l?u]")) - self.assertEqual(codecs.encode(STR, "atbash_cipher"), ATB1) - self.assertEqual(codecs.encode(b(STR), "atbash-cipher"), b(ATB1)) - self.assertEqual(codecs.decode(ATB1, "atbash"), STR) - self.assertEqual(codecs.decode(b(ATB1), "atbash"), b(STR)) - ATB2 = "N^]/a]/a a.{/." 
- self.assertEqual(codecs.encode(STR, "atbash-[?l?u?p?s]"), ATB2) - self.assertEqual(codecs.encode(b(STR), "atbash_cipher-[?l?u?p?s]"), b(ATB2)) - self.assertEqual(codecs.decode(ATB2, "atbash-[?l?u?p?s]"), STR) - self.assertEqual(codecs.decode(b(ATB2), "atbash_cipher-[?l?u?p?s]"), b(STR)) - - def test_codec_case_related_manips(self): - STR = "This is a test" - self.assertEqual(codecs.encode(STR, "lower"), "this is a test") - self.assertEqual(codecs.encode(b(STR), "uppercase"), b("THIS IS A TEST")) - self.assertEqual(codecs.encode(STR, "capitalize"), "This is a test") - self.assertEqual(codecs.decode(b(STR), "capitalize"), b("this is a test")) - self.assertEqual(codecs.encode(STR, "title"), "This Is A Test") - self.assertEqual(codecs.decode(b(STR), "title"), b("this is a test")) - self.assertEqual(codecs.encode(b(STR), "swapcase"), b("tHIS IS A TEST")) - self.assertEqual(codecs.encode(b(STR), "camelcase"), b("thisIsATest")) - self.assertEqual(codecs.encode(b(STR), "kebabcase"), b("this-is-a-test")) - self.assertEqual(codecs.encode(b(STR), "pascalcase"), b("ThisIsATest")) - self.assertEqual(codecs.encode(b(STR), "slugify"), b("this-is-a-test")) - self.assertEqual(codecs.encode(b(STR), "snakecase"), b("this_is_a_test")) - self.assertRaises(NotImplementedError, codecs.decode, STR, "camel") - self.assertRaises(NotImplementedError, codecs.decode, STR, "pascal") - self.assertRaises(NotImplementedError, codecs.decode, STR, "slug") - self.assertRaises(NotImplementedError, codecs.decode, STR, "snake") - - def test_codec_dummy_str_manips(self): - STR = "this is a test" - self.assertEqual(codecs.decode(STR, "reverse"), "tset a si siht") - self.assertEqual(codecs.decode(STR, "reverse_words"), "siht si a tset") - self.assertEqual(codecs.decode(STR.split()[0], "reverse"), codecs.decode(STR.split()[0], "reverse-words")) - self.assertEqual(codecs.encode(STR, "replace-i1"), STR.replace("i", "1")) - self.assertEqual(codecs.decode(STR.replace("i", "1"), "replace-1i"), STR) - self.assertEqual(codecs.encode(STR, "substitute-this/that"), STR.replace("this", "that")) - self.assertEqual(codecs.decode(STR.replace("this", "that"), "substitute-that/this"), STR) - self.assertEqual(codecs.encode(STR, "tokenize-2"), "th is i s a te st") - self.assertRaises(LookupError, codecs.encode, STR, "tokenize-200") - - def test_codec_hash_functions(self): - STR = b"This is a test string!" 
- for h in ["adler32", "md2", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: - self.assertIsNotNone(codecs.encode(STR, h)) - self.assertRaises(NotImplementedError, codecs.decode, STR, h) - if PY3: - self.assertEqual(len(codecs.encode(STR, "blake2b_64")), 128) - self.assertRaises(LookupError, codecs.encode, STR, "blake2b_0") - self.assertRaises(LookupError, codecs.encode, STR, "blake2b-65") - self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2b") - self.assertEqual(len(codecs.encode(STR, "blake2s_32")), 64) - self.assertRaises(LookupError, codecs.encode, STR, "blake2s_0") - self.assertRaises(LookupError, codecs.encode, STR, "blake2s-33") - self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2s") - self.assertIsNotNone(codecs.encode(STR, "shake128")) - self.assertRaises(LookupError, codecs.encode, STR, "shake128_0") - self.assertRaises(NotImplementedError, codecs.decode, STR, "shake128") - self.assertIsNotNone(codecs.encode(STR, "shake256")) - self.assertRaises(LookupError, codecs.encode, STR, "shake256-0") - self.assertRaises(NotImplementedError, codecs.decode, STR, "shake256") - for h in ["sha3_224", "sha3_256", "sha3_384", "sha3_512"]: - self.assertIsNotNone(codecs.encode(STR, h)) - self.assertRaises(NotImplementedError, codecs.decode, STR, h) - if UNIX: - import crypt - METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] - for m in METHODS: - h = "crypt-" + m - self.assertIsNotNone(codecs.encode(STR, h)) - self.assertRaises(NotImplementedError, codecs.decode, STR, h) - # CRC checks - STR = "123456789" - for n, variants in CRC.items(): - for name, params in variants.items(): - enc = ("crc%d-%s" % (n, name) if isinstance(n, int) else "crc-%s" % name).rstrip("-") - print(enc) - self.assertEqual(codecs.encode(STR, enc), "%0{}x".format(round((n or 16)/4+.5)) % params[5]) - - def test_codec_markdown(self): - HTM = "
<h1>Test title</h1>\n\n<p>Test paragraph</p>
\n" - MD = "# Test title\n\nTest paragraph" - TFILE = "test-codec-markdown.html" - self.assertTrue(isinstance(codecs.encode(MD, "markdown"), string_types)) - self.assertTrue(not PY3 or isinstance(codecs.encode(b(MD), "markdown"), binary_type)) - self.assertEqual(codecs.encode(MD, "markdown"), HTM) - self.assertRaises(NotImplementedError, codecs.decode, MD, "markdown") - with codecs.open(TFILE, 'w', encoding="markdown") as f: - f.write(b(MD)) - with codecs.open(TFILE) as f: - s = f.read() - self.assertEqual(HTM, ensure_str(s)) - os.remove(TFILE) - - def test_codec_whitespace_after_before(self): - STR = "test" - for i in range(100): - c = "whitespace{}{}*after{}{}*before".format("-+"[random.randint(0, 1)], random.randint(1, 3), - "-+"[random.randint(0, 1)], random.randint(1, 3)) - self.assertEqual(codecs.decode("\n" + codecs.encode(STR, c) + "\n", c), STR) - # in this special case, the whitespaces between words cannot be encoded because: - # - ord(" ") == 32 - # - the next minimal value in the printable characters excluding the latest 6 is ord("!") == 33 - # and therefore ord(" ")-random(0,20)-random(0,20) will never fall into the valid ordinals ! - self.assertRaises(ValueError, codecs.encode, "this is a test", "whitespace-after-before") - self.assertIn("\x00", codecs.encode("this is a test", "whitespace-after-before", "replace")) - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Manual codec tests. + +""" +import os +import random +from unittest import TestCase + +from codext.__common__ import * +from codext.binary.baudot import _check_alphabet +from codext.hashing.checksums import CRC + + +class ComplementaryTestCase(TestCase): + def test_codec_baudot(self): + self.assertRaises(ValueError, _check_alphabet, ["BAD_ALPHABET"]) + + def test_codec_dna(self): + self.assertEqual(codecs.decode("ABC", "dna-1", errors="ignore"), "\x02") + self.assertEqual(codecs.decode("ABC", "dna-2", errors="replace"), "[00??01]") + + def test_codec_morse(self): + self.assertRaises(LookupError, codecs.encode, "test", "morse-AAB") + + def test_codec_sms(self): + self.assertEqual(codecs.decode("A-B-222-3-4-5", "sms", "leave"), "ABcdgj") + + +class ManualTestCase(TestCase): + def test_codec_affine(self): + STR = "this is a test" + AFF1 = "vjkubkubcbvguv" + self.assertRaises(LookupError, codecs.encode, STR, "affine-BAD") + self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u-BAD") + # uses by default an alphabet with lowercase, uppercase, whitespace and parameters a=1 and b=2 + self.assertEqual(codecs.encode(STR, "affine"), codecs.encode(STR, "affine-?l?u?s-1,2")) + self.assertEqual(codecs.encode(STR, "affine"), AFF1) + self.assertEqual(codecs.encode(b(STR), "affine"), b(AFF1)) + self.assertEqual(codecs.decode(AFF1, "affine"), STR) + self.assertEqual(codecs.decode(b(AFF1), "affine"), b(STR)) + AFF2 = "ORWJdWJdidOCJO" + self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-5,8"), AFF2) + self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-5,8"), b(AFF2)) + self.assertEqual(codecs.decode(AFF2, "affine-?l?u?d?s-5,8"), STR) + self.assertEqual(codecs.decode(b(AFF2), "affine-?l?u?d?s-5,8"), b(STR)) + AFF3 = "QsuOcuOcecQmOQ" + self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-2,4"), AFF3) + self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-2,4"), b(AFF3)) + self.assertEqual(codecs.decode(AFF3, "affine-?l?u?d?s-2,4"), STR) + self.assertEqual(codecs.decode(b(AFF3), "affine-?l?u?d?s-2,4"), b(STR)) + self.assertRaises(ValueError, codecs.decode, ".BAD.", "affine-?l?u?d?s-2,4") + 
self.assertIsNotNone(codecs.encode("TEST", "affine_?u-1,2")) + # example of parameters that cause mapping collisions + self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u?d?s-6,8") + + def test_codec_atbash(self): + STR = "This is a test" + ATB1 = "Gsrh rh z gvhg" + self.assertIsNotNone(codecs.encode("test", "atbash-whatevers")) + # uses by default an alphabet with lowercase and uppercase + self.assertEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-?l?u")) + self.assertNotEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-[?l?u]")) + self.assertEqual(codecs.encode(STR, "atbash_cipher"), ATB1) + self.assertEqual(codecs.encode(b(STR), "atbash-cipher"), b(ATB1)) + self.assertEqual(codecs.decode(ATB1, "atbash"), STR) + self.assertEqual(codecs.decode(b(ATB1), "atbash"), b(STR)) + ATB2 = "N^]/a]/a a.{/." + self.assertEqual(codecs.encode(STR, "atbash-[?l?u?p?s]"), ATB2) + self.assertEqual(codecs.encode(b(STR), "atbash_cipher-[?l?u?p?s]"), b(ATB2)) + self.assertEqual(codecs.decode(ATB2, "atbash-[?l?u?p?s]"), STR) + self.assertEqual(codecs.decode(b(ATB2), "atbash_cipher-[?l?u?p?s]"), b(STR)) + + def test_codec_case_related_manips(self): + STR = "This is a test" + self.assertEqual(codecs.encode(STR, "lower"), "this is a test") + self.assertEqual(codecs.encode(b(STR), "uppercase"), b("THIS IS A TEST")) + self.assertEqual(codecs.encode(STR, "capitalize"), "This is a test") + self.assertEqual(codecs.decode(b(STR), "capitalize"), b("this is a test")) + self.assertEqual(codecs.encode(STR, "title"), "This Is A Test") + self.assertEqual(codecs.decode(b(STR), "title"), b("this is a test")) + self.assertEqual(codecs.encode(b(STR), "swapcase"), b("tHIS IS A TEST")) + self.assertEqual(codecs.encode(b(STR), "camelcase"), b("thisIsATest")) + self.assertEqual(codecs.encode(b(STR), "kebabcase"), b("this-is-a-test")) + self.assertEqual(codecs.encode(b(STR), "pascalcase"), b("ThisIsATest")) + self.assertEqual(codecs.encode(b(STR), "slugify"), b("this-is-a-test")) + self.assertEqual(codecs.encode(b(STR), "snakecase"), b("this_is_a_test")) + self.assertRaises(NotImplementedError, codecs.decode, STR, "camel") + self.assertRaises(NotImplementedError, codecs.decode, STR, "pascal") + self.assertRaises(NotImplementedError, codecs.decode, STR, "slug") + self.assertRaises(NotImplementedError, codecs.decode, STR, "snake") + + def test_codec_dummy_str_manips(self): + STR = "this is a test" + self.assertEqual(codecs.decode(STR, "reverse"), "tset a si siht") + self.assertEqual(codecs.decode(STR, "reverse_words"), "siht si a tset") + self.assertEqual(codecs.decode(STR.split()[0], "reverse"), codecs.decode(STR.split()[0], "reverse-words")) + self.assertEqual(codecs.encode(STR, "replace-i1"), STR.replace("i", "1")) + self.assertEqual(codecs.decode(STR.replace("i", "1"), "replace-1i"), STR) + self.assertEqual(codecs.encode(STR, "substitute-this/that"), STR.replace("this", "that")) + self.assertEqual(codecs.decode(STR.replace("this", "that"), "substitute-that/this"), STR) + self.assertEqual(codecs.encode(STR, "tokenize-2"), "th is i s a te st") + self.assertRaises(LookupError, codecs.encode, STR, "tokenize-200") + + def test_codec_hash_functions(self): + STR = b"This is a test string!" 
+ for h in ["adler32", "md2", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: + self.assertIsNotNone(codecs.encode(STR, h)) + self.assertRaises(NotImplementedError, codecs.decode, STR, h) + self.assertEqual(len(codecs.encode(STR, "blake2b_64")), 128) + self.assertRaises(LookupError, codecs.encode, STR, "blake2b_0") + self.assertRaises(LookupError, codecs.encode, STR, "blake2b-65") + self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2b") + self.assertEqual(len(codecs.encode(STR, "blake2s_32")), 64) + self.assertRaises(LookupError, codecs.encode, STR, "blake2s_0") + self.assertRaises(LookupError, codecs.encode, STR, "blake2s-33") + self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2s") + self.assertIsNotNone(codecs.encode(STR, "shake128")) + self.assertRaises(LookupError, codecs.encode, STR, "shake128_0") + self.assertRaises(NotImplementedError, codecs.decode, STR, "shake128") + self.assertIsNotNone(codecs.encode(STR, "shake256")) + self.assertRaises(LookupError, codecs.encode, STR, "shake256-0") + self.assertRaises(NotImplementedError, codecs.decode, STR, "shake256") + for h in ["sha3_224", "sha3_256", "sha3_384", "sha3_512"]: + self.assertIsNotNone(codecs.encode(STR, h)) + self.assertRaises(NotImplementedError, codecs.decode, STR, h) + if UNIX: + import crypt + METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] + for m in METHODS: + h = "crypt-" + m + self.assertIsNotNone(codecs.encode(STR, h)) + self.assertRaises(NotImplementedError, codecs.decode, STR, h) + # CRC checks + STR = "123456789" + for n, variants in CRC.items(): + for name, params in variants.items(): + enc = ("crc%d-%s" % (n, name) if isinstance(n, int) else "crc-%s" % name).rstrip("-") + print(enc) + self.assertEqual(codecs.encode(STR, enc), "%0{}x".format(round((n or 16)/4+.5)) % params[5]) + + def test_codec_markdown(self): + HTM = "
<h1>Test title</h1>\n\n<p>Test paragraph</p>
\n" + MD = "# Test title\n\nTest paragraph" + TFILE = "test-codec-markdown.html" + self.assertTrue(isinstance(codecs.encode(MD, "markdown"), str)) + self.assertEqual(codecs.encode(MD, "markdown"), HTM) + self.assertRaises(NotImplementedError, codecs.decode, MD, "markdown") + with codecs.open(TFILE, 'w', encoding="markdown") as f: + f.write(b(MD)) + with codecs.open(TFILE) as f: + s = f.read() + self.assertEqual(HTM, ensure_str(s)) + os.remove(TFILE) + + def test_codec_whitespace_after_before(self): + STR = "test" + for i in range(100): + c = "whitespace{}{}*after{}{}*before".format("-+"[random.randint(0, 1)], random.randint(1, 3), + "-+"[random.randint(0, 1)], random.randint(1, 3)) + self.assertEqual(codecs.decode("\n" + codecs.encode(STR, c) + "\n", c), STR) + # in this special case, the whitespaces between words cannot be encoded because: + # - ord(" ") == 32 + # - the next minimal value in the printable characters excluding the latest 6 is ord("!") == 33 + # and therefore ord(" ")-random(0,20)-random(0,20) will never fall into the valid ordinals ! + self.assertRaises(ValueError, codecs.encode, "this is a test", "whitespace-after-before") + self.assertIn("\x00", codecs.encode("this is a test", "whitespace-after-before", "replace")) +
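
A minimal standalone sketch of the ordinal arithmetic behind the final comment above (an
illustration only, assuming the two random offsets are drawn from range(0, 21) as stated):

    # shifting ord(" ") == 32 down by two non-negative offsets yields at most 32,
    # which never reaches ord("!") == 33, the smallest encodable printable ordinal
    candidates = [32 - r1 - r2 for r1 in range(21) for r2 in range(21)]
    assert max(candidates) == 32                  # best case: both offsets are 0
    assert all(c < ord("!") for c in candidates)  # never a valid ordinal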