-
-
Notifications
You must be signed in to change notification settings - Fork 10
/
lempel_ziv_complexity.py
126 lines (93 loc) · 4.44 KB
/
lempel_ziv_complexity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# -*- coding: utf-8 -*-
"""Lempel-Ziv complexity for a binary sequence, in naive Python code.
- How to use it? From Python, it's easy:
>>> from lempel_ziv_complexity import lempel_ziv_complexity
>>> s = '1001111011000010'
>>> lempel_ziv_complexity(s) # 1 / 0 / 01 / 11 / 10 / 110 / 00 / 010
8
- Note: there is also a Cython-powered version, for speedup, see :download:`lempel_ziv_complexity_cython.pyx`.
- MIT Licensed, (C) 2017-2019 Lilian Besson (Naereen)
https://GitHub.com/Naereen/Lempel-Ziv_Complexity
"""
from __future__ import print_function
__author__ = "Lilian Besson (Naereen)"
__version__ = "0.2.2"
from collections import OrderedDict
def lempel_ziv_decomposition(sequence):
r""" Manual implementation of the Lempel-Ziv decomposition.
It is defined as the number of different substrings encountered as the stream is viewed from begining to the end.
As an example:
>>> s = '1001111011000010'
>>> lempel_ziv_decomposition(s) # 1 / 0 / 01 / 11 / 10 / 110 / 00 / 010
['1', '0', '01', '11', '10', '110', '00', '010']
Marking in the different substrings the sequence complexity :math:`\mathrm{Lempel-Ziv}(s) = 8`: :math:`s = 1 / 0 / 01 / 11 / 10 / 110 / 00 / 010`.
- See the page https://en.wikipedia.org/wiki/Lempel-Ziv_complexity for more details.
Other examples:
>>> lempel_ziv_decomposition('1010101010101010')
['1', '0', '10', '101', '01', '010', '1010']
>>> lempel_ziv_decomposition('1001111011000010000010')
['1', '0', '01', '11', '10', '110', '00', '010', '000']
>>> lempel_ziv_decomposition('100111101100001000001010')
['1', '0', '01', '11', '10', '110', '00', '010', '000', '0101']
- Note: it is faster to give the sequence as a string of characters, like `'10001001'`, instead of a list or a numpy array.
- Note: see this notebook for more details, comparison, benchmarks and experiments: https://Nbviewer.Jupyter.org/github/Naereen/Lempel-Ziv_Complexity/Short_study_of_the_Lempel-Ziv_complexity.ipynb
- Note: there is also a Cython-powered version, for speedup, see :download:`lempel_ziv_complexity_cython.pyx`.
"""
sub_strings = OrderedDict()
n = len(sequence)
ind = 0
inc = 1
while True:
if ind + inc > len(sequence):
break
sub_str = sequence[ind : ind + inc]
# print(sub_str, ind, inc)
if sub_str in sub_strings:
inc += 1
else:
sub_strings[sub_str] = 0
ind += inc
inc = 1
# print("Adding", sub_str)
return list(sub_strings)
def lempel_ziv_complexity(sequence):
r""" Manual implementation of the Lempel-Ziv complexity.
It is defined as the number of different substrings encountered as the stream is viewed from begining to the end.
As an example:
>>> s = '1001111011000010'
>>> lempel_ziv_complexity(s) # 1 / 0 / 01 / 11 / 10 / 110 / 00 / 010
8
Marking in the different substrings the sequence complexity :math:`\mathrm{Lempel-Ziv}(s) = 8`: :math:`s = 1 / 0 / 01 / 11 / 10 / 110 / 00 / 010`.
- See the page https://en.wikipedia.org/wiki/Lempel-Ziv_complexity for more details.
Other examples:
>>> lempel_ziv_complexity('1010101010101010') # 1, 0, 10, 101, 01, 010, 1010
7
>>> lempel_ziv_complexity('1001111011000010000010') # 1, 0, 01, 11, 10, 110, 00, 010, 000
9
>>> lempel_ziv_complexity('100111101100001000001010') # 1, 0, 01, 11, 10, 110, 00, 010, 000, 0101
10
- Note: it is faster to give the sequence as a string of characters, like `'10001001'`, instead of a list or a numpy array.
- Note: see this notebook for more details, comparison, benchmarks and experiments: https://Nbviewer.Jupyter.org/github/Naereen/Lempel-Ziv_Complexity/Short_study_of_the_Lempel-Ziv_complexity.ipynb
- Note: there is also a Cython-powered version, for speedup, see :download:`lempel_ziv_complexity_cython.pyx`.
"""
sub_strings = set()
n = len(sequence)
ind = 0
inc = 1
while True:
if ind + inc > len(sequence):
break
sub_str = sequence[ind : ind + inc]
if sub_str in sub_strings:
inc += 1
else:
sub_strings.add(sub_str)
ind += inc
inc = 1
return len(sub_strings)
# --- Debugging
if __name__ == "__main__":
# Code for debugging purposes.
from doctest import testmod
print("\nTesting automatically all the docstring written in each functions of this module :")
testmod(verbose=True)