Arithmetic-code/helpers.py at master · yifeipet/Arithmetic-code · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import numpy as np
def getfreqs(stream):
    """
    This function Takes a stream as an input. it gets the number of occurrences of each symbol
    in that stream,  store it in a Dictionary containing every symbol in the stream .

    Arguments:
        stream {list} -- [the input stream we want to compress]
    Returns:
        Dict {dictionary} -- [contains the number of occurrences of each symbol in the stream,referenced by the name of the symbol]
    """

    symbols = set(stream)   #get the unique elements
    L = len(stream)
    Dict = {}          #store each symbol with its corresponding probability

    #calculate the probability of each symbol in the stream
    for s in symbols:
        freq = stream.count(s)
        Dict[s] = freq
    return Dict

def Cumfreq(symbol, dictionary):
  """
  This Function Takes as  inputs a symbol and a dictionary containing
  all the symbols that exists in our stream and their frequencies
  and returns the cumulative frequency starting from the very
  beginning of the Dictionary until that Symbol.

  Arguments:
      symbol {[type]} -- [the symbol we want to get the cumulative frequency for]
      dictionary {[type]} -- [the dictionary that contains the frequency of all symbols]
  Returns:
      p int -- [the upper bound ,which is the sum of all frequencies from the beginning of the dictionary till the specified symbol]
  """

  P = 0
  for sym in dictionary:
      P += dictionary[sym]
      if sym == symbol:
        break
  return P


def Arithmetic_encode (stream,precision=32):
    """
    The encoder function for the arithmetic code,all of its operations are supposed to be on integers

    Arguments:
        stream {list} -- [the input stream we want to encode]

    Keyword Arguments:
        precision {int} -- [the number of precision bits used in the encoder] (default: {32})

    Returns:
        code {list} -- [the binary compressed code of the input stream]
        dic  {dictionary}  -- [it contains the dictionary used to encode the stream.It includes the number of occurrences of all symbols in the stream]
    """
    stream.append('!')# add the end of file symbol so the that the decoder knows when to stop

    StreamSize=len(stream)

    dic=getfreqs(stream) #output the the dictionary that contains the probabilities of  all symbols


    index=0

    # those numbers will be used later in the scaling and emitting step of the binary bits
    full =2**precision
    half=full//2
    quarter=half//2

    L=0 # the lower limit of the range
    H=full # the upper limit of the range
    trails=0 # a variable to trace how deep i entered at the corner case of which the low and high limits are lying in the middle of the range


    code = [] # the list that will include the compressed code

    for symbol in stream:

        freqSym=dic[symbol]          # get the frequency of the symbol
        S_high=Cumfreq(symbol,dic)   # get the higher limit of this symbol
        S_low=S_high-freqSym         # get the lower limit of this symbol

        Range=H-L                    # get the range of the code
        # rescale the limits to the new symbol,we are using integer division to guarantee that the output is integer
        H = L +  Range * S_high // StreamSize
        L = L +  Range * S_low // StreamSize

        #creating the cases for which we will emit 0 or 1 to our final code word
        while True: # the first two easy cases , if my range fully falls inside one block
            if H < half : # if the full range falls in the lower half
                code.extend([0])
                code.extend([1]*trails)
                trails=0
                # scale the lower half to be the full range
                L*=2
                H*=2
            elif L >= half:# if the full range falls in the upper half
                code.extend([1])
                code.extend([0]*trails)
                trails=0
                # scale the upper half to be the full range
                L=2*( L - half )
                H=2*( H - half )
            elif L >= quarter and H < 3*quarter: # if the range is split between the the upper half and the lower half
                trails+=1
                L=2*(L-quarter)
                H=2*(H-quarter)
            else:
                break
    # add the last bits before you exit the function
    trails+=1
    if L<=quarter:
        code.extend([0])
        code.extend([1]*trails)
    else:
        code.extend([1])
        code.extend([0]*trails)
    return code ,dic

def Arithmetic_decode (code,dic,precision=32):
    """The decoder function for the arithmetic code,all of its operations are supposed to be on integers

    Arguments:
        code {list} -- [the binary compressed code of the input stream]
        dic {dictionary} -- [it contains the dictionary used to encode the stream.It includes the number of occurrences of all symbols in the stream]

    Keyword Arguments:
        precision {int} -- [the number of precision bits used in the decoder] (default: {32})

    Returns:
       message {list} -- [the decoder output.It's supposed to be an identical copy of the input stream]
    """
    code_size=len(code)

    stream_size=sum(dic.values())#get the stream size buy summing the number of occurrences of all the symbols in the stream

    # those numbers will be used later in the scaling and emitting step of the binary bits
    full =2**precision
    half=full//2
    quarter=half//2

    L=0 # the lower limit of the range
    H=full # the upper limit of the range

    val=0 # a variable that will hold the probability which we will iterate to find the symbol using it
    indx=1 # an index to trace the place of the last bit used to find the code.This  is due to the limited precision we have,as we cannot compute the whole floating number at once

    message=[]

    while indx <= precision and indx <= code_size:# first get the exact amount of values that can be held in the precision available , the rest will be used during the code
        if code[indx-1]==1:
            val=val+2**(precision-indx)
        indx+=1
    flag=1

    while flag:# the flag will be zero if we found the end of file symbol

        for symbol in dic:  # this loop tries to find the symbol that has the range specified by the variable val

            freqSym=dic[symbol]    # get the frequency of the symbol
            S_high=Cumfreq(symbol,dic)   # get the higher limit of this symbol
            S_low=S_high-freqSym             # get the lower limit of this symbol

            Range=H-L                   # get the range of the code

            H0 = L + Range * S_high//stream_size
            L0 = L + Range * S_low //stream_size

            if  L0 <=val and val<H0:
                message.extend([symbol])
                L=L0
                H=H0
                if symbol == '!':# this condition should be satisfied at the end of the stream
                    flag=0
                break

        while True:
            if H < half : # if the range is in the lower half
                L*=2
                H*=2
                val*=2
                if indx<= code_size:
                    val+=code[indx-1]
                    indx+=1
            elif L >= half : # if the range is in the upper half
                L=2*(L-half)
                H=2*(H-half)
                val=2*(val-half)
                if indx<= code_size:
                    val+=code[indx-1]
                    indx+=1
            elif L>=quarter and H < 3*quarter: # if the lower limit is in the lower half na d the  upper range is in the upper half,then we will need to expand the range more
                L=2*(L-quarter)
                H=2*(H-quarter)
                val=2*(val-quarter)

                if indx<= code_size:# increase the precision of pur probability until we finish all the the code bits
                    val+=code[indx-1]
                    indx+=1
            else:
                break

    message.pop() # remove the end of stream symbol to get the original stream
    return message