diff --git a/README.rst b/README.rst
index 4a69007..236676d 100644
--- a/README.rst
+++ b/README.rst
@@ -12,12 +12,18 @@ Usage
 =====
 
 First train the model:
 
+For Python 2:
+python gib_detect_train.py
-python gib_detect_train.py
+For Python 3:
+python gib_detect_train_py3.py
 
 Then try it on some sample input
 
+For Python 2:
+python gib_detect.py
-python gib_detect.py
+For Python 3:
+python gib_detect_py3.py
 
 my name is rob and i like to hack
 True
diff --git a/gib_detect_py3.py b/gib_detect_py3.py
new file mode 100644
index 0000000..4b8afc9
--- /dev/null
+++ b/gib_detect_py3.py
@@ -0,0 +1,12 @@
+#!/usr/bin/python
+
+import pickle
+import gib_detect_train
+
+model_data = pickle.load(open('gib_model.pki', 'rb'))
+
+while True:
+    l = input()
+    model_mat = model_data['mat']
+    threshold = model_data['thresh']
+    print(gib_detect_train.avg_transition_prob(l, model_mat) > threshold)
diff --git a/gib_detect_train_py3.py b/gib_detect_train_py3.py
new file mode 100644
index 0000000..27b696d
--- /dev/null
+++ b/gib_detect_train_py3.py
@@ -0,0 +1,75 @@
+#!/usr/bin/python
+
+import math
+import pickle
+
+accepted_chars = 'abcdefghijklmnopqrstuvwxyz '
+
+pos = dict([(char, idx) for idx, char in enumerate(accepted_chars)])
+
+def normalize(line):
+    """ Return only the subset of chars from accepted_chars.
+    This helps keep the model relatively small by ignoring punctuation,
+    infrequent symbols, etc. """
+    return [c.lower() for c in line if c.lower() in accepted_chars]
+
+def ngram(n, l):
+    """ Return all n grams from l after normalizing """
+    filtered = normalize(l)
+    for start in range(0, len(filtered) - n + 1):
+        yield ''.join(filtered[start:start + n])
+
+def train():
+    """ Write a simple model as a pickle file """
+    k = len(accepted_chars)
+    # Assume we have seen 10 of each character pair. This acts as a kind of
+    # prior or smoothing factor. This way, if we see a character transition
+    # live that we've never observed in the past, we won't assume the entire
+    # string has 0 probability.
+    counts = [[10 for i in range(k)] for i in range(k)]
+
+    # Count transitions from big text file, taken
+    # from http://norvig.com/spell-correct.html
+    for line in open('big.txt'):
+        for a, b in ngram(2, line):
+            counts[pos[a]][pos[b]] += 1
+
+    # Normalize the counts so that they become log probabilities.
+    # We use log probabilities rather than straight probabilities to avoid
+    # numeric underflow issues with long texts.
+    # This contains a justification:
+    # http://squarecog.wordpress.com/2009/01/10/dealing-with-underflow-in-joint-probability-calculations/
+    for i, row in enumerate(counts):
+        s = float(sum(row))
+        for j in range(len(row)):
+            row[j] = math.log(row[j] / s)
+
+    # Find the probability of generating a few arbitrarily chosen good and
+    # bad phrases.
+    good_probs = [avg_transition_prob(l, counts) for l in open('good.txt')]
+    bad_probs = [avg_transition_prob(l, counts) for l in open('bad.txt')]
+
+    # Assert that we actually are capable of detecting the junk.
+    assert min(good_probs) > max(bad_probs)
+
+    # And pick a threshold halfway between the worst good and best bad inputs.
+    thresh = (min(good_probs) + max(bad_probs)) / 2
+    pickle.dump({'mat': counts, 'thresh': thresh}, open('gib_model.pki', 'wb'))
+
+def avg_transition_prob(l, log_prob_mat):
+    """ Return the average transition prob from l through log_prob_mat. """
+    log_prob = 0.0
+    transition_ct = 0
+    for a, b in ngram(2, l):
+        log_prob += log_prob_mat[pos[a]][pos[b]]
+        transition_ct += 1
+    # The exponentiation translates from log probs to probs.
+    return math.exp(log_prob / (transition_ct or 1))
+
+if __name__ == '__main__':
+    train()
+
+
+
+
+
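
As a side note on the math in the patch: a minimal sketch of why train() stores log probabilities and why avg_transition_prob() exponentiates at the end. Multiplying many raw per-transition probabilities underflows to 0.0 for long strings, while summing their logs and exponentiating the average recovers a per-transition geometric mean. The constants p and n below are hypothetical, chosen only for illustration.

    import math

    # Made-up numbers: one per-transition probability p applied n times,
    # standing in for a long string's chain of bigram transitions.
    p = 1e-4
    n = 100

    # Multiplying raw probabilities underflows to exactly 0.0 ...
    print(p ** n)                  # 0.0

    # ... while the sum of log probabilities stays well within float range,
    # which is what the counts matrix holds after normalization.
    log_prob = n * math.log(p)
    print(log_prob)                # about -921.03

    # Dividing by the transition count and exponentiating, as
    # avg_transition_prob() does, yields the geometric mean of the
    # per-transition probabilities.
    print(math.exp(log_prob / n))  # 1e-04, recovered without underflow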