diff --git a/.gitignore b/.gitignore
index b6e4761..30c3be0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,5 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+Madamira-Arapy/arapy/resources/
\ No newline at end of file
diff --git a/Madamira-Arapy/arapy/README.md b/Madamira-Arapy/arapy/README.md
new file mode 100644
index 0000000..169c63f
--- /dev/null
+++ b/Madamira-Arapy/arapy/README.md
@@ -0,0 +1,7 @@
+# Arapy
+Arabic text processing tools for Python - a work in progress.
+
+# Dependencies
+gensim for word2vec: pip install gensim
+goslate for translation: pip install goslate
+madamira package for NLP processing: http://nlp.ldeo.columbia.edu/madamira/
diff --git a/Madamira-Arapy/arapy/__init__.py b/Madamira-Arapy/arapy/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Madamira-Arapy/arapy/arapy.py b/Madamira-Arapy/arapy/arapy.py
new file mode 100644
index 0000000..c6f7b83
--- /dev/null
+++ b/Madamira-Arapy/arapy/arapy.py
@@ -0,0 +1,4 @@
+### Arapy module!
+
+from __future__ import absolute_import
+from __future__ import print_function
diff --git a/Madamira-Arapy/arapy/arwiki.py b/Madamira-Arapy/arapy/arwiki.py
new file mode 100644
index 0000000..c208f72
--- /dev/null
+++ b/Madamira-Arapy/arapy/arwiki.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+### Purpose: Tools to parse arwiki dumps
+
+from __future__ import absolute_import
+from __future__ import print_function
+
+import arapy.normalization as norm
+import re
+import sys
+import codecs
+import xml.etree.cElementTree as etree
+
+def parse_arwiki_dump(dump_in, dump_out, split_at_punc=False, remove_non_arabic=False):
+    """
+    Reads in an unzipped arwiki dump.
+    Saves the text of the articles in a txt file with one sentence per line.
+    returns the name of the output file
+    """
+    # text tag that wiki uses to identify text content blocks
+    text_tag = '{http://www.mediawiki.org/xml/export-0.10/}text'
+    junkPattern = ur"(\{\{[^}]*\}\})|(\[\[[^\]]*\]\])|(\=\=\{[^}]*\})|(\=\=[^=]*\=\=)|(]*>(.*?))|(]*>)|(\[ht[^\]]*\])|(\{[^}]*\})|(<\ref>)|()|()|()|()|(
)|(
)|()|()" + punctuationPattern = ur"[*|,\-#!<&>_+{:/$\\=()?.،'}%\";\[\]]" + + with open(dump_in, 'r') as infile: + with open(dump_out, 'w') as outfile: + + # iterate through the xml tree looking for tag starts + context = etree.iterparse(infile, events = ('start','end')) + context = iter(context) + event, root = context.next() + + for event, elem in context: + + # if the tag matches the wiki tag for text content, we extract the text + if event == 'end' and elem.tag == text_tag: + + text = elem.text + #print(text) + + # some text tags are empty + if text: + + text = re.sub(junkPattern, '', text) + + if remove_non_arabic: + text = norm.normalize_charset(text) + + # move each sentence to a new line (rough regex) + if split_at_punc: + text = re.sub(r'[.!?]$', '\n', text) + + text = re.sub(punctuationPattern, '', text) + + for line in text.split('\n'): + if line.strip() != '': + outfile.write((line+'\n').encode('utf8')) + + # keep memory free of previous branches of the xml tree + root.clear() + + return dump_out \ No newline at end of file diff --git a/Madamira-Arapy/arapy/info/ResultsSummary.txt b/Madamira-Arapy/arapy/info/ResultsSummary.txt new file mode 100644 index 0000000..3fb6b4b --- /dev/null +++ b/Madamira-Arapy/arapy/info/ResultsSummary.txt @@ -0,0 +1,9 @@ +Current optimal parameterization for generating arabic word vectors (tested on wiki data): + +CBOW, window=5, dim=200, neg/samp=25/1e-4, 15+ iterations, lemmatized words + +Some work that I've read uses 100 dim, I think 200 is better for large data sets. + +In extremely large data, some papers hypothesize that skipgrams may work better. I have seen no evidence of this. + +Similarly, larger datasets may be able to take advantage of higher dimensional vectors. \ No newline at end of file diff --git a/Madamira-Arapy/arapy/info/accuracy-notes.xlsx b/Madamira-Arapy/arapy/info/accuracy-notes.xlsx new file mode 100644 index 0000000..82c16db Binary files /dev/null and b/Madamira-Arapy/arapy/info/accuracy-notes.xlsx differ diff --git a/Madamira-Arapy/arapy/madamira.py b/Madamira-Arapy/arapy/madamira.py new file mode 100644 index 0000000..8f32b3f --- /dev/null +++ b/Madamira-Arapy/arapy/madamira.py @@ -0,0 +1,859 @@ +#!/usr/bin/env python +# coding: utf-8 + +### Purpose: Madamira output processing tools + +from __future__ import absolute_import +from __future__ import print_function +from xml.etree.cElementTree import iterparse + +import arapy +import arapy.normalization as norm +import codecs +import csv +import numpy as np +import os +import re +import requests +import socket +import io +import subprocess +import time + +MADAPORT = 8223 +#94223 + +class Madamira: + url="http://localhost:" + str(MADAPORT) + headers = {'Content-Type': 'application/xml'} + xml_prefix=""" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \n""" + xml_seg_start = """\n""" + xml_seg_end = """\n\n""" + xml_suffix = """ + + """ + config_prefix="{urn:edu.columbia.ccls.madamira.configuration:0.1}" + + def __enter__(self): + self.start_server() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.stop_server() + + def start_server(self): + cwd = os.getcwd() + os.chdir(os.path.dirname(arapy.__file__)+"/resources/MADAMIRA-release-20170403-2.1/") + + self.pid = subprocess.Popen(['java', + '-Xmx2500m', + '-Xms2500m', + '-XX:NewRatio=3', + '-jar', + 'MADAMIRA-release-20170403-2.1.jar', + '-s', + '-msaonly']) + + print("Waiting for madamira to initialize.") + time.sleep(10) + + sock 
= socket.socket(socket.AF_INET, socket.SOCK_STREAM) + result = sock.connect_ex(('localhost',MADAPORT)) + while(result != 0): + sock.close() + time.sleep(1) + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + result = sock.connect_ex(('localhost',MADAPORT)) + + os.chdir(cwd) + + self.session = requests.Session() + + def stop_server(self): + self.session.close() + self.pid.kill() + print("Shut down MADAMIRA.") + + def process(self, text): + """ Returns madamira xml output for a string input """ + + query = io.StringIO() + query.write(Madamira.xml_prefix) + + for sentence in text: + query.write(Madamira.xml_seg_start) + query.write(sentence) + query.write(Madamira.xml_seg_end) + + query.write(Madamira.xml_suffix) + query.seek(0) + + response = self.session.post(Madamira.url, headers=Madamira.headers, data=query.read().encode('utf8')) + + response.encoding = "utf8" + + return MadamiraOutput(response.text) + + def process_sentence(self, text): + """ Returns madamira xml output for a word string input """ + + query = io.StringIO() + query.write(Madamira.xml_prefix) + + query.write(Madamira.xml_seg_start) + query.write(text) + query.write(Madamira.xml_seg_end) + + query.write(Madamira.xml_suffix) + query.seek(0) + + response = requests.post(Madamira.url, headers=Madamira.headers, data=query.read().encode('utf8')) + + response.encoding = "utf8" + + return [word for doc in MadamiraOutput(response.text).docs() for sent in doc.sentences() for word in sent.words()] + +class MadamiraOutput: + def __init__(self, xmltext): + self.xml = xmltext.encode("utf8") + + def docs(self): + # madamira config prefix + mp=Madamira.config_prefix + + # get an iterable TODO use raw string? + # wrapper = codecs.StreamReader(io.StringIO(self.xml), "utf8") + context = iterparse(io.BytesIO(self.xml), events=("start", "end")) + + # turn it into an iterator + context = iter(context) + + # get the root element + event, root = context.__next__() + + for event, elem in context: + + # parse each doc + if event == 'end' and elem.tag == mp+'out_doc': + + yield MadamiraDoc(elem) + + # don't keep the doc in memory + root.clear()#find(mp+'madamira_output').clear() + +class MadamiraDoc: + def __init__(self, elem): + self.elem = elem + + def sentences(self): + mp = Madamira.config_prefix + + for sentence in self.elem.iter(mp+'out_seg'): + + yield MadamiraSentence(sentence) + +class MadamiraSentence: + def __init__(self, sentence): + self.sentence = sentence + + def words(self): + mp = Madamira.config_prefix + + for word in self.sentence.find(mp+'word_info').iter(mp+'word'): + + yield MadamiraWord(word) + + def chunks(self): + mp = Madamira.config_prefix + + # should just be one segment_info per out_seg + # parse each chunk in segment, looking for noun phrases + for chunk in sentence.find(mp+'segment_info').find(mp+'bpc').iter(mp+'chunk'): + yield MadamiraChunk(chunk) + +class MadamiraWord: + """ + Use the get_attribute function to return a selected attribute + + Example XML Word: + + + + + + + + + + + + + + + + + + + """ + svm_predictions_attribute_names_ = ['diac', 'lemma', 'pos', 'prc3', 'prc2', 'prc1', 'prc0', 'per', 'asp', 'vox', 'mod', 'gen', 'num', 'stt', 'cas', 'enc0'] + + def __init__(self, word): + self.word = word + + + + def lemma(self): + mp = Madamira.config_prefix + + # grab the lemma data + lemma = self.word.find(mp+'svm_prediction').find(mp+'morph_feature_set').get('lemma') + + # strip down to the arabic script + if not lemma: + return "" + elif len(lemma) == 0: + return "" + else: + norm_lemma = 
norm.normalize_charset(lemma).strip() + if len(norm_lemma) == 0: + return lemma + else: + return norm_lemma + + def get_orig_word(self): + mp = Madamira.config_prefix + + return self.word.attrib['word'] + + + def pos(self): + mp = Madamira.config_prefix + + return self.word.find(mp+'svm_prediction').find(mp+'morph_feature_set').get('pos') + + def pos_gender(self): + mp = Madamira.config_prefix + + return self.word.find(mp+'svm_prediction').find(mp+'morph_feature_set').get('gen') + + def get_attribute(self, attribute=''): + mp = Madamira.config_prefix + return self.word.find(mp+'svm_prediction').find(mp+'morph_feature_set').get(attribute) + + def tokens(self): + mp = Madamira.config_prefix + + # grab the lemma data + tokens = [] + for token in self.word.find(mp+"tokenized[@scheme='MyD3']").iter(mp+'tok'): + tokens.append(token.get('form0')) + + return tokens + + +class MadamiraChunk: + def __init__(self, chunk): + self.chunk = chunk + + def type(self): + return self.get('type') + + # def tokens(): + # # combine tokens into phrase + # for tok in chunk.iter(mp+'tok'): + # segment = tok.get('form0') + + # if segment[-1] == '+': + # noun_phrase += segment[:-1] + # elif segment[0] == '+': + # # if it is a suffix prep, attach it to prev token + # if len(noun_phrase) > 0: + # noun_phrase = noun_phrase[:-1] + segment[1:] + # else: + # noun_phrase = segment[1:] + # else: + # noun_phrase += segment + '_' + + # # drop the last underscore and add to the np sentence + # if noun_phrase[-1] == '_': + # noun_phrase = noun_phrase[:-1] + # sent += noun_phrase+' ' + + + +def transform_sentence_file(sentence_file, + lemmaout="lemmas.txt", + tokenout="token.txt", + posout="pos.txt", + lemmas=True, + pos=False, + tokens=False): + """returns filenames of lemmas and pos files""" + with Madamira() as m: + + # open output files + lemma_out = None + lemma_buff = None + if lemmas: + lemma_buff = io.StringIO() + lemma_file = lemmaout + lemma_out = open(lemma_file, 'w') + + pos_out = None + pos_buff = None + if pos: + pos_buff = io.StringIO() + pos_file = posout + pos_out = open(pos_file, 'w') + + token_out = None + token_buff = None + if tokens: + token_buff = io.StringIO() + token_file = tokenout + token_out = open(token_file, 'w') + + # read files into a list, or buffer the sentences one at a time, of sentences + # sentence_list = open(sentence_file).read().splitlines() + with open(sentence_file, 'r') as sentences: + for sentence in sentences: + + out = m.process([sentence]) + + for doc in out.docs(): + for sent in doc.sentences(): + + for word in sent.words(): + if lemmas: + lemma_buff.write(word.lemma()) + lemma_buff.write(" ") + if pos: + pos_buff.write(word.pos()) + pos_buff.write(" ") + if tokens: + for token in word.tokens(): + token_buff.write(token) + token_buff.write(" ") + + # for chunk in sent.chunks() + # if tokens: + # token_list = word.tokens() + # for token in token_list: + # token_buff.write(token) # TODO + # token_buff.write(" ") + + if lemmas: + + lemma_buff.seek(0) + lemma_out.write(lemma_buff.read().rstrip().encode('utf8')) + lemma_out.write('\n') + lemma_buff.close() + lemma_buff = io.StringIO() + + if pos: + pos_buff.seek(0) + pos_out.write(pos_buff.read().rstrip().encode('utf8')) + pos_out.write('\n') + pos_buff.close() + pos_buff = io.StringIO() + + if tokens: + token_buff.seek(0) + token_out.write(token_buff.read().rstrip().encode('utf8')) + token_out.write('\n') + token_buff.close() + token_buff = io.StringIO() + + if lemmas: + lemma_buff.close() + lemma_out.close() + if pos: + 
pos_buff.close() + pos_out.close() + if tokens: + token_buff.close() + token_out.close() + + + return [lemma_file, pos_file, token_file] + + + + +# def save_lemmatization(xml_mada_fn, out_fn): +# """ +# Saves a lemmatization from a madamira xml output file +# """ + +# # open the output file +# outfile = codecs.open(out_fn, 'w', "utf-8") + +# # madamira config prefix +# mp='{urn:edu.columbia.ccls.madamira.configuration:0.1}' + +# # get an iterable +# context = iterparse(xml_mada_fn, events=("start", "end")) + +# # turn it into an iterator +# context = iter(context) + +# # get the root element +# event, root = context.next() + +# for event, elem in context: + +# # parse each sentence +# if event == 'end' and elem.tag == mp+'out_seg': + +# # construct the sentence, then write once per sentence +# sent = '' + +# # should just be one word_info per out_seg +# # parse each word in word_info +# for word in elem.find(mp+'word_info').iter(mp+'word'): + +# # grab the lemma data +# lemma = word.find(mp+'svm_prediction').find(mp+'morph_feature_set').get('lemma') + +# # strip all but arabic script TODO +# lemma = lemma.split('_')[0] + +# # normalize the script +# lemma = norm.normalize(lemma) + +# sent += lemma +# sent += ' ' + +# # write the sentence out (without last space) +# outfile.write(sent[:-1]+'\n') + +# # don't keep the sentence in memory +# root.find(mp+'out_doc').clear() + +# elif event == 'end' and elem.tag == mp+'out_doc': +# outfile.write('#ENDDOC#\n') + +# def save_noun_phrases(xml_mada_fn, out_fn): +# """ +# Saves noun phrases from a madamira xml output file +# """ + +# # open the output file +# outfile = codecs.open(out_fn, 'w', "utf-8") + +# # madamira config prefix +# mp='{urn:edu.columbia.ccls.madamira.configuration:0.1}' + +# # get an iterable +# context = iterparse(xml_mada_fn, events=("start", "end")) + +# # turn it into an iterator +# context = iter(context) + +# # get the root element +# event, root = context.next() + +# for event, elem in context: + +# # parse each sentence +# if event == 'end' and elem.tag == mp+'out_seg': + +# # construct the sentence, then write once per sentence +# sent = '' + +# # should just be one segment_info per out_seg +# # parse each chunk in segment, looking for noun phrases +# for chunk in elem.find(mp+'segment_info').find(mp+'bpc').iter(mp+'chunk'): + +# # identify noun phrases +# if chunk.get('type') == 'NP': + +# # we build noun phrases with underscores between words +# noun_phrase = '' + +# # combine tokens into phrase +# for tok in chunk.iter(mp+'tok'): +# segment = tok.get('form0') + +# if segment[-1] == '+': +# noun_phrase += segment[:-1] +# elif segment[0] == '+': +# # if it is a suffix prep, attach it to prev token +# if len(noun_phrase) > 0: +# noun_phrase = noun_phrase[:-1] + segment[1:] +# else: +# noun_phrase = segment[1:] +# else: +# noun_phrase += segment + '_' + +# # drop the last underscore and add to the np sentence +# if noun_phrase[-1] == '_': +# noun_phrase = noun_phrase[:-1] +# sent += noun_phrase+' ' + +# # write the noun phrase sentence out (without last space) +# outfile.write(sent[:-1]+'\n') + +# # don't keep the segment in memory +# root.find(mp+'out_doc').clear() + +# elif event == 'end' and elem.tag == mp+'out_doc': +# outfile.write('#ENDDOC#\n') + +# def save_noun_phrase_graph(xml_mada_fn, out_fn, window = 5): +# """ +# TODO figure out how expensive this is, implement in scala/spark next +# Saves a noun phrase graph from a madamira xml output file +# """ + +# # edges (nodeid, nodeid, dist) +# edges = [] + +# # 
vertices (long hash(str) : str (noun phrase)) +# vertices = {} + +# # mentions list +# mentions_list = [] + +# # distance to add edges at +# distance = 10 + +# # open the output file +# outfile = codecs.open(out_fn, 'w', "utf-8") + +# # madamira config prefix +# mp='{urn:edu.columbia.ccls.madamira.configuration:0.1}' + +# # get an iterable +# context = iterparse(xml_mada_fn, events=("start", "end")) + +# # turn it into an iterator +# context = iter(context) + +# # get the root element +# event, root = context.next() + +# # document token tracking +# tokens_so_far = 0 + +# for event, elem in context: + +# # parse each sentence +# if event == 'end' and elem.tag == mp+'out_seg': + +# # should just be one segment_info per out_seg +# # parse each chunk in segment, looking for noun phrases +# for chunk in elem.find(mp+'segment_info').find(mp+'bpc').iter(mp+'chunk'): + +# # identify noun phrases +# if chunk.get('type') == 'NP': + +# # we build noun phrases with underscores between words +# noun_phrase = '' + +# # noun phrase starts on next token +# noun_phrase_start = tokens_so_far + 1 + +# # combine tokens into phrase +# for tok in chunk.iter(mp+'tok'): + +# tokens_so_far += 1 + +# segment = tok.get('form0') + +# # builds phrase +# if segment[-1] == '+': +# noun_phrase += segment[:-1] +# elif segment[0] == '+': +# # if it is a suffix prep, attach it to prev token +# if len(noun_phrase) > 0: +# noun_phrase = noun_phrase[:-1] + segment[1:] +# else: +# noun_phrase = segment[1:] +# else: +# noun_phrase += segment + '_' + +# # drop the last underscore and add to the np sentence +# noun_phrase = noun_phrase.strip('_') + +# # noun phrase ended on last token +# noun_phrase_end = tokens_so_far + +# np_hash = hash(noun_phrase) +# vertices[np_hash] = noun_phrase + +# mentions_list.append([np_hash, noun_phrase_start, noun_phrase_end]) + +# else: +# for tok in chunk.iter(mp+'tok'): +# tokens_so_far += 1 + +# # don't keep the segment in memory +# root.find(mp+'out_doc').clear() + +# elif event == 'end' and elem.tag == mp+'out_doc': +# # add edges from last document +# for start in range(0, len(mentions_list) - 10): +# end = start + 10 +# head = start +# tail = start + 1 +# in_range = True +# while in_range: +# if abs(mentions_list[tail][1] - mentions_list[head][2]) <= distance: +# if (mentions_list[tail][1]-mentions_list[head][2] > 0): +# dist = mentions_list[tail][1]-mentions_list[head][2] +# else: +# dist = 0 + +# edges.append([mentions_list[head][0], mentions_list[tail][0], dist]) +# edges.append([mentions_list[tail][0], mentions_list[head][0], dist]) + +# tail += 1 + +# else: +# in_range = False + +# np.savetxt("edges.csv", np.array(edges), delimiter=",", fmt='%i') +# writer = csv.writer(open('vertices.csv','wb')) +# for key, value in vertices.items(): +# writer.writerow([key, value.encode('utf-8')]) + +# def raw_save_lemmatization(raw_mada_fn, out_fn): +# """ +# Saves a lemmatization from a madamira raw output file +# """ +# mada = codecs.open(mada_fn, 'r', "utf-8") + +# p = re.compile(r'lex:[^\s_]+', re.UNICODE) +# start_of_line = True + +# with codecs.open(out_fn, 'w', 'utf-8') as outfile: +# for line in mada: + +# if line == 'SENTENCE BREAK\n': +# outfile.write('\n') +# start_of_line = True +# elif line.startswith('*'): +# m = p.findall(line) +# if m: +# print(m) +# if not start_of_line: +# outfile.write(' ') +# outfile.write(norm.normalize(m[0][4:])) +# start_of_line = False + +# def raw_save_noun_phrase_graph(raw_bpcbio_fn, out_fn, window = 5): +# """ +# TODO figure out how expensive this is, 
implement in scala/spark next +# Saves a noun phrase graph from a madamira raw output file +# """ + +# # edges (nodeid, nodeid, dist) +# edges = [] + +# # vertices (long hash(str) : str (noun phrase)) +# vertices = {} + +# # mentions list +# mentions_list = [] + +# # distance to add edges at +# distance = 10 + +# # open the output file +# outfile = codecs.open(out_fn, 'w', "utf-8") + +# # get an iterable +# context = codecs.open(raw_bpcbio_fn, 'r', 'utf-8') + +# # document token tracking +# tokens_so_far = 0 + +# # keep track of NP construction +# noun_phrase = '' +# noun_phrase_start = 0 +# noun_phrase_end = 0 + +# # count sentences if not broken into docs +# sentence_idx = 1 + +# # count docs +# doc_count = 1 + +# with codecs.open('raw_edges.csv', 'a') as edge_file: +# with codecs.open(raw_bpcbio_fn, 'r', 'utf-8') as context: +# while True: +# line = context.readline() + +# if not line or sentence_idx % 100 == 0: + +# sentence_idx = 1 +# print('Doc: ', doc_count, 'Vertices: ', len(vertices)) +# doc_count += 1 + +# if noun_phrase != '': +# noun_phrase = noun_phrase.strip('_') +# noun_phrase_end = tokens_so_far - 1 +# np_hash = hash(noun_phrase) +# vertices[np_hash] = noun_phrase +# mentions_list.append([np_hash, noun_phrase_start, noun_phrase_end]) +# noun_phrase = '' + +# # edge_count = 0 + +# # add edges from last document +# for start in range(0, len(mentions_list) - 10): +# # end = start + 10 +# head = start +# tail = start + 1 +# in_range = True +# while in_range and tail < len(mentions_list): +# if abs(mentions_list[tail][1] - mentions_list[head][2]) <= distance: +# if (mentions_list[tail][1]-mentions_list[head][2] > 0): +# dist = mentions_list[tail][1]-mentions_list[head][2] +# else: +# dist = 0 + +# edges.append([mentions_list[head][0], mentions_list[tail][0], dist]) +# edges.append([mentions_list[tail][0], mentions_list[head][0], dist]) + +# # edge_count += 2 +# tail += 1 + +# else: +# in_range = False + +# # print('Adding: ', edge_count, ' edges.') + +# np.savetxt(edge_file, np.array(edges), delimiter=",", fmt='%i') + + +# edges = [] +# mentions_list = [] + +# if not line: +# writer = csv.writer(open('raw_vertices.csv','wb')) +# for key, value in vertices.items(): +# writer.writerow([key, value.encode('utf-8')]) +# break + + +# if line.strip() == "": + +# # end of sentence +# sentence_idx += 1 + +# else: + +# tokens_so_far += 1 + +# text, bpc_type = line.strip().split("\t") + +# if (bpc_type == 'B-NP' or bpc_type != 'I-NP') and noun_phrase != '': + +# noun_phrase = noun_phrase.strip('_') +# noun_phrase_end = tokens_so_far - 1 +# np_hash = hash(noun_phrase) +# vertices[np_hash] = noun_phrase +# mentions_list.append([np_hash, noun_phrase_start, noun_phrase_end]) +# noun_phrase = '' + +# if bpc_type == 'B-NP': + +# # noun phrase starts on this token +# noun_phrase_start = tokens_so_far + +# if text[-1] == '+': +# noun_phrase += text[:-1] +# elif text[0] == '+': +# noun_phrase = text[1:] +# else: +# noun_phrase += text + '_' + +# elif bpc_type == 'I-NP': + +# if text[-1] == '+': +# noun_phrase += text[:-1] +# elif text[0] == '+': +# # if it is a suffix prep, attach it to prev token +# if len(noun_phrase) > 0: +# noun_phrase = noun_phrase[:-1] + text[1:] +# else: +# noun_phrase = text[1:] +# else: +# noun_phrase += text + '_' \ No newline at end of file diff --git a/Madamira-Arapy/arapy/normalization.py b/Madamira-Arapy/arapy/normalization.py new file mode 100644 index 0000000..544c05a --- /dev/null +++ b/Madamira-Arapy/arapy/normalization.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python 
+# coding: utf-8
+
+### Purpose: Arabic script normalization tools
+
+from __future__ import absolute_import
+from __future__ import print_function
+
+import re
+import codecs
+
+# regex for arabic chars
+inv_arabic_charset = re.compile(r'[^\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff\u0030-\u0039\n\.]+', re.UNICODE)
+
+def normalize(text, ar_only=True, digits=False, alif=True, hamza=True, yaa=True, tashkil=True):
+    """
+    Normalizes arabic text
+    Removes non-arabic chars by default
+    Changes all numerals to # if digits is true, default false
+    Normalizes alif, hamza, and yaa by default
+    Removes supplementary diacritics
+    """
+    if ar_only:
+        text = normalize_charset(text)
+
+    if digits:
+        text = normalize_digits(text)
+
+
+    if alif:
+        text = normalize_alif(text)
+    if hamza:
+        text = normalize_hamza(text)
+    if yaa:
+        text = normalize_yaa(text)
+
+    if tashkil:
+        text = remove_tashkil(text)
+
+    return text
+
+def normalize_sentence_file(sentence_file, outfile_path="normal.txt", ar_only=True, digits=True, alif=True, hamza=True, yaa=True, tashkil=True):
+    """
+    Normalizes a file of sentences and saves the result to outfile_path
+    returns the outfile name
+    """
+
+    # outfile_path = (sentence_file.split('.')[0]+
+    #                 "_ar_only"+str(ar_only)+
+    #                 "_digits"+str(digits)+
+    #                 "_alif"+str(alif)+
+    #                 "_hamza"+str(hamza)+
+    #                 "_yaa"+str(yaa)+
+    #                 "_tashkil"+str(tashkil)+
+    #                 ".txt")
+
+    with open(sentence_file, 'r') as infile:
+        with open(outfile_path, 'w') as outfile:
+            for text in infile:
+                text = text.decode('utf8')
+
+                text = normalize(text, ar_only=ar_only, digits=digits, alif=alif, hamza=hamza, yaa=yaa, tashkil=tashkil)
+
+                if text:
+                    outfile.write(text.encode('utf8'))
+
+    return outfile_path
+
+def remove_tashkil(text):
+    """ removes set of arabic supplementary diacritics """
+    text = remove_harakat(text)
+    text = remove_tanwin(text)
+    text = remove_shaddah(text)
+    text = remove_kashida(text)
+    return text
+
+#####################
+### Normalization ###
+#####################
+
+def normalize_charset(text):
+    return inv_arabic_charset.sub(' ', text)
+
+def normalize_digits(text):
+    """ replaces all forms of numbers with # """
+    return re.sub(r'[0123456789٠١٢٣٤٥٦٧٨٩]', r'#', text)
+
+def normalize_alif(text):
+    """ replaces all forms of alif with ا """
+    return re.sub(r'[إأٱآا]', r'ا', text)
+
+def normalize_yaa(text):
+    """ replaces ى with ي """
+    return re.sub(r'ى', r'ي', text)
+
+def normalize_hamza(text, normalize_alif = False):
+    """
+    replaces hamza on seats with ء
+    does not include alif seats by default
+    set normalize_alif=True to replace إأ with hamza
+    """
+    if normalize_alif:
+        return re.sub(r'[ؤئإأ]', r'ء', text)
+    else:
+        return re.sub(r'[ؤئ]', r'ء', text)
+
+#######################
+### Tashkil removal ###
+#######################
+
+def remove_harakat(text):
+    """
+    removes short vowel marks
+    does not normalize alif forms
+    does not remove tanwin (ًٌٍ) (use remove_tanwin)
+    """
+    return re.sub(r'[َُِْ]', r'', text)
+
+def remove_tanwin(text):
+    """
+    removes tanwin vowel marks
+    does not normalize alif forms
+    """
+    return re.sub(r'[ًٌٍ]', r'', text)
+
+def remove_shaddah(text):
+    """
+    removes the shaddah mark (tashdid)
+    """
+    return re.sub(r'[ّ]', r'', text)
+
+def remove_kashida(text):
+    """
+    removes the kashida elongation mark (tatwil)
+    """
+    return re.sub(r'[ـ]', r'', text)
+
+##################################
+### Buckwalter transliteration ###
+##################################
+
+def unicode_to_bw(string,
reverse=0): + """ + Given a Unicode string, transliterate into Buckwalter. + To go from Buckwalter back to Unicode, set reverse=1. + Partially taken from https://github.com/andyroberts/buckwalter2unicode + """ + + buck2uni = {"'": u"\u0621", # hamza-on-the-line + "|": u"\u0622", # madda + ">": u"\u0623", # hamza-on-'alif + "&": u"\u0624", # hamza-on-waaw + "<": u"\u0625", # hamza-under-'alif + "}": u"\u0626", # hamza-on-yaa' + "A": u"\u0627", # bare 'alif + "b": u"\u0628", # baa' + "p": u"\u0629", # taa' marbuuTa + "t": u"\u062A", # taa' + "v": u"\u062B", # thaa' + "j": u"\u062C", # jiim + "H": u"\u062D", # Haa' + "x": u"\u062E", # khaa' + "d": u"\u062F", # daal + "*": u"\u0630", # dhaal + "r": u"\u0631", # raa' + "z": u"\u0632", # zaay + "s": u"\u0633", # siin + "$": u"\u0634", # shiin + "S": u"\u0635", # Saad + "D": u"\u0636", # Daad + "T": u"\u0637", # Taa' + "Z": u"\u0638", # Zaa' (DHaa') + "E": u"\u0639", # cayn + "g": u"\u063A", # ghayn + "_": u"\u0640", # taTwiil + "f": u"\u0641", # faa' + "q": u"\u0642", # qaaf + "k": u"\u0643", # kaaf + "l": u"\u0644", # laam + "m": u"\u0645", # miim + "n": u"\u0646", # nuun + "h": u"\u0647", # haa' + "w": u"\u0648", # waaw + "Y": u"\u0649", # 'alif maqSuura + "y": u"\u064A", # yaa' + "F": u"\u064B", # fatHatayn + "N": u"\u064C", # Dammatayn + "K": u"\u064D", # kasratayn + "a": u"\u064E", # fatHa + "u": u"\u064F", # Damma + "i": u"\u0650", # kasra + "~": u"\u0651", # shaddah + "o": u"\u0652", # sukuun + "`": u"\u0670", # dagger 'alif + "{": u"\u0671", # waSla + } + + # For a reverse transliteration (Unicode -> Buckwalter), a dictionary + # which is the reverse of the above buck2uni is essential. + + uni2buck = {} + + # Iterate through all the items in the buck2uni dict. + for (key, value) in buck2uni.iteritems(): + # The value from buck2uni becomes a key in uni2buck, and vice + # versa for the keys. + uni2buck[value] = key + + if not reverse: + for k,v in buck2uni.iteritems(): + string = string.replace(v,k) + + else: + for k,v in buck2uni.iteritems(): + string = string.replace(k,v) + + return string + + diff --git a/Madamira-Arapy/arapy/thesaurus.py b/Madamira-Arapy/arapy/thesaurus.py new file mode 100644 index 0000000..b1426db --- /dev/null +++ b/Madamira-Arapy/arapy/thesaurus.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +# coding: utf-8 + +### Purpose: Arabic word thesaurus simulation tools + +from __future__ import absolute_import +from __future__ import print_function + +import arapy.translate as trans +import logging +import sys +import json +import requests +import urllib2 + + +API_KEY = '80901dbb851efc07b4bd747ba3ead0ae' # API key is available from here - http://words.bighugelabs.com/getkey.php +URL_MASK = 'http://words.bighugelabs.com/api/2/{1}/{0}/json' +RELATIONSHIP_ABBR = {'syn':'Synonyms','ant':'Antonyms','rel':'Related terms','sim':'Similar terms','usr':'User suggestions', None:'All'} + +def thesaurus(word, relation=None, ngram=0, ar=False, target_result_count=0): + """ + Uses bighugelabs thesaurus API + requires the API key available http://words.bighugelabs.com/getkey.php + + Translation is done with google translate in the translate module. 
Requires google api key + + Takes in a word and retreives a list of related words where + the relationship is given by one key from {'syn':'Synonyms','ant':'Antonyms','rel':'Related terms','sim':'Similar terms','usr':'User suggestions', None:'All'} + the words are filtered by ngram, 0 for all + if ar = 0, the word is translated before and after from arabic + target_result_count is the number of words to return with + + returns a dictionary where keys are the requested relationships, and values are lists of ngrams matching that relationship + returns empty dictionary if the thesaurus didn't have any results + """ + + gs = None + if ar: + translations = trans.translate_list([word], 'en', 'ar') + if len(translations) > 0: + word = translations[0] + else: + logging.info("Couldn't translate word: "+str(word)+" to english") + return {} + + if not word: + logging.info("Translated word is empty.") + return {} + + # format and make the request + url = URL_MASK.format(urllib2.quote(word.encode('utf-8')), API_KEY) + result = requests.get(url) + + if not result.text: + logging.info("Thesaurus had no info for word:"+word.encode('utf-8')) + return {} + + json_result = json.loads(result.text) + + # our relationship dictionary + words = {} + word_count = 0 + + # for each sense of the word + for pos in json_result: + + # we want only the requested relations + for rel in json_result[pos]: + if relation == None or relation == rel: + + # each word matching the relationship + for w in json_result[pos][rel]: + + candidate = w + + # we only want so many results + if target_result_count == 0 or word_count < target_result_count: + + if ar == 1: + translations = trans.translate_list([candidate],'ar','en') + if len(translations) > 0: + candidate = translations[0] + else: + logging.info("Couldn't translate word: "+str(candidate)+" to arabic") + return {} + + if ngram == 0 or len(candidate.split(" ")) == ngram: + if not rel in words: + words[rel] = [] + words[rel].append(candidate) + word_count+=1 + + else: + # we have enough results + return words + return words + + + diff --git a/Madamira-Arapy/arapy/translate.py b/Madamira-Arapy/arapy/translate.py new file mode 100644 index 0000000..85ca3b1 --- /dev/null +++ b/Madamira-Arapy/arapy/translate.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# coding: utf-8 + +### Purpose: Arabic word translation tools for google translate api: https://cloud.google.com/translate/v2/pricing + +from __future__ import absolute_import +from __future__ import print_function + +import logging +import sys +import json +import requests +import urllib2 + + +GOOGLE_API_KEY = 'AIzaSyAAScZ3-Ut-1sxn5gsSLzxzXJgzn3jGsN4' +URL_MASK = 'https://www.googleapis.com/language/translate/v2?key={0}{1}&source={2}&target={3}' + +def translate_list(words, target='ar', source='en'): + """ + Translates a word with the google translate api + Requires an API key from the google developers console + Target is the language target, ie 'ar' + """ + + # format the words for the url + formatted_words="" + for word in words: + formatted_words += "&q=" + urllib2.quote(word) + + # format the url for the get + url = URL_MASK.format(GOOGLE_API_KEY, formatted_words, source, target) + result = requests.get(url) + + if not result.text: + logging.info("Google responded with no translations, check api key.") + return [] + + json_result = json.loads(result.text) + + if not 'data' in json_result: + logging.info("Google result had no data element, check api key.") + return [] + + # parse the result + translations = [] + for translation in 
json_result['data']['translations']: + trans_word = translation['translatedText'] + logging.info("Translated "+str(word)+ " to "+trans_word.encode('utf-8')) + translations.append(trans_word) + + return translations \ No newline at end of file diff --git a/Madamira-Arapy/arapy/word2vec.py b/Madamira-Arapy/arapy/word2vec.py new file mode 100644 index 0000000..f65898d --- /dev/null +++ b/Madamira-Arapy/arapy/word2vec.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python +# coding: utf-8 + +### Purpose: Arabic word embedding tools + +from __future__ import absolute_import +from __future__ import print_function + +from gensim.models import Word2Vec +from gensim.models.word2vec import LineSentence +import logging +import sys + +def train_embeddings(infile, outfile = "embedding.txt", sg=1, size=100, seed = 0, window=8, min_count=5, + sample=1e-4, hs=0, negative=25, iterations=15): + """ + Saves the model to a file with the parameters in the name. + All of these functions work on any language of corpora + Uses gensim's training parameters: + + Initialize the model from an iterable of `sentences`. Each sentence is a + list of words (unicode strings) that will be used for training. + + The `sentences` iterable can be simply a list, but for larger corpora, + consider an iterable that streams the sentences directly from disk/network. + See :class:`BrownCorpus`, :class:`Text8Corpus` or :class:`LineSentence` in + this module for such examples. + + If you don't supply `sentences`, the model is left uninitialized -- use if + you plan to initialize it in some other way. + + `sg` defines the training algorithm. By default (`sg=1`), skip-gram is used. Otherwise, `cbow` is employed. + + `size` is the dimensionality of the feature vectors. + + `window` is the maximum distance between the current and predicted word within a sentence. + + `alpha` is the initial learning rate (will linearly drop to zero as training progresses). + + `seed` = for the random number generator. Initial vectors for each + word are seeded with a hash of the concatenation of word + str(seed). + + `min_count` = ignore all words with total frequency lower than this. + + `sample` = threshold for configuring which higher-frequency words are randomly downsampled; + default is 0 (off), useful value is 1e-5. + + `workers` = use this many worker threads to train the model (=faster training with multicore machines). + + `hs` = if 1 (default), hierarchical sampling will be used for model training (else set to 0). + + `negative` = if > 0, negative sampling will be used, the int for negative + specifies how many "noise words" should be drawn (usually between 5-20). + + `cbow_mean` = if 0 (default), use the sum of the context word vectors. If 1, use the mean. + Only applies when cbow is used. + + `hashfxn` = hash function to use to randomly initialize weights, for increased + training reproducibility. Default is Python's rudimentary built in hash function. + + `iter` = number of iterations (epochs) over the corpus. 
+ """ + # set up logging + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO) + + # files are iterated over with this object + # class MySentences(object): + # def __init__(self, fname): + # self.fname = fname + # self.errors = 0 + + # def __iter__(self): + # for line in open(self.fname): + # yield line.split() + + # sentences = MySentences(infile) + sentences = LineSentence(infile) + + + model = Word2Vec(sentences, + sg = sg, + size = size, + window = window, + min_count = min_count, + hs = hs, + workers = 4, + sample = sample, + seed = seed, + negative = negative, + iter = iterations) + + model.save_word2vec_format(outfile, binary = True) + + return outfile + +def start_interactive_test_suite(): + """ + Loads a model, then allows interactive tests of: + ac - not interactive, rather loads an analogy file and outputs the results + one word most similar queries + two word similarity measures + three word analogy queries + four+ word odd one out queries + """ + + output_spacing = 25 + + modelfile = raw_input('Please enter the binary model file path: ')# (or gn/en/ar): ') + modelfile = modelfile.strip().strip('\'') + + # if modelfile == 'gn': + # modelfile = '/Users/king96/Documents/Word2Vec/Models/google_news_vecs.bin' + # elif modelfile == 'ar': + # modelfile = '/Users/king96/Documents/Word2Vec/Models/ar_wiki_seg_vecs.bin' + # elif modelfile == 'en': + # modelfile = '/Users/king96/Documents/Word2Vec/Models/en_wiki_vecs.bin' + + # set up logging + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', + level=logging.INFO) + + # load model + model = Word2Vec.load_word2vec_format(modelfile, binary=True) + + while True: + + # offer the menu + print('\n') + print('Type ac to run accuracy tests.') + print('Enter one word for neighbors, two for distance,') + print('three for analogy, more for matching, q to quit.') + words = raw_input('Word: ') + + words = words.decode('UTF-8', 'replace') + + if words == 'q': + break + + if words == 'ac': + print('Please enter the questions file to test on:') + + questions = raw_input('File: ').strip() + + model.accuracy(questions, restrict_vocab = 30000, tries = 5) + continue + + # the remaining options take 0 < n query words + words = words.split(' ') + + if len(words) == 0: + continue + + # top 10 words + elif len(words) == 1: + try: + candidates = model.most_similar(words[0], topn=10) + print('Candidates'.rjust(output_spacing), 'Cos Distance'.rjust(output_spacing)) + for word in candidates: + print(str(word[0].encode('UTF-8','replace')).rjust(output_spacing), + str(word[1]).rjust(output_spacing)) + except KeyError as ke: + print(ke.message.encode('utf-8','replace')) + + + # pair similarity + elif len(words) == 2: + try: + print('Similarity is : ' + str(model.similarity(words[0],words[1]))) + except KeyError as ke: + print(ke.message.encode('utf-8','replace')) + + # analogy + elif len(words) == 3: + try: + candidates = model.most_similar(positive=[words[2], words[1]], + negative = [words[0]], + topn=10) + + print('Candidates'.rjust(output_spacing), 'Cos Distance'.rjust(output_spacing)) + for word in candidates: + print(str(word[0].encode('UTF-8', 'replace')).rjust(output_spacing), + str(word[1]).rjust(output_spacing)) + except KeyError as ke: + print(ke.message.encode('utf-8','replace')) + + # odd one out + else: + try: + print('Odd one out: ' + str(model.doesnt_match(words).encode('utf-8', 'replace'))) + except KeyError as ke: + print(ke.message.encode('utf-8','replace')) + +def start_query_expander(): 
+    modelfile = raw_input('Please enter the binary model file path: ')
+    modelfile = modelfile.strip()
+
+    # if modelfile == 'gn':
+    #     modelfile = '/Users/king96/Documents/Word2Vec/Models/google_news_vecs.bin'
+
+    # set up logging
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
+                        level=logging.INFO)
+
+    # load model
+    model = Word2Vec.load_word2vec_format(modelfile, binary=True)
+
+    while True:
+
+        words = raw_input('\nEnter words to expand, q to quit: ')
+
+        words = words.decode('UTF-8', 'replace')
+
+        if words == 'q':
+            break
+
+        words = words.split(' ')
+
+        if len(words) == 0:
+            continue
+
+        # top 10 words
+        else:
+            expansion = set()
+
+            for word in words:
+                try:
+                    expansion = expansion | set([x[0] for x in model.most_similar(word, topn=10)])
+                except KeyError as ke:
+                    print(ke.message.encode('utf-8','replace'))
+
+            print('Expansion')
+            for word in expansion:
+                print(str(word.encode('UTF-8','replace')))
\ No newline at end of file
diff --git a/Madamira-Arapy/test_madamira.py b/Madamira-Arapy/test_madamira.py
new file mode 100644
index 0000000..8981ed3
--- /dev/null
+++ b/Madamira-Arapy/test_madamira.py
@@ -0,0 +1,17 @@
+#%%
+from arapy.madamira import Madamira
+
+#%%
+text = "ما هي صفات السبعين ألفا الذين يدخلون الجنة بغير حساب"
+
+with Madamira() as m:
+    out = m.process([text])
+
+# %%
+for doc in out.docs():
+    for sent in doc.sentences():
+        for word in sent.words():
+            print(word.get_orig_word(), ": ", word.pos(), "--", word.get_attribute('gen'), "--", word.get_attribute('per'))
+
+
+# %%
diff --git a/README.md b/README.md
index 5d67b67..4b65523 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,6 @@
 # misc
 misc code and stuff
+
+`pdf_renamer`: automatically extracts the title from a PDF file (with a special case for files from the ACL Anthology)
+
+`Madamira-Arapy`: forked from https://github.com/jordanking/arapy; adds compatibility with Python 3.6+ and new functions that return morphological attributes (only the Madamira files were updated)
diff --git a/pdf_renamer/monitor.ps1 b/pdf_renamer/monitor.ps1
index 07993a8..1ff965c 100644
--- a/pdf_renamer/monitor.ps1
+++ b/pdf_renamer/monitor.ps1
@@ -1,7 +1,7 @@
 ### SET FOLDER TO WATCH + FILES TO WATCH + SUBFOLDERS YES/NO
 $watcher = New-Object System.IO.FileSystemWatcher
 $watcher.Path = "C:\Users\WISSAM-PC\Downloads\Documents\"
-$watcher.Filter = "*.*"
+$watcher.Filter = "*.pdf"
 $watcher.IncludeSubdirectories = $true
 $watcher.EnableRaisingEvents = $true
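
Taken together, the modules added in this diff form a pipeline: arwiki.py extracts sentences from a dump, normalization.py cleans them, madamira.py lemmatizes them through a local MADAMIRA server, and word2vec.py trains embeddings with the settings recommended in ResultsSummary.txt (CBOW, window=5, dim=200, negative=25, sample=1e-4, 15+ iterations on lemmatized words). The sketch below, which is not part of the diff, shows how the last two steps are intended to chain together. The file paths are placeholders; it assumes a MADAMIRA release unpacked under arapy/resources/ (as Madamira.start_server expects) plus the legacy gensim API that word2vec.py targets, and some of the I/O in these modules still follows Python 2 conventions that may need adapting.

```python
# Hypothetical usage sketch for the modules added in this diff; paths are placeholders.
from arapy.madamira import transform_sentence_file
from arapy.word2vec import train_embeddings

# 1) Lemmatize a file with one Arabic sentence per line via the local MADAMIRA
#    server (started and stopped by the Madamira context manager inside
#    transform_sentence_file).
lemma_file, pos_file, token_file = transform_sentence_file(
    "arwiki_sentences.txt",          # hypothetical input file
    lemmaout="arwiki_lemmas.txt",
    posout="arwiki_pos.txt",
    tokenout="arwiki_tokens.txt",
    lemmas=True, pos=True, tokens=True)

# 2) Train embeddings on the lemmas with the ResultsSummary.txt parameters:
#    CBOW (sg=0), 200 dimensions, window 5, negative=25, sample=1e-4, 15 iterations.
train_embeddings(lemma_file,
                 outfile="arwiki_cbow_d200.bin",
                 sg=0, size=200, window=5, min_count=5,
                 sample=1e-4, hs=0, negative=25, iterations=15)
```

In gensim, sg=0 selects CBOW, which is the configuration ResultsSummary.txt recommends over skip-gram for this data.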