diff --git a/.gitignore b/.gitignore
index b6e4761..30c3be0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,5 @@ dmypy.json
# Pyre type checker
.pyre/
+
+Madamira-Arapy/arapy/resources/
\ No newline at end of file
diff --git a/Madamira-Arapy/arapy/README.md b/Madamira-Arapy/arapy/README.md
new file mode 100644
index 0000000..169c63f
--- /dev/null
+++ b/Madamira-Arapy/arapy/README.md
@@ -0,0 +1,7 @@
+# Arapy
+Arabic text processing tools for Python. A work in progress.
+
+# Dependencies
+gensim for word2vec: pip install gensim
+goslate for translation: pip install goslate
+madamira package for NLP processing: http://nlp.ldeo.columbia.edu/madamira/
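+
+# Usage
+A minimal sketch of the MADAMIRA wrapper (it mirrors `test_madamira.py` in this repo and assumes the MADAMIRA release is unpacked under `arapy/resources/` so the bundled server can be started):
+
+```python
+from arapy.madamira import Madamira
+
+text = "ما هي صفات السبعين ألفا الذين يدخلون الجنة بغير حساب"
+
+with Madamira() as m:  # starts the local MADAMIRA server, stops it on exit
+    out = m.process([text])
+
+for doc in out.docs():
+    for sent in doc.sentences():
+        for word in sent.words():
+            print(word.get_orig_word(), word.lemma(), word.pos(), word.get_attribute('gen'))
+```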
diff --git a/Madamira-Arapy/arapy/__init__.py b/Madamira-Arapy/arapy/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Madamira-Arapy/arapy/arapy.py b/Madamira-Arapy/arapy/arapy.py
new file mode 100644
index 0000000..c6f7b83
--- /dev/null
+++ b/Madamira-Arapy/arapy/arapy.py
@@ -0,0 +1,4 @@
+### Arapy module!
+
+from __future__ import absolute_import
+from __future__ import print_function
diff --git a/Madamira-Arapy/arapy/arwiki.py b/Madamira-Arapy/arapy/arwiki.py
new file mode 100644
index 0000000..c208f72
--- /dev/null
+++ b/Madamira-Arapy/arapy/arwiki.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+### Purpose: Tools to parse arwiki dumps
+
+from __future__ import absolute_import
+from __future__ import print_function
+
+import arapy.normalization as norm
+import re
+import sys
+import codecs
+import xml.etree.cElementTree as etree
+
+def parse_arwiki_dump(dump_in, dump_out, split_at_punc=False, remove_non_arabic=False):
+ """
+ Reads in an unzipped arwiki dump.
+ Saves the text of the articles in a txt file with one sentence per line.
+ returns the name of the output file
+ """
+ # text tag that wiki uses to identify text content blocks
+ text_tag = '{http://www.mediawiki.org/xml/export-0.10/}text'
+ # wiki markup / HTML junk pattern (templates, internal links, headings, tags, external links, entities)
+ junkPattern = ur"(\{\{[^}]*\}\})|(\[\[[^\]]*\]\])|(\=\=\{[^}]*\})|(\=\=[^=]*\=\=)|(<[^>]*>(.*?)</[^>]*>)|(<[^>]*>)|(\[ht[^\]]*\])|(\{[^}]*\})|(&[a-z]+;)"
+ punctuationPattern = ur"[*|,\-#!<&>_+{:/$\\=()?.،'}%\";\[\]]"
+
+ with open(dump_in, 'r') as infile:
+ with open(dump_out, 'w') as outfile:
+
+ # iterate through the xml tree looking for tag starts
+ context = etree.iterparse(infile, events = ('start','end'))
+ context = iter(context)
+ event, root = context.next()
+
+ for event, elem in context:
+
+ # if the tag matches the wiki tag for text content, we extract the text
+ if event == 'end' and elem.tag == text_tag:
+
+ text = elem.text
+ #print(text)
+
+ # some text tags are empty
+ if text:
+
+ text = re.sub(junkPattern, '', text)
+
+ if remove_non_arabic:
+ text = norm.normalize_charset(text)
+
+ # move each sentence to a new line (rough regex)
+ if split_at_punc:
+ text = re.sub(r'[.!?]', '\n', text)
+
+ text = re.sub(punctuationPattern, '', text)
+
+ for line in text.split('\n'):
+ if line.strip() != '':
+ outfile.write((line+'\n').encode('utf8'))
+
+ # keep memory free of previous branches of the xml tree
+ root.clear()
+
+ return dump_out
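+
+# Example usage (a sketch; the file names are only illustrative):
+#
+# parse_arwiki_dump('arwiki-latest-pages-articles.xml', 'arwiki_sentences.txt',
+#                   split_at_punc=True, remove_non_arabic=True)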
\ No newline at end of file
diff --git a/Madamira-Arapy/arapy/info/ResultsSummary.txt b/Madamira-Arapy/arapy/info/ResultsSummary.txt
new file mode 100644
index 0000000..3fb6b4b
--- /dev/null
+++ b/Madamira-Arapy/arapy/info/ResultsSummary.txt
@@ -0,0 +1,9 @@
+Current optimal parameterization for generating arabic word vectors (tested on wiki data):
+
+CBOW, window=5, dim=200, neg/samp=25/1e-4, 15+ iterations, lemmatized words
+
+Some work that I've read uses 100 dimensions; I think 200 is better for large data sets.
+
+For extremely large datasets, some papers hypothesize that skip-grams may work better; I have not seen evidence of this myself.
+
+Similarly, larger datasets may be able to take advantage of higher dimensional vectors.
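+
+For reference, the corresponding call using this repo's word2vec module would look roughly like this (a sketch; the file names are only examples, and the input is assumed to be one lemmatized sentence per line):
+
+    from arapy.word2vec import train_embeddings
+    train_embeddings("wiki_lemmas.txt", outfile="wiki_cbow_200.bin",
+                     sg=0, size=200, window=5, negative=25,
+                     sample=1e-4, min_count=5, iterations=15)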
\ No newline at end of file
diff --git a/Madamira-Arapy/arapy/info/accuracy-notes.xlsx b/Madamira-Arapy/arapy/info/accuracy-notes.xlsx
new file mode 100644
index 0000000..82c16db
Binary files /dev/null and b/Madamira-Arapy/arapy/info/accuracy-notes.xlsx differ
diff --git a/Madamira-Arapy/arapy/madamira.py b/Madamira-Arapy/arapy/madamira.py
new file mode 100644
index 0000000..8f32b3f
--- /dev/null
+++ b/Madamira-Arapy/arapy/madamira.py
@@ -0,0 +1,859 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+### Purpose: Madamira output processing tools
+
+from __future__ import absolute_import
+from __future__ import print_function
+from xml.etree.cElementTree import iterparse
+
+import arapy
+import arapy.normalization as norm
+import codecs
+import csv
+import numpy as np
+import os
+import re
+import requests
+import socket
+import io
+import subprocess
+import time
+
+MADAPORT = 8223
+#94223
+
+class Madamira:
+ url="http://localhost:" + str(MADAPORT)
+ headers = {'Content-Type': 'application/xml'}
+ # default request template (adapted from MADAMIRA's sample input configuration);
+ # adjust the requested features / tokenization scheme for your installation if needed
+ xml_prefix = """<?xml version="1.0" encoding="UTF-8"?>
+<madamira_input xmlns="urn:edu.columbia.ccls.madamira.configuration:0.1">
+  <madamira_configuration>
+    <preprocessing sentence_ids="false" separate_punct="true" input_encoding="UTF8"/>
+    <overall_vars output_encoding="UTF8" dialect="MSA" output_analyses="TOP" morph_backoff="NONE"/>
+    <requested_output>
+      <req_variable name="PREPROCESSED" value="true" />
+      <req_variable name="DIAC" value="true" />
+      <req_variable name="LEMMA" value="true" />
+      <req_variable name="POS" value="true" />
+      <req_variable name="PRC3" value="true" />
+      <req_variable name="PRC2" value="true" />
+      <req_variable name="PRC1" value="true" />
+      <req_variable name="PRC0" value="true" />
+      <req_variable name="PER" value="true" />
+      <req_variable name="ASP" value="true" />
+      <req_variable name="VOX" value="true" />
+      <req_variable name="MOD" value="true" />
+      <req_variable name="GEN" value="true" />
+      <req_variable name="NUM" value="true" />
+      <req_variable name="STT" value="true" />
+      <req_variable name="CAS" value="true" />
+      <req_variable name="ENC0" value="true" />
+      <req_variable name="BPC" value="true" />
+      <req_variable name="NER" value="true" />
+    </requested_output>
+    <tokenization>
+      <scheme alias="MyD3">
+        <scheme_override alias="MyD3"
+                         form_delimiter="\u00B7"
+                         include_non_arabic="true"
+                         mark_no_analysis="false"
+                         token_delimiter=" "
+                         tokenize_from_BW="false">
+          <split_term_spec term="PRC3"/>
+          <split_term_spec term="PRC2"/>
+          <split_term_spec term="PART"/>
+          <split_term_spec term="PRC0"/>
+          <split_term_spec term="REST"/>
+          <split_term_spec term="ENC0"/>
+          <token_form_spec enclitic_mark="+" proclitic_mark="+" token_form_base="WORD" transliteration="UTF8">
+            <normalization type="ALEF"/>
+            <normalization type="YAA"/>
+            <normalization type="DIAC"/>
+            <normalization type="LEFTPAREN"/>
+            <normalization type="RIGHTPAREN"/>
+          </token_form_spec>
+        </scheme_override>
+      </scheme>
+    </tokenization>
+  </madamira_configuration>
+  <in_doc id="ExampleDocument">
+"""
+ xml_seg_start = """<in_seg id="SENT">\n"""
+ xml_seg_end = """\n</in_seg>\n"""
+ xml_suffix = """</in_doc>
+</madamira_input>
+"""
+ config_prefix="{urn:edu.columbia.ccls.madamira.configuration:0.1}"
+
+ def __enter__(self):
+ self.start_server()
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.stop_server()
+
+ def start_server(self):
+ cwd = os.getcwd()
+ os.chdir(os.path.dirname(arapy.__file__)+"/resources/MADAMIRA-release-20170403-2.1/")
+
+ self.pid = subprocess.Popen(['java',
+ '-Xmx2500m',
+ '-Xms2500m',
+ '-XX:NewRatio=3',
+ '-jar',
+ 'MADAMIRA-release-20170403-2.1.jar',
+ '-s',
+ '-msaonly'])
+
+ print("Waiting for madamira to initialize.")
+ time.sleep(10)
+
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ result = sock.connect_ex(('localhost',MADAPORT))
+ while(result != 0):
+ sock.close()
+ time.sleep(1)
+
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ result = sock.connect_ex(('localhost',MADAPORT))
+
+ os.chdir(cwd)
+
+ self.session = requests.Session()
+
+ def stop_server(self):
+ self.session.close()
+ self.pid.kill()
+ print("Shut down MADAMIRA.")
+
+ def process(self, text):
+ """ Returns madamira xml output for a string input """
+
+ query = io.StringIO()
+ query.write(Madamira.xml_prefix)
+
+ for sentence in text:
+ query.write(Madamira.xml_seg_start)
+ query.write(sentence)
+ query.write(Madamira.xml_seg_end)
+
+ query.write(Madamira.xml_suffix)
+ query.seek(0)
+
+ response = self.session.post(Madamira.url, headers=Madamira.headers, data=query.read().encode('utf8'))
+
+ response.encoding = "utf8"
+
+ return MadamiraOutput(response.text)
+
+ def process_sentence(self, text):
+ """ Returns madamira xml output for a word string input """
+
+ query = io.StringIO()
+ query.write(Madamira.xml_prefix)
+
+ query.write(Madamira.xml_seg_start)
+ query.write(text)
+ query.write(Madamira.xml_seg_end)
+
+ query.write(Madamira.xml_suffix)
+ query.seek(0)
+
+ response = requests.post(Madamira.url, headers=Madamira.headers, data=query.read().encode('utf8'))
+
+ response.encoding = "utf8"
+
+ return [word for doc in MadamiraOutput(response.text).docs() for sent in doc.sentences() for word in sent.words()]
+
+class MadamiraOutput:
+ def __init__(self, xmltext):
+ self.xml = xmltext.encode("utf8")
+
+ def docs(self):
+ # madamira config prefix
+ mp=Madamira.config_prefix
+
+ # get an iterable TODO use raw string?
+ # wrapper = codecs.StreamReader(io.StringIO(self.xml), "utf8")
+ context = iterparse(io.BytesIO(self.xml), events=("start", "end"))
+
+ # turn it into an iterator
+ context = iter(context)
+
+ # get the root element
+ event, root = context.__next__()
+
+ for event, elem in context:
+
+ # parse each doc
+ if event == 'end' and elem.tag == mp+'out_doc':
+
+ yield MadamiraDoc(elem)
+
+ # don't keep the doc in memory
+ root.clear()#find(mp+'madamira_output').clear()
+
+class MadamiraDoc:
+ def __init__(self, elem):
+ self.elem = elem
+
+ def sentences(self):
+ mp = Madamira.config_prefix
+
+ for sentence in self.elem.iter(mp+'out_seg'):
+
+ yield MadamiraSentence(sentence)
+
+class MadamiraSentence:
+ def __init__(self, sentence):
+ self.sentence = sentence
+
+ def words(self):
+ mp = Madamira.config_prefix
+
+ for word in self.sentence.find(mp+'word_info').iter(mp+'word'):
+
+ yield MadamiraWord(word)
+
+ def chunks(self):
+ mp = Madamira.config_prefix
+
+ # should just be one segment_info per out_seg
+ # parse each chunk in segment, looking for noun phrases
+ for chunk in self.sentence.find(mp+'segment_info').find(mp+'bpc').iter(mp+'chunk'):
+ yield MadamiraChunk(chunk)
+
+class MadamiraWord:
+ """
+ Use the get_attribute function to return a selected attribute
+
+ Example XML Word:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ """
+ svm_predictions_attribute_names_ = ['diac', 'lemma', 'pos', 'prc3', 'prc2', 'prc1', 'prc0', 'per', 'asp', 'vox', 'mod', 'gen', 'num', 'stt', 'cas', 'enc0']
+
+ def __init__(self, word):
+ self.word = word
+
+
+
+ def lemma(self):
+ mp = Madamira.config_prefix
+
+ # grab the lemma data
+ lemma = self.word.find(mp+'svm_prediction').find(mp+'morph_feature_set').get('lemma')
+
+ # strip down to the arabic script
+ if not lemma:
+ return ""
+ elif len(lemma) == 0:
+ return ""
+ else:
+ norm_lemma = norm.normalize_charset(lemma).strip()
+ if len(norm_lemma) == 0:
+ return lemma
+ else:
+ return norm_lemma
+
+ def get_orig_word(self):
+ mp = Madamira.config_prefix
+
+ return self.word.attrib['word']
+
+
+ def pos(self):
+ mp = Madamira.config_prefix
+
+ return self.word.find(mp+'svm_prediction').find(mp+'morph_feature_set').get('pos')
+
+ def pos_gender(self):
+ mp = Madamira.config_prefix
+
+ return self.word.find(mp+'svm_prediction').find(mp+'morph_feature_set').get('gen')
+
+ def get_attribute(self, attribute=''):
+ mp = Madamira.config_prefix
+ return self.word.find(mp+'svm_prediction').find(mp+'morph_feature_set').get(attribute)
+
+ def tokens(self):
+ mp = Madamira.config_prefix
+
+ # grab the token forms for the MyD3 scheme
+ tokens = []
+ for token in self.word.find(mp+"tokenized[@scheme='MyD3']").iter(mp+'tok'):
+ tokens.append(token.get('form0'))
+
+ return tokens
+
+
+class MadamiraChunk:
+ def __init__(self, chunk):
+ self.chunk = chunk
+
+ def type(self):
+ return self.chunk.get('type')
+
+ # def tokens():
+ # # combine tokens into phrase
+ # for tok in chunk.iter(mp+'tok'):
+ # segment = tok.get('form0')
+
+ # if segment[-1] == '+':
+ # noun_phrase += segment[:-1]
+ # elif segment[0] == '+':
+ # # if it is a suffix prep, attach it to prev token
+ # if len(noun_phrase) > 0:
+ # noun_phrase = noun_phrase[:-1] + segment[1:]
+ # else:
+ # noun_phrase = segment[1:]
+ # else:
+ # noun_phrase += segment + '_'
+
+ # # drop the last underscore and add to the np sentence
+ # if noun_phrase[-1] == '_':
+ # noun_phrase = noun_phrase[:-1]
+ # sent += noun_phrase+' '
+
+
+
+def transform_sentence_file(sentence_file,
+ lemmaout="lemmas.txt",
+ tokenout="token.txt",
+ posout="pos.txt",
+ lemmas=True,
+ pos=False,
+ tokens=False):
+ """returns filenames of lemmas and pos files"""
+ with Madamira() as m:
+
+ # open output files
+ lemma_out = None
+ lemma_buff = None
+ if lemmas:
+ lemma_buff = io.StringIO()
+ lemma_file = lemmaout
+ lemma_out = open(lemma_file, 'w', encoding='utf8')
+
+ pos_out = None
+ pos_buff = None
+ if pos:
+ pos_buff = io.StringIO()
+ pos_file = posout
+ pos_out = open(pos_file, 'w', encoding='utf8')
+
+ token_out = None
+ token_buff = None
+ if tokens:
+ token_buff = io.StringIO()
+ token_file = tokenout
+ token_out = open(token_file, 'w', encoding='utf8')
+
+ # buffer the sentences one at a time rather than reading the whole file into a list
+ # sentence_list = open(sentence_file).read().splitlines()
+ with open(sentence_file, 'r') as sentences:
+ for sentence in sentences:
+
+ out = m.process([sentence])
+
+ for doc in out.docs():
+ for sent in doc.sentences():
+
+ for word in sent.words():
+ if lemmas:
+ lemma_buff.write(word.lemma())
+ lemma_buff.write(" ")
+ if pos:
+ pos_buff.write(word.pos())
+ pos_buff.write(" ")
+ if tokens:
+ for token in word.tokens():
+ token_buff.write(token)
+ token_buff.write(" ")
+
+ # for chunk in sent.chunks()
+ # if tokens:
+ # token_list = word.tokens()
+ # for token in token_list:
+ # token_buff.write(token) # TODO
+ # token_buff.write(" ")
+
+ if lemmas:
+
+ lemma_buff.seek(0)
+ lemma_out.write(lemma_buff.read().rstrip())
+ lemma_out.write('\n')
+ lemma_buff.close()
+ lemma_buff = io.StringIO()
+
+ if pos:
+ pos_buff.seek(0)
+ pos_out.write(pos_buff.read().rstrip())
+ pos_out.write('\n')
+ pos_buff.close()
+ pos_buff = io.StringIO()
+
+ if tokens:
+ token_buff.seek(0)
+ token_out.write(token_buff.read().rstrip())
+ token_out.write('\n')
+ token_buff.close()
+ token_buff = io.StringIO()
+
+ if lemmas:
+ lemma_buff.close()
+ lemma_out.close()
+ if pos:
+ pos_buff.close()
+ pos_out.close()
+ if tokens:
+ token_buff.close()
+ token_out.close()
+
+
+ return [lemmaout if lemmas else None, posout if pos else None, tokenout if tokens else None]
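+
+# Example usage (a sketch; the file names are only illustrative):
+#
+# lemma_file, pos_file, token_file = transform_sentence_file(
+#     "sentences.txt", lemmaout="lemmas.txt", posout="pos.txt",
+#     lemmas=True, pos=True, tokens=False)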
+
+
+
+
+# def save_lemmatization(xml_mada_fn, out_fn):
+# """
+# Saves a lemmatization from a madamira xml output file
+# """
+
+# # open the output file
+# outfile = codecs.open(out_fn, 'w', "utf-8")
+
+# # madamira config prefix
+# mp='{urn:edu.columbia.ccls.madamira.configuration:0.1}'
+
+# # get an iterable
+# context = iterparse(xml_mada_fn, events=("start", "end"))
+
+# # turn it into an iterator
+# context = iter(context)
+
+# # get the root element
+# event, root = context.next()
+
+# for event, elem in context:
+
+# # parse each sentence
+# if event == 'end' and elem.tag == mp+'out_seg':
+
+# # construct the sentence, then write once per sentence
+# sent = ''
+
+# # should just be one word_info per out_seg
+# # parse each word in word_info
+# for word in elem.find(mp+'word_info').iter(mp+'word'):
+
+# # grab the lemma data
+# lemma = word.find(mp+'svm_prediction').find(mp+'morph_feature_set').get('lemma')
+
+# # strip all but arabic script TODO
+# lemma = lemma.split('_')[0]
+
+# # normalize the script
+# lemma = norm.normalize(lemma)
+
+# sent += lemma
+# sent += ' '
+
+# # write the sentence out (without last space)
+# outfile.write(sent[:-1]+'\n')
+
+# # don't keep the sentence in memory
+# root.find(mp+'out_doc').clear()
+
+# elif event == 'end' and elem.tag == mp+'out_doc':
+# outfile.write('#ENDDOC#\n')
+
+# def save_noun_phrases(xml_mada_fn, out_fn):
+# """
+# Saves noun phrases from a madamira xml output file
+# """
+
+# # open the output file
+# outfile = codecs.open(out_fn, 'w', "utf-8")
+
+# # madamira config prefix
+# mp='{urn:edu.columbia.ccls.madamira.configuration:0.1}'
+
+# # get an iterable
+# context = iterparse(xml_mada_fn, events=("start", "end"))
+
+# # turn it into an iterator
+# context = iter(context)
+
+# # get the root element
+# event, root = context.next()
+
+# for event, elem in context:
+
+# # parse each sentence
+# if event == 'end' and elem.tag == mp+'out_seg':
+
+# # construct the sentence, then write once per sentence
+# sent = ''
+
+# # should just be one segment_info per out_seg
+# # parse each chunk in segment, looking for noun phrases
+# for chunk in elem.find(mp+'segment_info').find(mp+'bpc').iter(mp+'chunk'):
+
+# # identify noun phrases
+# if chunk.get('type') == 'NP':
+
+# # we build noun phrases with underscores between words
+# noun_phrase = ''
+
+# # combine tokens into phrase
+# for tok in chunk.iter(mp+'tok'):
+# segment = tok.get('form0')
+
+# if segment[-1] == '+':
+# noun_phrase += segment[:-1]
+# elif segment[0] == '+':
+# # if it is a suffix prep, attach it to prev token
+# if len(noun_phrase) > 0:
+# noun_phrase = noun_phrase[:-1] + segment[1:]
+# else:
+# noun_phrase = segment[1:]
+# else:
+# noun_phrase += segment + '_'
+
+# # drop the last underscore and add to the np sentence
+# if noun_phrase[-1] == '_':
+# noun_phrase = noun_phrase[:-1]
+# sent += noun_phrase+' '
+
+# # write the noun phrase sentence out (without last space)
+# outfile.write(sent[:-1]+'\n')
+
+# # don't keep the segment in memory
+# root.find(mp+'out_doc').clear()
+
+# elif event == 'end' and elem.tag == mp+'out_doc':
+# outfile.write('#ENDDOC#\n')
+
+# def save_noun_phrase_graph(xml_mada_fn, out_fn, window = 5):
+# """
+# TODO figure out how expensive this is, implement in scala/spark next
+# Saves a noun phrase graph from a madamira xml output file
+# """
+
+# # edges (nodeid, nodeid, dist)
+# edges = []
+
+# # vertices (long hash(str) : str (noun phrase))
+# vertices = {}
+
+# # mentions list
+# mentions_list = []
+
+# # distance to add edges at
+# distance = 10
+
+# # open the output file
+# outfile = codecs.open(out_fn, 'w', "utf-8")
+
+# # madamira config prefix
+# mp='{urn:edu.columbia.ccls.madamira.configuration:0.1}'
+
+# # get an iterable
+# context = iterparse(xml_mada_fn, events=("start", "end"))
+
+# # turn it into an iterator
+# context = iter(context)
+
+# # get the root element
+# event, root = context.next()
+
+# # document token tracking
+# tokens_so_far = 0
+
+# for event, elem in context:
+
+# # parse each sentence
+# if event == 'end' and elem.tag == mp+'out_seg':
+
+# # should just be one segment_info per out_seg
+# # parse each chunk in segment, looking for noun phrases
+# for chunk in elem.find(mp+'segment_info').find(mp+'bpc').iter(mp+'chunk'):
+
+# # identify noun phrases
+# if chunk.get('type') == 'NP':
+
+# # we build noun phrases with underscores between words
+# noun_phrase = ''
+
+# # noun phrase starts on next token
+# noun_phrase_start = tokens_so_far + 1
+
+# # combine tokens into phrase
+# for tok in chunk.iter(mp+'tok'):
+
+# tokens_so_far += 1
+
+# segment = tok.get('form0')
+
+# # builds phrase
+# if segment[-1] == '+':
+# noun_phrase += segment[:-1]
+# elif segment[0] == '+':
+# # if it is a suffix prep, attach it to prev token
+# if len(noun_phrase) > 0:
+# noun_phrase = noun_phrase[:-1] + segment[1:]
+# else:
+# noun_phrase = segment[1:]
+# else:
+# noun_phrase += segment + '_'
+
+# # drop the last underscore and add to the np sentence
+# noun_phrase = noun_phrase.strip('_')
+
+# # noun phrase ended on last token
+# noun_phrase_end = tokens_so_far
+
+# np_hash = hash(noun_phrase)
+# vertices[np_hash] = noun_phrase
+
+# mentions_list.append([np_hash, noun_phrase_start, noun_phrase_end])
+
+# else:
+# for tok in chunk.iter(mp+'tok'):
+# tokens_so_far += 1
+
+# # don't keep the segment in memory
+# root.find(mp+'out_doc').clear()
+
+# elif event == 'end' and elem.tag == mp+'out_doc':
+# # add edges from last document
+# for start in range(0, len(mentions_list) - 10):
+# end = start + 10
+# head = start
+# tail = start + 1
+# in_range = True
+# while in_range:
+# if abs(mentions_list[tail][1] - mentions_list[head][2]) <= distance:
+# if (mentions_list[tail][1]-mentions_list[head][2] > 0):
+# dist = mentions_list[tail][1]-mentions_list[head][2]
+# else:
+# dist = 0
+
+# edges.append([mentions_list[head][0], mentions_list[tail][0], dist])
+# edges.append([mentions_list[tail][0], mentions_list[head][0], dist])
+
+# tail += 1
+
+# else:
+# in_range = False
+
+# np.savetxt("edges.csv", np.array(edges), delimiter=",", fmt='%i')
+# writer = csv.writer(open('vertices.csv','wb'))
+# for key, value in vertices.items():
+# writer.writerow([key, value.encode('utf-8')])
+
+# def raw_save_lemmatization(raw_mada_fn, out_fn):
+# """
+# Saves a lemmatization from a madamira raw output file
+# """
+# mada = codecs.open(mada_fn, 'r', "utf-8")
+
+# p = re.compile(r'lex:[^\s_]+', re.UNICODE)
+# start_of_line = True
+
+# with codecs.open(out_fn, 'w', 'utf-8') as outfile:
+# for line in mada:
+
+# if line == 'SENTENCE BREAK\n':
+# outfile.write('\n')
+# start_of_line = True
+# elif line.startswith('*'):
+# m = p.findall(line)
+# if m:
+# print(m)
+# if not start_of_line:
+# outfile.write(' ')
+# outfile.write(norm.normalize(m[0][4:]))
+# start_of_line = False
+
+# def raw_save_noun_phrase_graph(raw_bpcbio_fn, out_fn, window = 5):
+# """
+# TODO figure out how expensive this is, implement in scala/spark next
+# Saves a noun phrase graph from a madamira raw output file
+# """
+
+# # edges (nodeid, nodeid, dist)
+# edges = []
+
+# # vertices (long hash(str) : str (noun phrase))
+# vertices = {}
+
+# # mentions list
+# mentions_list = []
+
+# # distance to add edges at
+# distance = 10
+
+# # open the output file
+# outfile = codecs.open(out_fn, 'w', "utf-8")
+
+# # get an iterable
+# context = codecs.open(raw_bpcbio_fn, 'r', 'utf-8')
+
+# # document token tracking
+# tokens_so_far = 0
+
+# # keep track of NP construction
+# noun_phrase = ''
+# noun_phrase_start = 0
+# noun_phrase_end = 0
+
+# # count sentences if not broken into docs
+# sentence_idx = 1
+
+# # count docs
+# doc_count = 1
+
+# with codecs.open('raw_edges.csv', 'a') as edge_file:
+# with codecs.open(raw_bpcbio_fn, 'r', 'utf-8') as context:
+# while True:
+# line = context.readline()
+
+# if not line or sentence_idx % 100 == 0:
+
+# sentence_idx = 1
+# print('Doc: ', doc_count, 'Vertices: ', len(vertices))
+# doc_count += 1
+
+# if noun_phrase != '':
+# noun_phrase = noun_phrase.strip('_')
+# noun_phrase_end = tokens_so_far - 1
+# np_hash = hash(noun_phrase)
+# vertices[np_hash] = noun_phrase
+# mentions_list.append([np_hash, noun_phrase_start, noun_phrase_end])
+# noun_phrase = ''
+
+# # edge_count = 0
+
+# # add edges from last document
+# for start in range(0, len(mentions_list) - 10):
+# # end = start + 10
+# head = start
+# tail = start + 1
+# in_range = True
+# while in_range and tail < len(mentions_list):
+# if abs(mentions_list[tail][1] - mentions_list[head][2]) <= distance:
+# if (mentions_list[tail][1]-mentions_list[head][2] > 0):
+# dist = mentions_list[tail][1]-mentions_list[head][2]
+# else:
+# dist = 0
+
+# edges.append([mentions_list[head][0], mentions_list[tail][0], dist])
+# edges.append([mentions_list[tail][0], mentions_list[head][0], dist])
+
+# # edge_count += 2
+# tail += 1
+
+# else:
+# in_range = False
+
+# # print('Adding: ', edge_count, ' edges.')
+
+# np.savetxt(edge_file, np.array(edges), delimiter=",", fmt='%i')
+
+
+# edges = []
+# mentions_list = []
+
+# if not line:
+# writer = csv.writer(open('raw_vertices.csv','wb'))
+# for key, value in vertices.items():
+# writer.writerow([key, value.encode('utf-8')])
+# break
+
+
+# if line.strip() == "":
+
+# # end of sentence
+# sentence_idx += 1
+
+# else:
+
+# tokens_so_far += 1
+
+# text, bpc_type = line.strip().split("\t")
+
+# if (bpc_type == 'B-NP' or bpc_type != 'I-NP') and noun_phrase != '':
+
+# noun_phrase = noun_phrase.strip('_')
+# noun_phrase_end = tokens_so_far - 1
+# np_hash = hash(noun_phrase)
+# vertices[np_hash] = noun_phrase
+# mentions_list.append([np_hash, noun_phrase_start, noun_phrase_end])
+# noun_phrase = ''
+
+# if bpc_type == 'B-NP':
+
+# # noun phrase starts on this token
+# noun_phrase_start = tokens_so_far
+
+# if text[-1] == '+':
+# noun_phrase += text[:-1]
+# elif text[0] == '+':
+# noun_phrase = text[1:]
+# else:
+# noun_phrase += text + '_'
+
+# elif bpc_type == 'I-NP':
+
+# if text[-1] == '+':
+# noun_phrase += text[:-1]
+# elif text[0] == '+':
+# # if it is a suffix prep, attach it to prev token
+# if len(noun_phrase) > 0:
+# noun_phrase = noun_phrase[:-1] + text[1:]
+# else:
+# noun_phrase = text[1:]
+# else:
+# noun_phrase += text + '_'
\ No newline at end of file
diff --git a/Madamira-Arapy/arapy/normalization.py b/Madamira-Arapy/arapy/normalization.py
new file mode 100644
index 0000000..544c05a
--- /dev/null
+++ b/Madamira-Arapy/arapy/normalization.py
@@ -0,0 +1,222 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+### Purpose: Arabic script normalization tools
+
+from __future__ import absolute_import
+from __future__ import print_function
+
+import re
+import codecs
+
+# regex for arabic chars
+inv_arabic_charset = re.compile(r'[^\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff\u0030-\u0039\n\.]+', re.UNICODE)
+
+def normalize(text, ar_only=True, digits=False, alif=True, hamza=True, yaa=True, tashkil=True):
+ """
+ Normalizes arabic text
+ Removes non-arabic chars by default
+ Changes all numerals to # if digits is true, default false
+ Normalizes alif, hamza, and yaa by default
+ Removes supplementary diacritics
+ """
+ if ar_only:
+ text = normalize_charset(text)
+
+ if digits:
+ text = normalize_digits(text)
+
+
+ if alif:
+ text = normalize_alif(text)
+ if hamza:
+ text = normalize_hamza(text)
+ if yaa:
+ text = normalize_yaa(text)
+
+ if tashkil:
+ text = remove_tashkil(text)
+
+ return text
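+
+# Example (illustrative): normalize(u'عَرَبِيّ') strips the short vowels and the
+# shaddah and returns u'عربي'; alif, hamza, and yaa variants are likewise folded
+# to their normalized forms.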
+
+def normalize_sentence_file(sentence_file, outfile_path="normal.txt", ar_only=True, digits=True, alif=True, hamza=True, yaa=True, tashkil=True):
+ """
+ Normalizes a file of sentences and saves the result to outfile_path
+ returns the outfile name
+ """
+
+ # outfile_path = (sentence_file.split('.')[0]+
+ # "_ar_only"+str(ar_only)+
+ # "_digits"+str(digits)+
+ # "_alif"+str(alif)+
+ # "_hamza"+str(hamza)+
+ # "_yaa"+str(yaa)+
+ # "_tashkil"+str(tashkil)+
+ # ".txt")
+
+ with open(sentence_file, 'r') as infile:
+ with open(outfile_path, 'w') as outfile:
+ for text in infile:
+ text = text.decode('utf8')
+
+ text = normalize(text, ar_only=ar_only, digits=digits, alif=alif, hamza=hamza, yaa=yaa, tashkil=tashkil)
+
+ if text:
+ outfile.write(text.encode('utf8'))
+
+ return outfile_path
+
+def remove_tashkil(text):
+ """ removes set of arabic supplementary diacritics """
+ text = remove_harakat(text)
+ text = remove_tanwin(text)
+ text = remove_shaddah(text)
+ text = remove_kashida(text)
+ return text
+
+#####################
+### Normalization ###
+#####################
+
+def normalize_charset(text):
+ return inv_arabic_charset.sub(' ', text)
+
+def normalize_digits(text):
+ """ replaces all forms of numbers with # """
+ return re.sub(r'[0123456789٠١٢٣٤٥٦٧٨٩]', r'#', text)
+
+def normalize_alif(text):
+ """ replaces all forms of alif with ا """
+ return re.sub(r'[إأٱآا]', r'ا', text)
+
+def normalize_yaa(text):
+ """ replaces ى with ي """
+ return re.sub(r'ى', r'ي', text)
+
+def normalize_hamza(text, normalize_alif = False):
+ """
+ replaces hamza on seats with ء
+ does not include alif seats by default
+ set normalize_alif=True to replace إأ with hamza
+ """
+ if normalize_alif:
+ return re.sub(r'[ؤئإأ]', r'ء', text)
+ else:
+ return re.sub(r'[ؤئ]', r'ء', text)
+
+#######################
+### Tashkil removal ###
+#######################
+
+def remove_harakat(text):
+ """
+ removes short vowel marks
+ does not normalize alif forms
+ does not remove tanwin (ًٌٍ) (use remove_tanwin)
+ """
+ return re.sub(r'[َُِْ]', r'', text)
+
+def remove_tanwin(text):
+ """
+ removes tanwin vowel marks
+ does not normalize alif forms
+ """
+ return re.sub(r'[ًٌٍ]', r'', text)
+
+def remove_shaddah(text):
+ """
+ removes the shaddah mark (tashdid)
+ """
+ return re.sub(r'[ّ]', r'', text)
+
+def remove_kashida(text):
+ """
+ removes the kashida elongation mark (tatwil)
+ """
+ return re.sub(r'[ـ]', r'', text)
+
+
+##################################
+### Buckwalter transliteration ###
+##################################
+
+def unicode_to_bw(string, reverse=0):
+ """
+ Given a Unicode string, transliterate into Buckwalter.
+ To go from Buckwalter back to Unicode, set reverse=1.
+ Partially taken from https://github.com/andyroberts/buckwalter2unicode
+ """
+
+ buck2uni = {"'": u"\u0621", # hamza-on-the-line
+ "|": u"\u0622", # madda
+ ">": u"\u0623", # hamza-on-'alif
+ "&": u"\u0624", # hamza-on-waaw
+ "<": u"\u0625", # hamza-under-'alif
+ "}": u"\u0626", # hamza-on-yaa'
+ "A": u"\u0627", # bare 'alif
+ "b": u"\u0628", # baa'
+ "p": u"\u0629", # taa' marbuuTa
+ "t": u"\u062A", # taa'
+ "v": u"\u062B", # thaa'
+ "j": u"\u062C", # jiim
+ "H": u"\u062D", # Haa'
+ "x": u"\u062E", # khaa'
+ "d": u"\u062F", # daal
+ "*": u"\u0630", # dhaal
+ "r": u"\u0631", # raa'
+ "z": u"\u0632", # zaay
+ "s": u"\u0633", # siin
+ "$": u"\u0634", # shiin
+ "S": u"\u0635", # Saad
+ "D": u"\u0636", # Daad
+ "T": u"\u0637", # Taa'
+ "Z": u"\u0638", # Zaa' (DHaa')
+ "E": u"\u0639", # cayn
+ "g": u"\u063A", # ghayn
+ "_": u"\u0640", # taTwiil
+ "f": u"\u0641", # faa'
+ "q": u"\u0642", # qaaf
+ "k": u"\u0643", # kaaf
+ "l": u"\u0644", # laam
+ "m": u"\u0645", # miim
+ "n": u"\u0646", # nuun
+ "h": u"\u0647", # haa'
+ "w": u"\u0648", # waaw
+ "Y": u"\u0649", # 'alif maqSuura
+ "y": u"\u064A", # yaa'
+ "F": u"\u064B", # fatHatayn
+ "N": u"\u064C", # Dammatayn
+ "K": u"\u064D", # kasratayn
+ "a": u"\u064E", # fatHa
+ "u": u"\u064F", # Damma
+ "i": u"\u0650", # kasra
+ "~": u"\u0651", # shaddah
+ "o": u"\u0652", # sukuun
+ "`": u"\u0670", # dagger 'alif
+ "{": u"\u0671", # waSla
+ }
+
+ # For a reverse transliteration (Unicode -> Buckwalter), a dictionary
+ # which is the reverse of the above buck2uni is essential.
+
+ uni2buck = {}
+
+ # Iterate through all the items in the buck2uni dict.
+ for (key, value) in buck2uni.items():
+ # The value from buck2uni becomes a key in uni2buck, and vice
+ # versa for the keys.
+ uni2buck[value] = key
+
+ if not reverse:
+ for k,v in buck2uni.items():
+ string = string.replace(v,k)
+
+ else:
+ for k,v in buck2uni.items():
+ string = string.replace(k,v)
+
+ return string
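+
+# Example (illustrative): unicode_to_bw(u'كتاب') returns 'ktAb', and
+# unicode_to_bw('ktAb', reverse=1) maps it back to the Arabic script.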
+
+
diff --git a/Madamira-Arapy/arapy/thesaurus.py b/Madamira-Arapy/arapy/thesaurus.py
new file mode 100644
index 0000000..b1426db
--- /dev/null
+++ b/Madamira-Arapy/arapy/thesaurus.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+### Purpose: Arabic word thesaurus simulation tools
+
+from __future__ import absolute_import
+from __future__ import print_function
+
+import arapy.translate as trans
+import logging
+import sys
+import json
+import requests
+import urllib2
+
+
+API_KEY = '80901dbb851efc07b4bd747ba3ead0ae' # API key is available from here - http://words.bighugelabs.com/getkey.php
+URL_MASK = 'http://words.bighugelabs.com/api/2/{1}/{0}/json'
+RELATIONSHIP_ABBR = {'syn':'Synonyms','ant':'Antonyms','rel':'Related terms','sim':'Similar terms','usr':'User suggestions', None:'All'}
+
+def thesaurus(word, relation=None, ngram=0, ar=False, target_result_count=0):
+ """
+ Uses bighugelabs thesaurus API
+ requires the API key available http://words.bighugelabs.com/getkey.php
+
+ Translation is done with google translate in the translate module. Requires google api key
+
+ Takes in a word and retrieves a list of related words where
+ the relationship is given by one key from {'syn':'Synonyms','ant':'Antonyms','rel':'Related terms','sim':'Similar terms','usr':'User suggestions', None:'All'}
+ the words are filtered by ngram, 0 for all
+ if ar is true, the word is translated from Arabic to English before the lookup and the results are translated back to Arabic
+ target_result_count is the number of words to return with
+
+ returns a dictionary where keys are the requested relationships, and values are lists of ngrams matching that relationship
+ returns empty dictionary if the thesaurus didn't have any results
+ """
+
+ gs = None
+ if ar:
+ translations = trans.translate_list([word], 'en', 'ar')
+ if len(translations) > 0:
+ word = translations[0]
+ else:
+ logging.info("Couldn't translate word: "+str(word)+" to english")
+ return {}
+
+ if not word:
+ logging.info("Translated word is empty.")
+ return {}
+
+ # format and make the request
+ url = URL_MASK.format(urllib2.quote(word.encode('utf-8')), API_KEY)
+ result = requests.get(url)
+
+ if not result.text:
+ logging.info("Thesaurus had no info for word:"+word.encode('utf-8'))
+ return {}
+
+ json_result = json.loads(result.text)
+
+ # our relationship dictionary
+ words = {}
+ word_count = 0
+
+ # for each sense of the word
+ for pos in json_result:
+
+ # we want only the requested relations
+ for rel in json_result[pos]:
+ if relation == None or relation == rel:
+
+ # each word matching the relationship
+ for w in json_result[pos][rel]:
+
+ candidate = w
+
+ # we only want so many results
+ if target_result_count == 0 or word_count < target_result_count:
+
+ if ar == 1:
+ translations = trans.translate_list([candidate],'ar','en')
+ if len(translations) > 0:
+ candidate = translations[0]
+ else:
+ logging.info("Couldn't translate word: "+str(candidate)+" to arabic")
+ return {}
+
+ if ngram == 0 or len(candidate.split(" ")) == ngram:
+ if not rel in words:
+ words[rel] = []
+ words[rel].append(candidate)
+ word_count+=1
+
+ else:
+ # we have enough results
+ return words
+ return words
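+
+# Example usage (a sketch; requires valid bighugelabs and Google Translate API keys):
+#
+# thesaurus(u'سيارة', relation='syn', ngram=1, ar=True, target_result_count=10)
+# # -> a dict like {'syn': [...]} of Arabic unigram synonyms (results depend on the APIs)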
+
+
+
diff --git a/Madamira-Arapy/arapy/translate.py b/Madamira-Arapy/arapy/translate.py
new file mode 100644
index 0000000..85ca3b1
--- /dev/null
+++ b/Madamira-Arapy/arapy/translate.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+### Purpose: Arabic word translation tools for google translate api: https://cloud.google.com/translate/v2/pricing
+
+from __future__ import absolute_import
+from __future__ import print_function
+
+import logging
+import sys
+import json
+import requests
+import urllib2
+
+
+GOOGLE_API_KEY = 'AIzaSyAAScZ3-Ut-1sxn5gsSLzxzXJgzn3jGsN4'
+URL_MASK = 'https://www.googleapis.com/language/translate/v2?key={0}{1}&source={2}&target={3}'
+
+def translate_list(words, target='ar', source='en'):
+ """
+ Translates a word with the google translate api
+ Requires an API key from the google developers console
+ Target is the target language code, e.g. 'ar'
+ """
+
+ # format the words for the url
+ formatted_words=""
+ for word in words:
+ formatted_words += "&q=" + urllib2.quote(word)
+
+ # format the url for the get
+ url = URL_MASK.format(GOOGLE_API_KEY, formatted_words, source, target)
+ result = requests.get(url)
+
+ if not result.text:
+ logging.info("Google responded with no translations, check api key.")
+ return []
+
+ json_result = json.loads(result.text)
+
+ if not 'data' in json_result:
+ logging.info("Google result had no data element, check api key.")
+ return []
+
+ # parse the result
+ translations = []
+ for translation in json_result['data']['translations']:
+ trans_word = translation['translatedText']
+ logging.info("Translated "+str(word)+ " to "+trans_word.encode('utf-8'))
+ translations.append(trans_word)
+
+ return translations
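+
+# Example usage (a sketch; requires a valid Google Translate API key):
+#
+# translate_list([u'hello', u'world'], target='ar', source='en')
+# # -> a list of Arabic translations (actual output depends on the API)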
\ No newline at end of file
diff --git a/Madamira-Arapy/arapy/word2vec.py b/Madamira-Arapy/arapy/word2vec.py
new file mode 100644
index 0000000..f65898d
--- /dev/null
+++ b/Madamira-Arapy/arapy/word2vec.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+### Purpose: Arabic word embedding tools
+
+from __future__ import absolute_import
+from __future__ import print_function
+
+from gensim.models import Word2Vec
+from gensim.models.word2vec import LineSentence
+import logging
+import sys
+
+def train_embeddings(infile, outfile = "embedding.txt", sg=1, size=100, seed = 0, window=8, min_count=5,
+ sample=1e-4, hs=0, negative=25, iterations=15):
+ """
+ Saves the model to a file with the parameters in the name.
+ All of these functions work on any language of corpora
+ Uses gensim's training parameters:
+
+ Initialize the model from an iterable of `sentences`. Each sentence is a
+ list of words (unicode strings) that will be used for training.
+
+ The `sentences` iterable can be simply a list, but for larger corpora,
+ consider an iterable that streams the sentences directly from disk/network.
+ See :class:`BrownCorpus`, :class:`Text8Corpus` or :class:`LineSentence` in
+ this module for such examples.
+
+ If you don't supply `sentences`, the model is left uninitialized -- use if
+ you plan to initialize it in some other way.
+
+ `sg` defines the training algorithm. By default (`sg=1`), skip-gram is used. Otherwise, `cbow` is employed.
+
+ `size` is the dimensionality of the feature vectors.
+
+ `window` is the maximum distance between the current and predicted word within a sentence.
+
+ `alpha` is the initial learning rate (will linearly drop to zero as training progresses).
+
+ `seed` = for the random number generator. Initial vectors for each
+ word are seeded with a hash of the concatenation of word + str(seed).
+
+ `min_count` = ignore all words with total frequency lower than this.
+
+ `sample` = threshold for configuring which higher-frequency words are randomly downsampled;
+ default is 0 (off), useful value is 1e-5.
+
+ `workers` = use this many worker threads to train the model (=faster training with multicore machines).
+
+ `hs` = if 1 (default), hierarchical sampling will be used for model training (else set to 0).
+
+ `negative` = if > 0, negative sampling will be used, the int for negative
+ specifies how many "noise words" should be drawn (usually between 5-20).
+
+ `cbow_mean` = if 0 (default), use the sum of the context word vectors. If 1, use the mean.
+ Only applies when cbow is used.
+
+ `hashfxn` = hash function to use to randomly initialize weights, for increased
+ training reproducibility. Default is Python's rudimentary built in hash function.
+
+ `iter` = number of iterations (epochs) over the corpus.
+ """
+ # set up logging
+ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)
+
+ # files are iterated over with this object
+ # class MySentences(object):
+ # def __init__(self, fname):
+ # self.fname = fname
+ # self.errors = 0
+
+ # def __iter__(self):
+ # for line in open(self.fname):
+ # yield line.split()
+
+ # sentences = MySentences(infile)
+ sentences = LineSentence(infile)
+
+
+ model = Word2Vec(sentences,
+ sg = sg,
+ size = size,
+ window = window,
+ min_count = min_count,
+ hs = hs,
+ workers = 4,
+ sample = sample,
+ seed = seed,
+ negative = negative,
+ iter = iterations)
+
+ model.save_word2vec_format(outfile, binary = True)
+
+ return outfile
+
+def start_interactive_test_suite():
+ """
+ Loads a model, then allows interactive tests of:
+ ac - not interactive, rather loads an analogy file and outputs the results
+ one word most similar queries
+ two word similarity measures
+ three word analogy queries
+ four+ word odd one out queries
+ """
+
+ output_spacing = 25
+
+ modelfile = raw_input('Please enter the binary model file path: ')# (or gn/en/ar): ')
+ modelfile = modelfile.strip().strip('\'')
+
+ # if modelfile == 'gn':
+ # modelfile = '/Users/king96/Documents/Word2Vec/Models/google_news_vecs.bin'
+ # elif modelfile == 'ar':
+ # modelfile = '/Users/king96/Documents/Word2Vec/Models/ar_wiki_seg_vecs.bin'
+ # elif modelfile == 'en':
+ # modelfile = '/Users/king96/Documents/Word2Vec/Models/en_wiki_vecs.bin'
+
+ # set up logging
+ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
+ level=logging.INFO)
+
+ # load model
+ model = Word2Vec.load_word2vec_format(modelfile, binary=True)
+
+ while True:
+
+ # offer the menu
+ print('\n')
+ print('Type ac to run accuracy tests.')
+ print('Enter one word for neighbors, two for distance,')
+ print('three for analogy, more for matching, q to quit.')
+ words = raw_input('Word: ')
+
+ words = words.decode('UTF-8', 'replace')
+
+ if words == 'q':
+ break
+
+ if words == 'ac':
+ print('Please enter the questions file to test on:')
+
+ questions = raw_input('File: ').strip()
+
+ model.accuracy(questions, restrict_vocab = 30000, tries = 5)
+ continue
+
+ # the remaining options take 0 < n query words
+ words = words.split(' ')
+
+ if len(words) == 0:
+ continue
+
+ # top 10 words
+ elif len(words) == 1:
+ try:
+ candidates = model.most_similar(words[0], topn=10)
+ print('Candidates'.rjust(output_spacing), 'Cos Distance'.rjust(output_spacing))
+ for word in candidates:
+ print(str(word[0].encode('UTF-8','replace')).rjust(output_spacing),
+ str(word[1]).rjust(output_spacing))
+ except KeyError as ke:
+ print(ke.message.encode('utf-8','replace'))
+
+
+ # pair similarity
+ elif len(words) == 2:
+ try:
+ print('Similarity is : ' + str(model.similarity(words[0],words[1])))
+ except KeyError as ke:
+ print(ke.message.encode('utf-8','replace'))
+
+ # analogy
+ elif len(words) == 3:
+ try:
+ candidates = model.most_similar(positive=[words[2], words[1]],
+ negative = [words[0]],
+ topn=10)
+
+ print('Candidates'.rjust(output_spacing), 'Cos Distance'.rjust(output_spacing))
+ for word in candidates:
+ print(str(word[0].encode('UTF-8', 'replace')).rjust(output_spacing),
+ str(word[1]).rjust(output_spacing))
+ except KeyError as ke:
+ print(ke.message.encode('utf-8','replace'))
+
+ # odd one out
+ else:
+ try:
+ print('Odd one out: ' + str(model.doesnt_match(words).encode('utf-8', 'replace')))
+ except KeyError as ke:
+ print(ke.message.encode('utf-8','replace'))
+
+def start_query_expander():
+ modelfile = raw_input('Please enter the binary model file path: ')
+ modelfile = modelfile.strip()
+
+ # if modelfile == 'gn':
+ # modelfile = '/Users/king96/Documents/Word2Vec/Models/google_news_vecs.bin'
+
+ # set up logging
+ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
+ level=logging.INFO)
+
+ # load model
+ model = Word2Vec.load_word2vec_format(modelfile, binary=True)
+
+ while True:
+
+ words = raw_input('\nEnter words to expand, q to quit: ')
+
+ words = words.decode('UTF-8', 'replace')
+
+ if words == 'q':
+ break
+
+ words = words.split(' ')
+
+ if len(words) == 0:
+ continue
+
+ # top 10 words
+ else:
+ expansion = set()
+
+ for word in words:
+ try:
+ expansion = expansion | set([x[0] for x in model.most_similar(word, topn=10)])
+ except KeyError as ke:
+ print(ke.message.encode('utf-8','replace'))
+
+ print('Expansion')
+ for word in expansion:
+ print(str(word.encode('UTF-8','replace')))
\ No newline at end of file
diff --git a/Madamira-Arapy/test_madamira.py b/Madamira-Arapy/test_madamira.py
new file mode 100644
index 0000000..8981ed3
--- /dev/null
+++ b/Madamira-Arapy/test_madamira.py
@@ -0,0 +1,17 @@
+#%%
+from arapy.madamira import Madamira
+
+#%%
+text = "ما هي صفات السبعين ألفا الذين يدخلون الجنة بغير حساب"
+
+with Madamira() as m:
+ out = m.process([text])
+
+# %%
+for doc in out.docs():
+ for sent in doc.sentences():
+ for word in sent.words():
+ print(word.get_orig_word(),": ", word.pos(),"--",word.get_attribute('gen'),"--",word.get_attribute('per'))
+
+
+# %%
diff --git a/README.md b/README.md
index 5d67b67..4b65523 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,6 @@
# misc
misc code and stuff
+
+`pdf_renamer`: automatically extracts the title from a PDF file (with a special case for files from the ACL Anthology)
+
+`Madamira-Arapy`: forked from https://github.com/jordanking/arapy; adds compatibility with Python 3.6+ and some new functions that return morphological attributes (only the MADAMIRA files were updated)
diff --git a/pdf_renamer/monitor.ps1 b/pdf_renamer/monitor.ps1
index 07993a8..1ff965c 100644
--- a/pdf_renamer/monitor.ps1
+++ b/pdf_renamer/monitor.ps1
@@ -1,7 +1,7 @@
### SET FOLDER TO WATCH + FILES TO WATCH + SUBFOLDERS YES/NO
$watcher = New-Object System.IO.FileSystemWatcher
$watcher.Path = "C:\Users\WISSAM-PC\Downloads\Documents\"
-$watcher.Filter = "*.*"
+$watcher.Filter = "*.pdf"
$watcher.IncludeSubdirectories = $true
$watcher.EnableRaisingEvents = $true