Commit
1 parent adfad82, commit 3f0c5bc
Showing 15 changed files with 1,573 additions and 1 deletion.
@@ -127,3 +127,5 @@ dmypy.json

# Pyre type checker
.pyre/

Madamira-Arapy/arapy/resources/
@@ -0,0 +1,7 @@
# Arapy
Arabic text processing tools for python - A work in progress.

# Dependencies
gensim for word2vec: pip install gensim
goslate for translation: pip install goslate
madamira package for nlp processing: http://nlp.ldeo.columbia.edu/madamira/
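The dependency lines above double as install instructions; below is a minimal smoke-test sketch for the two pip-installable packages, assuming gensim >= 4.0 and a working goslate install (the toy corpus and strings are illustrative, not part of the repository).

# Smoke test for the pip-installable dependencies listed above.
# Assumes gensim >= 4.0 (vector_size keyword) and that goslate can reach its backend.
import goslate
from gensim.models import Word2Vec

# goslate: translate an English string into Arabic ('ar' language code)
gs = goslate.Goslate()
print(gs.translate('text processing', 'ar'))

# gensim: train a toy word2vec model on a tiny in-memory corpus
toy_corpus = [['arabic', 'text', 'processing'], ['word', 'vectors', 'from', 'text']]
model = Word2Vec(sentences=toy_corpus, vector_size=10, min_count=1)
print(model.wv['text'])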
Empty file.
@@ -0,0 +1,4 @@
### Arapy module!

from __future__ import absolute_import
from __future__ import print_function
@@ -0,0 +1,63 @@
#!/usr/bin/env python
# coding: utf-8

### Purpose: Tools to parse arwiki dumps

from __future__ import absolute_import
from __future__ import print_function

import arapy.normalization as norm
import re
import sys
import codecs
import xml.etree.cElementTree as etree

def parse_arwiki_dump(dump_in, dump_out, split_at_punc=False, remove_non_arabic=False):
    """
    Reads in an unzipped arwiki dump.
    Saves the text of the articles in a txt file with one sentence per line.
    returns the name of the output file
    """
    # text tag that wiki uses to identify text content blocks
    text_tag = '{http://www.mediawiki.org/xml/export-0.10/}text'
    junkPattern = ur"(\{\{[^}]*\}\})|(\[\[[^\]]*\]\])|(\=\=\{[^}]*\})|(\=\=[^=]*\=\=)|(<ref\b[^>]*>(.*?)</ref>)|(<ht[^>]*>)|(\[ht[^\]]*\])|(\{[^}]*\})|(<\ref>)|(<sup>)|(</sup>)|(</sub>)|(<sub>)|(</br>)|(<br>)|(<math>)|(</math>)"
    punctuationPattern = ur"[*|,\-#!<&>_+{:/$\\=()?.،'}%\";\[\]]"

    with open(dump_in, 'r') as infile:
        with open(dump_out, 'w') as outfile:

            # iterate through the xml tree looking for tag starts
            context = etree.iterparse(infile, events=('start', 'end'))
            context = iter(context)
            event, root = context.next()

            for event, elem in context:

                # if the tag matches the wiki tag for text content, we extract the text
                if event == 'end' and elem.tag == text_tag:

                    text = elem.text
                    #print(text)

                    # some text tags are empty
                    if text:

                        text = re.sub(junkPattern, '', text)

                        if remove_non_arabic:
                            text = norm.normalize_charset(text)

                        # move each sentence to a new line (rough regex)
                        if split_at_punc:
                            text = re.sub(r'[.!?]$', '\n', text)

                        text = re.sub(punctuationPattern, '', text)

                        for line in text.split('\n'):
                            if line.strip() != '':
                                outfile.write((line + '\n').encode('utf8'))

                    # keep memory free of previous branches of the xml tree
                    root.clear()

    return dump_out
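As a usage sketch for the parser above, assuming the module is importable as arapy.arwiki (the diff does not show the file name) and that an unzipped arwiki XML dump sits at the hypothetical path below:

# Hypothetical driver for the parser above; file names and module path are assumptions.
from arapy.arwiki import parse_arwiki_dump

out_path = parse_arwiki_dump('arwiki-latest-pages-articles.xml',  # unzipped MediaWiki XML dump
                             'arwiki_sentences.txt',              # one-sentence-per-line output
                             split_at_punc=True,
                             remove_non_arabic=True)
print(out_path)  # the function returns the output file name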
@@ -0,0 +1,9 @@
Current optimal parameterization for generating Arabic word vectors (tested on wiki data):

CBOW, window=5, dim=200, neg/samp=25/1e-4, 15+ iterations, lemmatized words

Some work that I've read uses 100 dimensions; I think 200 is better for large data sets.

For extremely large data sets, some papers hypothesize that skip-grams may work better; I have seen no evidence of this.

Similarly, larger data sets may be able to take advantage of higher-dimensional vectors.
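A sketch of how that parameterization might be passed to gensim's Word2Vec, assuming gensim >= 4.0 keyword names (size/iter in older releases) and a hypothetical one-sentence-per-line file of lemmatized text such as the parser above produces:

# CBOW, window=5, dim=200, negative=25, sample=1e-4, 15 iterations, as noted above.
# 'lemmatized_sentences.txt' is a hypothetical one-sentence-per-line corpus.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence('lemmatized_sentences.txt')
model = Word2Vec(sentences=sentences,
                 sg=0,             # CBOW (sg=1 would select skip-gram)
                 window=5,
                 vector_size=200,  # 'size' in gensim < 4.0
                 negative=25,
                 sample=1e-4,
                 epochs=15,        # 'iter' in gensim < 4.0
                 min_count=5)
model.save('arwiki_cbow_200.w2v')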
Binary file not shown.