Commit: added Madamira Arapy
WissamAntoun committed Jul 27, 2020
1 parent adfad82 commit 3f0c5bc
Showing 15 changed files with 1,573 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitignore
@@ -127,3 +127,5 @@ dmypy.json

# Pyre type checker
.pyre/

Madamira-Arapy/arapy/resources/
7 changes: 7 additions & 0 deletions Madamira-Arapy/arapy/README.md
@@ -0,0 +1,7 @@
# Arapy
Arabic text processing tools for Python - a work in progress.

# Dependencies
gensim for word2vec: pip install gensim
goslate for translation: pip install goslate
MADAMIRA package for NLP processing: http://nlp.ldeo.columbia.edu/madamira/
Empty file.
4 changes: 4 additions & 0 deletions Madamira-Arapy/arapy/arapy.py
@@ -0,0 +1,4 @@
### Arapy module!

from __future__ import absolute_import
from __future__ import print_function
63 changes: 63 additions & 0 deletions Madamira-Arapy/arapy/arwiki.py
@@ -0,0 +1,63 @@
#!/usr/bin/env python
# coding: utf-8

### Purpose: Tools to parse arwiki dumps

from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals  # keeps the raw regex patterns below unicode on Python 2

import arapy.normalization as norm
import re
import codecs
import xml.etree.ElementTree as etree  # cElementTree is deprecated and removed in Python 3.9

def parse_arwiki_dump(dump_in, dump_out, split_at_punc=False, remove_non_arabic=False):
    """
    Reads in an unzipped arwiki dump.
    Saves the text of the articles in a txt file with one sentence per line.
    Returns the name of the output file.
    """
    # text tag that wiki uses to identify text content blocks
    text_tag = '{http://www.mediawiki.org/xml/export-0.10/}text'

    # strips templates, wiki links, section headings, <ref> blocks, raw urls, and stray html tags
    junkPattern = r"(\{\{[^}]*\}\})|(\[\[[^\]]*\]\])|(\=\=\{[^}]*\})|(\=\=[^=]*\=\=)|(<ref\b[^>]*>(.*?)</ref>)|(<ht[^>]*>)|(\[ht[^\]]*\])|(\{[^}]*\})|(</ref>)|(<sup>)|(</sup>)|(</sub>)|(<sub>)|(</br>)|(<br>)|(<math>)|(</math>)"
    punctuationPattern = r"[*|,\-#!<&>_+{:/$\\=()?.،'}%\";\[\]]"

    # iterparse expects a byte stream; the output is written as utf-8 text
    with open(dump_in, 'rb') as infile:
        with codecs.open(dump_out, 'w', encoding='utf-8') as outfile:

            # iterate through the xml tree looking for tag starts
            context = etree.iterparse(infile, events=('start', 'end'))
            context = iter(context)
            event, root = next(context)

            for event, elem in context:

                # if the tag matches the wiki tag for text content, we extract the text
                if event == 'end' and elem.tag == text_tag:

                    text = elem.text

                    # some text tags are empty
                    if text:

                        text = re.sub(junkPattern, '', text)

                        if remove_non_arabic:
                            text = norm.normalize_charset(text)

                        # move each sentence to a new line (rough: breaks at every . ! or ?)
                        if split_at_punc:
                            text = re.sub(r'[.!?]', '\n', text)

                        text = re.sub(punctuationPattern, '', text)

                        for line in text.split('\n'):
                            if line.strip() != '':
                                outfile.write(line + '\n')

                # keep memory free of previous branches of the xml tree
                root.clear()

    return dump_out
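
A minimal usage sketch for this function (not part of the commit; the file names are placeholder assumptions):

# hypothetical driver script: parse an unzipped arwiki dump into
# one-sentence-per-line utf-8 text
from arapy.arwiki import parse_arwiki_dump

out_path = parse_arwiki_dump('arwiki-latest-pages-articles.xml',
                             'arwiki_sentences.txt',
                             split_at_punc=True,
                             remove_non_arabic=True)
print(out_path)  # echoes the output file name that was passed in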
9 changes: 9 additions & 0 deletions Madamira-Arapy/arapy/info/ResultsSummary.txt
@@ -0,0 +1,9 @@
Current optimal parameterization for generating Arabic word vectors (tested on wiki data):

CBOW, window=5, dim=200, neg/samp=25/1e-4, 15+ iterations, lemmatized words

Some work that I've read uses 100 dimensions; I think 200 is better for large data sets.

On extremely large data, some papers hypothesize that skip-grams may work better. I have seen no evidence of this.

Similarly, larger datasets may be able to take advantage of higher-dimensional vectors.
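
For reference, a minimal gensim sketch of this parameterization (not part of the commit; the corpus path is a placeholder, and the keyword names follow the gensim 3.x API):

# trains CBOW vectors with the settings listed above on a lemmatized,
# one-sentence-per-line corpus (placeholder path)
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence('arwiki_lemmatized.txt')
model = Word2Vec(sentences,
                 sg=0,          # CBOW rather than skip-gram
                 window=5,
                 size=200,      # dim=200 (renamed vector_size in gensim 4.x)
                 negative=25,   # neg=25
                 sample=1e-4,   # samp=1e-4
                 iter=15)       # 15+ iterations (renamed epochs in gensim 4.x)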
Binary file added Madamira-Arapy/arapy/info/accuracy-notes.xlsx
Binary file not shown.