Commit
1 parent adfad82, commit 3f0c5bc
Showing 15 changed files with 1,573 additions and 1 deletion.
@@ -127,3 +127,5 @@ dmypy.json

# Pyre type checker
.pyre/

Madamira-Arapy/arapy/resources/
@@ -0,0 +1,7 @@
# Arapy
Arabic text processing tools for python - A work in progress.

# Dependencies
gensim for word2vec: pip install gensim
goslate for translation: pip install goslate
madamira package for nlp processing: http://nlp.ldeo.columbia.edu/madamira/
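The dependency lines above double as install instructions; below is a minimal smoke-test sketch for the two pip-installable packages, assuming gensim >= 4.0 and a working goslate install (the toy corpus and strings are illustrative, not part of the repository).

# Smoke test for the pip-installable dependencies listed above.
# Assumes gensim >= 4.0 (vector_size keyword) and that goslate can reach its backend.
import goslate
from gensim.models import Word2Vec

# goslate: translate an English string into Arabic ('ar' language code)
gs = goslate.Goslate()
print(gs.translate('text processing', 'ar'))

# gensim: train a toy word2vec model on a tiny in-memory corpus
toy_corpus = [['arabic', 'text', 'processing'], ['word', 'vectors', 'from', 'text']]
model = Word2Vec(sentences=toy_corpus, vector_size=10, min_count=1)
print(model.wv['text'])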
Empty file.
@@ -0,0 +1,4 @@
### Arapy module!

from __future__ import absolute_import
from __future__ import print_function
@@ -0,0 +1,63 @@
#!/usr/bin/env python
# coding: utf-8

### Purpose: Tools to parse arwiki dumps

from __future__ import absolute_import
from __future__ import print_function

import arapy.normalization as norm
import re
import sys
import codecs
import xml.etree.cElementTree as etree

def parse_arwiki_dump(dump_in, dump_out, split_at_punc=False, remove_non_arabic=False):
    """
    Reads in an unzipped arwiki dump.
    Saves the text of the articles in a txt file with one sentence per line.
    returns the name of the output file
    """
    # text tag that wiki uses to identify text content blocks
    text_tag = '{http://www.mediawiki.org/xml/export-0.10/}text'
    junkPattern = ur"(\{\{[^}]*\}\})|(\[\[[^\]]*\]\])|(\=\=\{[^}]*\})|(\=\=[^=]*\=\=)|(<ref\b[^>]*>(.*?)</ref>)|(<ht[^>]*>)|(\[ht[^\]]*\])|(\{[^}]*\})|(<\ref>)|(<sup>)|(</sup>)|(</sub>)|(<sub>)|(</br>)|(<br>)|(<math>)|(</math>)"
    punctuationPattern = ur"[*|,\-#!<&>_+{:/$\\=()?.،'}%\";\[\]]"

    with open(dump_in, 'r') as infile:
        with open(dump_out, 'w') as outfile:

            # iterate through the xml tree looking for tag starts
            context = etree.iterparse(infile, events=('start', 'end'))
            context = iter(context)
            event, root = context.next()

            for event, elem in context:

                # if the tag matches the wiki tag for text content, we extract the text
                if event == 'end' and elem.tag == text_tag:

                    text = elem.text
                    #print(text)

                    # some text tags are empty
                    if text:

                        text = re.sub(junkPattern, '', text)

                        if remove_non_arabic:
                            text = norm.normalize_charset(text)

                        # move each sentence to a new line (rough regex)
                        if split_at_punc:
                            text = re.sub(r'[.!?]$', '\n', text)

                        text = re.sub(punctuationPattern, '', text)

                        for line in text.split('\n'):
                            if line.strip() != '':
                                outfile.write((line + '\n').encode('utf8'))

                    # keep memory free of previous branches of the xml tree
                    root.clear()

    return dump_out
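As a usage sketch for the parser above, assuming the module is importable as arapy.arwiki (the diff does not show the file name) and that an unzipped arwiki XML dump sits at the hypothetical path below:

# Hypothetical driver for the parser above; file names and module path are assumptions.
from arapy.arwiki import parse_arwiki_dump

out_path = parse_arwiki_dump('arwiki-latest-pages-articles.xml',  # unzipped MediaWiki XML dump
                             'arwiki_sentences.txt',              # one-sentence-per-line output
                             split_at_punc=True,
                             remove_non_arabic=True)
print(out_path)  # the function returns the output file name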
@@ -0,0 +1,9 @@
Current optimal parameterization for generating Arabic word vectors (tested on wiki data):

CBOW, window=5, dim=200, neg/samp=25/1e-4, 15+ iterations, lemmatized words

Some work that I've read uses 100 dimensions; I think 200 is better for large data sets.

For extremely large data sets, some papers hypothesize that skip-grams may work better; I have seen no evidence of this.

Similarly, larger data sets may be able to take advantage of higher-dimensional vectors.
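A sketch of how that parameterization might be passed to gensim's Word2Vec, assuming gensim >= 4.0 keyword names (size/iter in older releases) and a hypothetical one-sentence-per-line file of lemmatized text such as the parser above produces:

# CBOW, window=5, dim=200, negative=25, sample=1e-4, 15 iterations, as noted above.
# 'lemmatized_sentences.txt' is a hypothetical one-sentence-per-line corpus.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence('lemmatized_sentences.txt')
model = Word2Vec(sentences=sentences,
                 sg=0,             # CBOW (sg=1 would select skip-gram)
                 window=5,
                 vector_size=200,  # 'size' in gensim < 4.0
                 negative=25,
                 sample=1e-4,
                 epochs=15,        # 'iter' in gensim < 4.0
                 min_count=5)
model.save('arwiki_cbow_200.w2v')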
Binary file not shown.