diff --git a/.gitignore b/.gitignore
index b6e4761..30c3be0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,5 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+Madamira-Arapy/arapy/resources/
\ No newline at end of file
diff --git a/Madamira-Arapy/arapy/README.md b/Madamira-Arapy/arapy/README.md
new file mode 100644
index 0000000..169c63f
--- /dev/null
+++ b/Madamira-Arapy/arapy/README.md
@@ -0,0 +1,7 @@
+# Arapy
+Arabic text processing tools for Python - a work in progress.
+
+# Dependencies
+gensim for word2vec: pip install gensim
+goslate for translation: pip install goslate
+madamira package for NLP processing: http://nlp.ldeo.columbia.edu/madamira/
diff --git a/Madamira-Arapy/arapy/__init__.py b/Madamira-Arapy/arapy/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Madamira-Arapy/arapy/arapy.py b/Madamira-Arapy/arapy/arapy.py
new file mode 100644
index 0000000..c6f7b83
--- /dev/null
+++ b/Madamira-Arapy/arapy/arapy.py
@@ -0,0 +1,4 @@
+### Arapy module!
+
+from __future__ import absolute_import
+from __future__ import print_function
diff --git a/Madamira-Arapy/arapy/arwiki.py b/Madamira-Arapy/arapy/arwiki.py
new file mode 100644
index 0000000..c208f72
--- /dev/null
+++ b/Madamira-Arapy/arapy/arwiki.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+### Purpose: Tools to parse arwiki dumps
+
+from __future__ import absolute_import
+from __future__ import print_function
+
+import arapy.normalization as norm
+import re
+import sys
+import codecs
+import xml.etree.cElementTree as etree
+
+def parse_arwiki_dump(dump_in, dump_out, split_at_punc=False, remove_non_arabic=False):
+    """
+    Reads in an unzipped arwiki dump.
+    Saves the text of the articles in a txt file with one sentence per line.
+    returns the name of the output file
+    """
+    # text tag that wiki uses to identify text content blocks
+    text_tag = '{http://www.mediawiki.org/xml/export-0.10/}text'
+    junkPattern = ur"(\{\{[^}]*\}\})|(\[\[[^\]]*\]\])|(\=\=\{[^}]*\})|(\=\=[^=]*\=\=)|(]*>(.*?))|(]*>)|(\[ht[^\]]*\])|(\{[^}]*\})|(<\ref>)|()|()|()|()|(
)|(
)|()|()" + punctuationPattern = ur"[*|,\-#!<&>_+{:/$\\=()?.،'}%\";\[\]]" + + with open(dump_in, 'r') as infile: + with open(dump_out, 'w') as outfile: + + # iterate through the xml tree looking for tag starts + context = etree.iterparse(infile, events = ('start','end')) + context = iter(context) + event, root = context.next() + + for event, elem in context: + + # if the tag matches the wiki tag for text content, we extract the text + if event == 'end' and elem.tag == text_tag: + + text = elem.text + #print(text) + + # some text tags are empty + if text: + + text = re.sub(junkPattern, '', text) + + if remove_non_arabic: + text = norm.normalize_charset(text) + + # move each sentence to a new line (rough regex) + if split_at_punc: + text = re.sub(r'[.!?]$', '\n', text) + + text = re.sub(punctuationPattern, '', text) + + for line in text.split('\n'): + if line.strip() != '': + outfile.write((line+'\n').encode('utf8')) + + # keep memory free of previous branches of the xml tree + root.clear() + + return dump_out \ No newline at end of file diff --git a/Madamira-Arapy/arapy/info/ResultsSummary.txt b/Madamira-Arapy/arapy/info/ResultsSummary.txt new file mode 100644 index 0000000..3fb6b4b --- /dev/null +++ b/Madamira-Arapy/arapy/info/ResultsSummary.txt @@ -0,0 +1,9 @@ +Current optimal parameterization for generating arabic word vectors (tested on wiki data): + +CBOW, window=5, dim=200, neg/samp=25/1e-4, 15+ iterations, lemmatized words + +Some work that I've read uses 100 dim, I think 200 is better for large data sets. + +In extremely large data, some papers hypothesize that skipgrams may work better. I have seen no evidence of this. + +Similarly, larger datasets may be able to take advantage of higher dimensional vectors. \ No newline at end of file diff --git a/Madamira-Arapy/arapy/info/accuracy-notes.xlsx b/Madamira-Arapy/arapy/info/accuracy-notes.xlsx new file mode 100644 index 0000000..82c16db Binary files /dev/null and b/Madamira-Arapy/arapy/info/accuracy-notes.xlsx differ diff --git a/Madamira-Arapy/arapy/madamira.py b/Madamira-Arapy/arapy/madamira.py new file mode 100644 index 0000000..8f32b3f --- /dev/null +++ b/Madamira-Arapy/arapy/madamira.py @@ -0,0 +1,859 @@ +#!/usr/bin/env python +# coding: utf-8 + +### Purpose: Madamira output processing tools + +from __future__ import absolute_import +from __future__ import print_function +from xml.etree.cElementTree import iterparse + +import arapy +import arapy.normalization as norm +import codecs +import csv +import numpy as np +import os +import re +import requests +import socket +import io +import subprocess +import time + +MADAPORT = 8223 +#94223 + +class Madamira: + url="http://localhost:" + str(MADAPORT) + headers = {'Content-Type': 'application/xml'} + xml_prefix=""" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \n""" + xml_seg_start = """\n""" + xml_seg_end = """\n\n""" + xml_suffix = """ + + """ + config_prefix="{urn:edu.columbia.ccls.madamira.configuration:0.1}" + + def __enter__(self): + self.start_server() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.stop_server() + + def start_server(self): + cwd = os.getcwd() + os.chdir(os.path.dirname(arapy.__file__)+"/resources/MADAMIRA-release-20170403-2.1/") + + self.pid = subprocess.Popen(['java', + '-Xmx2500m', + '-Xms2500m', + '-XX:NewRatio=3', + '-jar', + 'MADAMIRA-release-20170403-2.1.jar', + '-s', + '-msaonly']) + + print("Waiting for madamira to initialize.") + time.sleep(10) + + sock 
= socket.socket(socket.AF_INET, socket.SOCK_STREAM) + result = sock.connect_ex(('localhost',MADAPORT)) + while(result != 0): + sock.close() + time.sleep(1) + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + result = sock.connect_ex(('localhost',MADAPORT)) + + os.chdir(cwd) + + self.session = requests.Session() + + def stop_server(self): + self.session.close() + self.pid.kill() + print("Shut down MADAMIRA.") + + def process(self, text): + """ Returns madamira xml output for a string input """ + + query = io.StringIO() + query.write(Madamira.xml_prefix) + + for sentence in text: + query.write(Madamira.xml_seg_start) + query.write(sentence) + query.write(Madamira.xml_seg_end) + + query.write(Madamira.xml_suffix) + query.seek(0) + + response = self.session.post(Madamira.url, headers=Madamira.headers, data=query.read().encode('utf8')) + + response.encoding = "utf8" + + return MadamiraOutput(response.text) + + def process_sentence(self, text): + """ Returns madamira xml output for a word string input """ + + query = io.StringIO() + query.write(Madamira.xml_prefix) + + query.write(Madamira.xml_seg_start) + query.write(text) + query.write(Madamira.xml_seg_end) + + query.write(Madamira.xml_suffix) + query.seek(0) + + response = requests.post(Madamira.url, headers=Madamira.headers, data=query.read().encode('utf8')) + + response.encoding = "utf8" + + return [word for doc in MadamiraOutput(response.text).docs() for sent in doc.sentences() for word in sent.words()] + +class MadamiraOutput: + def __init__(self, xmltext): + self.xml = xmltext.encode("utf8") + + def docs(self): + # madamira config prefix + mp=Madamira.config_prefix + + # get an iterable TODO use raw string? + # wrapper = codecs.StreamReader(io.StringIO(self.xml), "utf8") + context = iterparse(io.BytesIO(self.xml), events=("start", "end")) + + # turn it into an iterator + context = iter(context) + + # get the root element + event, root = context.__next__() + + for event, elem in context: + + # parse each doc + if event == 'end' and elem.tag == mp+'out_doc': + + yield MadamiraDoc(elem) + + # don't keep the doc in memory + root.clear()#find(mp+'madamira_output').clear() + +class MadamiraDoc: + def __init__(self, elem): + self.elem = elem + + def sentences(self): + mp = Madamira.config_prefix + + for sentence in self.elem.iter(mp+'out_seg'): + + yield MadamiraSentence(sentence) + +class MadamiraSentence: + def __init__(self, sentence): + self.sentence = sentence + + def words(self): + mp = Madamira.config_prefix + + for word in self.sentence.find(mp+'word_info').iter(mp+'word'): + + yield MadamiraWord(word) + + def chunks(self): + mp = Madamira.config_prefix + + # should just be one segment_info per out_seg + # parse each chunk in segment, looking for noun phrases + for chunk in sentence.find(mp+'segment_info').find(mp+'bpc').iter(mp+'chunk'): + yield MadamiraChunk(chunk) + +class MadamiraWord: + """ + Use the get_attribute function to return a selected attribute + + Example XML Word: + + + + + + + + + + + + + + + + + + + """ + svm_predictions_attribute_names_ = ['diac', 'lemma', 'pos', 'prc3', 'prc2', 'prc1', 'prc0', 'per', 'asp', 'vox', 'mod', 'gen', 'num', 'stt', 'cas', 'enc0'] + + def __init__(self, word): + self.word = word + + + + def lemma(self): + mp = Madamira.config_prefix + + # grab the lemma data + lemma = self.word.find(mp+'svm_prediction').find(mp+'morph_feature_set').get('lemma') + + # strip down to the arabic script + if not lemma: + return "" + elif len(lemma) == 0: + return "" + else: + norm_lemma = 
norm.normalize_charset(lemma).strip() + if len(norm_lemma) == 0: + return lemma + else: + return norm_lemma + + def get_orig_word(self): + mp = Madamira.config_prefix + + return self.word.attrib['word'] + + + def pos(self): + mp = Madamira.config_prefix + + return self.word.find(mp+'svm_prediction').find(mp+'morph_feature_set').get('pos') + + def pos_gender(self): + mp = Madamira.config_prefix + + return self.word.find(mp+'svm_prediction').find(mp+'morph_feature_set').get('gen') + + def get_attribute(self, attribute=''): + mp = Madamira.config_prefix + return self.word.find(mp+'svm_prediction').find(mp+'morph_feature_set').get(attribute) + + def tokens(self): + mp = Madamira.config_prefix + + # grab the lemma data + tokens = [] + for token in self.word.find(mp+"tokenized[@scheme='MyD3']").iter(mp+'tok'): + tokens.append(token.get('form0')) + + return tokens + + +class MadamiraChunk: + def __init__(self, chunk): + self.chunk = chunk + + def type(self): + return self.get('type') + + # def tokens(): + # # combine tokens into phrase + # for tok in chunk.iter(mp+'tok'): + # segment = tok.get('form0') + + # if segment[-1] == '+': + # noun_phrase += segment[:-1] + # elif segment[0] == '+': + # # if it is a suffix prep, attach it to prev token + # if len(noun_phrase) > 0: + # noun_phrase = noun_phrase[:-1] + segment[1:] + # else: + # noun_phrase = segment[1:] + # else: + # noun_phrase += segment + '_' + + # # drop the last underscore and add to the np sentence + # if noun_phrase[-1] == '_': + # noun_phrase = noun_phrase[:-1] + # sent += noun_phrase+' ' + + + +def transform_sentence_file(sentence_file, + lemmaout="lemmas.txt", + tokenout="token.txt", + posout="pos.txt", + lemmas=True, + pos=False, + tokens=False): + """returns filenames of lemmas and pos files""" + with Madamira() as m: + + # open output files + lemma_out = None + lemma_buff = None + if lemmas: + lemma_buff = io.StringIO() + lemma_file = lemmaout + lemma_out = open(lemma_file, 'w') + + pos_out = None + pos_buff = None + if pos: + pos_buff = io.StringIO() + pos_file = posout + pos_out = open(pos_file, 'w') + + token_out = None + token_buff = None + if tokens: + token_buff = io.StringIO() + token_file = tokenout + token_out = open(token_file, 'w') + + # read files into a list, or buffer the sentences one at a time, of sentences + # sentence_list = open(sentence_file).read().splitlines() + with open(sentence_file, 'r') as sentences: + for sentence in sentences: + + out = m.process([sentence]) + + for doc in out.docs(): + for sent in doc.sentences(): + + for word in sent.words(): + if lemmas: + lemma_buff.write(word.lemma()) + lemma_buff.write(" ") + if pos: + pos_buff.write(word.pos()) + pos_buff.write(" ") + if tokens: + for token in word.tokens(): + token_buff.write(token) + token_buff.write(" ") + + # for chunk in sent.chunks() + # if tokens: + # token_list = word.tokens() + # for token in token_list: + # token_buff.write(token) # TODO + # token_buff.write(" ") + + if lemmas: + + lemma_buff.seek(0) + lemma_out.write(lemma_buff.read().rstrip().encode('utf8')) + lemma_out.write('\n') + lemma_buff.close() + lemma_buff = io.StringIO() + + if pos: + pos_buff.seek(0) + pos_out.write(pos_buff.read().rstrip().encode('utf8')) + pos_out.write('\n') + pos_buff.close() + pos_buff = io.StringIO() + + if tokens: + token_buff.seek(0) + token_out.write(token_buff.read().rstrip().encode('utf8')) + token_out.write('\n') + token_buff.close() + token_buff = io.StringIO() + + if lemmas: + lemma_buff.close() + lemma_out.close() + if pos: + 
pos_buff.close() + pos_out.close() + if tokens: + token_buff.close() + token_out.close() + + + return [lemma_file, pos_file, token_file] + + + + +# def save_lemmatization(xml_mada_fn, out_fn): +# """ +# Saves a lemmatization from a madamira xml output file +# """ + +# # open the output file +# outfile = codecs.open(out_fn, 'w', "utf-8") + +# # madamira config prefix +# mp='{urn:edu.columbia.ccls.madamira.configuration:0.1}' + +# # get an iterable +# context = iterparse(xml_mada_fn, events=("start", "end")) + +# # turn it into an iterator +# context = iter(context) + +# # get the root element +# event, root = context.next() + +# for event, elem in context: + +# # parse each sentence +# if event == 'end' and elem.tag == mp+'out_seg': + +# # construct the sentence, then write once per sentence +# sent = '' + +# # should just be one word_info per out_seg +# # parse each word in word_info +# for word in elem.find(mp+'word_info').iter(mp+'word'): + +# # grab the lemma data +# lemma = word.find(mp+'svm_prediction').find(mp+'morph_feature_set').get('lemma') + +# # strip all but arabic script TODO +# lemma = lemma.split('_')[0] + +# # normalize the script +# lemma = norm.normalize(lemma) + +# sent += lemma +# sent += ' ' + +# # write the sentence out (without last space) +# outfile.write(sent[:-1]+'\n') + +# # don't keep the sentence in memory +# root.find(mp+'out_doc').clear() + +# elif event == 'end' and elem.tag == mp+'out_doc': +# outfile.write('#ENDDOC#\n') + +# def save_noun_phrases(xml_mada_fn, out_fn): +# """ +# Saves noun phrases from a madamira xml output file +# """ + +# # open the output file +# outfile = codecs.open(out_fn, 'w', "utf-8") + +# # madamira config prefix +# mp='{urn:edu.columbia.ccls.madamira.configuration:0.1}' + +# # get an iterable +# context = iterparse(xml_mada_fn, events=("start", "end")) + +# # turn it into an iterator +# context = iter(context) + +# # get the root element +# event, root = context.next() + +# for event, elem in context: + +# # parse each sentence +# if event == 'end' and elem.tag == mp+'out_seg': + +# # construct the sentence, then write once per sentence +# sent = '' + +# # should just be one segment_info per out_seg +# # parse each chunk in segment, looking for noun phrases +# for chunk in elem.find(mp+'segment_info').find(mp+'bpc').iter(mp+'chunk'): + +# # identify noun phrases +# if chunk.get('type') == 'NP': + +# # we build noun phrases with underscores between words +# noun_phrase = '' + +# # combine tokens into phrase +# for tok in chunk.iter(mp+'tok'): +# segment = tok.get('form0') + +# if segment[-1] == '+': +# noun_phrase += segment[:-1] +# elif segment[0] == '+': +# # if it is a suffix prep, attach it to prev token +# if len(noun_phrase) > 0: +# noun_phrase = noun_phrase[:-1] + segment[1:] +# else: +# noun_phrase = segment[1:] +# else: +# noun_phrase += segment + '_' + +# # drop the last underscore and add to the np sentence +# if noun_phrase[-1] == '_': +# noun_phrase = noun_phrase[:-1] +# sent += noun_phrase+' ' + +# # write the noun phrase sentence out (without last space) +# outfile.write(sent[:-1]+'\n') + +# # don't keep the segment in memory +# root.find(mp+'out_doc').clear() + +# elif event == 'end' and elem.tag == mp+'out_doc': +# outfile.write('#ENDDOC#\n') + +# def save_noun_phrase_graph(xml_mada_fn, out_fn, window = 5): +# """ +# TODO figure out how expensive this is, implement in scala/spark next +# Saves a noun phrase graph from a madamira xml output file +# """ + +# # edges (nodeid, nodeid, dist) +# edges = [] + +# # 
vertices (long hash(str) : str (noun phrase)) +# vertices = {} + +# # mentions list +# mentions_list = [] + +# # distance to add edges at +# distance = 10 + +# # open the output file +# outfile = codecs.open(out_fn, 'w', "utf-8") + +# # madamira config prefix +# mp='{urn:edu.columbia.ccls.madamira.configuration:0.1}' + +# # get an iterable +# context = iterparse(xml_mada_fn, events=("start", "end")) + +# # turn it into an iterator +# context = iter(context) + +# # get the root element +# event, root = context.next() + +# # document token tracking +# tokens_so_far = 0 + +# for event, elem in context: + +# # parse each sentence +# if event == 'end' and elem.tag == mp+'out_seg': + +# # should just be one segment_info per out_seg +# # parse each chunk in segment, looking for noun phrases +# for chunk in elem.find(mp+'segment_info').find(mp+'bpc').iter(mp+'chunk'): + +# # identify noun phrases +# if chunk.get('type') == 'NP': + +# # we build noun phrases with underscores between words +# noun_phrase = '' + +# # noun phrase starts on next token +# noun_phrase_start = tokens_so_far + 1 + +# # combine tokens into phrase +# for tok in chunk.iter(mp+'tok'): + +# tokens_so_far += 1 + +# segment = tok.get('form0') + +# # builds phrase +# if segment[-1] == '+': +# noun_phrase += segment[:-1] +# elif segment[0] == '+': +# # if it is a suffix prep, attach it to prev token +# if len(noun_phrase) > 0: +# noun_phrase = noun_phrase[:-1] + segment[1:] +# else: +# noun_phrase = segment[1:] +# else: +# noun_phrase += segment + '_' + +# # drop the last underscore and add to the np sentence +# noun_phrase = noun_phrase.strip('_') + +# # noun phrase ended on last token +# noun_phrase_end = tokens_so_far + +# np_hash = hash(noun_phrase) +# vertices[np_hash] = noun_phrase + +# mentions_list.append([np_hash, noun_phrase_start, noun_phrase_end]) + +# else: +# for tok in chunk.iter(mp+'tok'): +# tokens_so_far += 1 + +# # don't keep the segment in memory +# root.find(mp+'out_doc').clear() + +# elif event == 'end' and elem.tag == mp+'out_doc': +# # add edges from last document +# for start in range(0, len(mentions_list) - 10): +# end = start + 10 +# head = start +# tail = start + 1 +# in_range = True +# while in_range: +# if abs(mentions_list[tail][1] - mentions_list[head][2]) <= distance: +# if (mentions_list[tail][1]-mentions_list[head][2] > 0): +# dist = mentions_list[tail][1]-mentions_list[head][2] +# else: +# dist = 0 + +# edges.append([mentions_list[head][0], mentions_list[tail][0], dist]) +# edges.append([mentions_list[tail][0], mentions_list[head][0], dist]) + +# tail += 1 + +# else: +# in_range = False + +# np.savetxt("edges.csv", np.array(edges), delimiter=",", fmt='%i') +# writer = csv.writer(open('vertices.csv','wb')) +# for key, value in vertices.items(): +# writer.writerow([key, value.encode('utf-8')]) + +# def raw_save_lemmatization(raw_mada_fn, out_fn): +# """ +# Saves a lemmatization from a madamira raw output file +# """ +# mada = codecs.open(mada_fn, 'r', "utf-8") + +# p = re.compile(r'lex:[^\s_]+', re.UNICODE) +# start_of_line = True + +# with codecs.open(out_fn, 'w', 'utf-8') as outfile: +# for line in mada: + +# if line == 'SENTENCE BREAK\n': +# outfile.write('\n') +# start_of_line = True +# elif line.startswith('*'): +# m = p.findall(line) +# if m: +# print(m) +# if not start_of_line: +# outfile.write(' ') +# outfile.write(norm.normalize(m[0][4:])) +# start_of_line = False + +# def raw_save_noun_phrase_graph(raw_bpcbio_fn, out_fn, window = 5): +# """ +# TODO figure out how expensive this is, 
implement in scala/spark next +# Saves a noun phrase graph from a madamira raw output file +# """ + +# # edges (nodeid, nodeid, dist) +# edges = [] + +# # vertices (long hash(str) : str (noun phrase)) +# vertices = {} + +# # mentions list +# mentions_list = [] + +# # distance to add edges at +# distance = 10 + +# # open the output file +# outfile = codecs.open(out_fn, 'w', "utf-8") + +# # get an iterable +# context = codecs.open(raw_bpcbio_fn, 'r', 'utf-8') + +# # document token tracking +# tokens_so_far = 0 + +# # keep track of NP construction +# noun_phrase = '' +# noun_phrase_start = 0 +# noun_phrase_end = 0 + +# # count sentences if not broken into docs +# sentence_idx = 1 + +# # count docs +# doc_count = 1 + +# with codecs.open('raw_edges.csv', 'a') as edge_file: +# with codecs.open(raw_bpcbio_fn, 'r', 'utf-8') as context: +# while True: +# line = context.readline() + +# if not line or sentence_idx % 100 == 0: + +# sentence_idx = 1 +# print('Doc: ', doc_count, 'Vertices: ', len(vertices)) +# doc_count += 1 + +# if noun_phrase != '': +# noun_phrase = noun_phrase.strip('_') +# noun_phrase_end = tokens_so_far - 1 +# np_hash = hash(noun_phrase) +# vertices[np_hash] = noun_phrase +# mentions_list.append([np_hash, noun_phrase_start, noun_phrase_end]) +# noun_phrase = '' + +# # edge_count = 0 + +# # add edges from last document +# for start in range(0, len(mentions_list) - 10): +# # end = start + 10 +# head = start +# tail = start + 1 +# in_range = True +# while in_range and tail < len(mentions_list): +# if abs(mentions_list[tail][1] - mentions_list[head][2]) <= distance: +# if (mentions_list[tail][1]-mentions_list[head][2] > 0): +# dist = mentions_list[tail][1]-mentions_list[head][2] +# else: +# dist = 0 + +# edges.append([mentions_list[head][0], mentions_list[tail][0], dist]) +# edges.append([mentions_list[tail][0], mentions_list[head][0], dist]) + +# # edge_count += 2 +# tail += 1 + +# else: +# in_range = False + +# # print('Adding: ', edge_count, ' edges.') + +# np.savetxt(edge_file, np.array(edges), delimiter=",", fmt='%i') + + +# edges = [] +# mentions_list = [] + +# if not line: +# writer = csv.writer(open('raw_vertices.csv','wb')) +# for key, value in vertices.items(): +# writer.writerow([key, value.encode('utf-8')]) +# break + + +# if line.strip() == "": + +# # end of sentence +# sentence_idx += 1 + +# else: + +# tokens_so_far += 1 + +# text, bpc_type = line.strip().split("\t") + +# if (bpc_type == 'B-NP' or bpc_type != 'I-NP') and noun_phrase != '': + +# noun_phrase = noun_phrase.strip('_') +# noun_phrase_end = tokens_so_far - 1 +# np_hash = hash(noun_phrase) +# vertices[np_hash] = noun_phrase +# mentions_list.append([np_hash, noun_phrase_start, noun_phrase_end]) +# noun_phrase = '' + +# if bpc_type == 'B-NP': + +# # noun phrase starts on this token +# noun_phrase_start = tokens_so_far + +# if text[-1] == '+': +# noun_phrase += text[:-1] +# elif text[0] == '+': +# noun_phrase = text[1:] +# else: +# noun_phrase += text + '_' + +# elif bpc_type == 'I-NP': + +# if text[-1] == '+': +# noun_phrase += text[:-1] +# elif text[0] == '+': +# # if it is a suffix prep, attach it to prev token +# if len(noun_phrase) > 0: +# noun_phrase = noun_phrase[:-1] + text[1:] +# else: +# noun_phrase = text[1:] +# else: +# noun_phrase += text + '_' \ No newline at end of file diff --git a/Madamira-Arapy/arapy/normalization.py b/Madamira-Arapy/arapy/normalization.py new file mode 100644 index 0000000..544c05a --- /dev/null +++ b/Madamira-Arapy/arapy/normalization.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python 
+# coding: utf-8
+
+### Purpose: Arabic script normalization tools
+
+from __future__ import absolute_import
+from __future__ import print_function
+
+import re
+import codecs
+
+# regex for arabic chars
+inv_arabic_charset = re.compile(r'[^\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff\u0030-\u0039\n\.]+', re.UNICODE)
+
+def normalize(text, ar_only=True, digits=False, alif=True, hamza=True, yaa=True, tashkil=True):
+    """
+    Normalizes arabic text
+    Removes non-arabic chars by default
+    Changes all numerals to # if digits is true, default false
+    Normalizes alif, hamza, and yaa by default
+    Removes supplementary diacritics
+    """
+    if ar_only:
+        text = normalize_charset(text)
+
+    if digits:
+        text = normalize_digits(text)
+
+
+    if alif:
+        text = normalize_alif(text)
+    if hamza:
+        text = normalize_hamza(text)
+    if yaa:
+        text = normalize_yaa(text)
+
+    if tashkil:
+        text = remove_tashkil(text)
+
+    return text
+
+def normalize_sentence_file(sentence_file, outfile_path="normal.txt", ar_only=True, digits=True, alif=True, hamza=True, yaa=True, tashkil=True):
+    """
+    Normalizes a file of sentences and saves the result to outfile_path
+    returns the outfile name
+    """
+
+    # outfile_path = (sentence_file.split('.')[0]+
+    #                 "_ar_only"+str(ar_only)+
+    #                 "_digits"+str(digits)+
+    #                 "_alif"+str(alif)+
+    #                 "_hamza"+str(hamza)+
+    #                 "_yaa"+str(yaa)+
+    #                 "_tashkil"+str(tashkil)+
+    #                 ".txt")
+
+    with open(sentence_file, 'r') as infile:
+        with open(outfile_path, 'w') as outfile:
+            for text in infile:
+                text = text.decode('utf8')
+
+                text = normalize(text, ar_only=ar_only, digits=digits, alif=alif, hamza=hamza, yaa=yaa, tashkil=tashkil)
+
+                if text:
+                    outfile.write(text.encode('utf8'))
+
+    return outfile_path
+
+def remove_tashkil(text):
+    """ removes set of arabic supplementary diacritics """
+    text = remove_harakat(text)
+    text = remove_tanwin(text)
+    text = remove_shaddah(text)
+    text = remove_kashida(text)
+    return text
+
+#####################
+### Normalization ###
+#####################
+
+def normalize_charset(text):
+    return inv_arabic_charset.sub(' ', text)
+
+def normalize_digits(text):
+    """ replaces all forms of numbers with # """
+    return re.sub(r'[0123456789٠١٢٣٤٥٦٧٨٩]', r'#', text)
+
+def normalize_alif(text):
+    """ replaces all forms of alif with ا """
+    return re.sub(r'[إأٱآا]', r'ا', text)
+
+def normalize_yaa(text):
+    """ replaces ى with ي """
+    return re.sub(r'ى', r'ي', text)
+
+def normalize_hamza(text, normalize_alif = False):
+    """
+    replaces hamza on seats with ء
+    does not include alif seats by default
+    set normalize_alif=True to replace إأ with hamza
+    """
+    if normalize_alif:
+        return re.sub(r'[ؤئإأ]', r'ء', text)
+    else:
+        return re.sub(r'[ؤئ]', r'ء', text)
+
+#######################
+### Tashkil removal ###
+#######################
+
+def remove_harakat(text):
+    """
+    removes short vowel marks
+    does not normalize alif forms
+    does not remove tanwin (ًٌٍ) (use remove_tanwin)
+    """
+    return re.sub(r'[َُِْ]', r'', text)
+
+def remove_tanwin(text):
+    """
+    removes tanwin vowel marks
+    does not normalize alif forms
+    """
+    return re.sub(r'[ًٌٍ]', r'', text)
+
+def remove_shaddah(text):
+    """
+    removes the shaddah mark (tashdid)
+    """
+    return re.sub(r'[ّ]', r'', text)
+
+def remove_kashida(text):
+    """
+    removes the kashida elongation mark (tatwil)
+    """
+    return re.sub(r'[ـ]', r'', text)
+
+##################################
+### Buckwalter transliteration ###
+##################################
+
+def unicode_to_bw(string,
reverse=0): + """ + Given a Unicode string, transliterate into Buckwalter. + To go from Buckwalter back to Unicode, set reverse=1. + Partially taken from https://github.com/andyroberts/buckwalter2unicode + """ + + buck2uni = {"'": u"\u0621", # hamza-on-the-line + "|": u"\u0622", # madda + ">": u"\u0623", # hamza-on-'alif + "&": u"\u0624", # hamza-on-waaw + "<": u"\u0625", # hamza-under-'alif + "}": u"\u0626", # hamza-on-yaa' + "A": u"\u0627", # bare 'alif + "b": u"\u0628", # baa' + "p": u"\u0629", # taa' marbuuTa + "t": u"\u062A", # taa' + "v": u"\u062B", # thaa' + "j": u"\u062C", # jiim + "H": u"\u062D", # Haa' + "x": u"\u062E", # khaa' + "d": u"\u062F", # daal + "*": u"\u0630", # dhaal + "r": u"\u0631", # raa' + "z": u"\u0632", # zaay + "s": u"\u0633", # siin + "$": u"\u0634", # shiin + "S": u"\u0635", # Saad + "D": u"\u0636", # Daad + "T": u"\u0637", # Taa' + "Z": u"\u0638", # Zaa' (DHaa') + "E": u"\u0639", # cayn + "g": u"\u063A", # ghayn + "_": u"\u0640", # taTwiil + "f": u"\u0641", # faa' + "q": u"\u0642", # qaaf + "k": u"\u0643", # kaaf + "l": u"\u0644", # laam + "m": u"\u0645", # miim + "n": u"\u0646", # nuun + "h": u"\u0647", # haa' + "w": u"\u0648", # waaw + "Y": u"\u0649", # 'alif maqSuura + "y": u"\u064A", # yaa' + "F": u"\u064B", # fatHatayn + "N": u"\u064C", # Dammatayn + "K": u"\u064D", # kasratayn + "a": u"\u064E", # fatHa + "u": u"\u064F", # Damma + "i": u"\u0650", # kasra + "~": u"\u0651", # shaddah + "o": u"\u0652", # sukuun + "`": u"\u0670", # dagger 'alif + "{": u"\u0671", # waSla + } + + # For a reverse transliteration (Unicode -> Buckwalter), a dictionary + # which is the reverse of the above buck2uni is essential. + + uni2buck = {} + + # Iterate through all the items in the buck2uni dict. + for (key, value) in buck2uni.iteritems(): + # The value from buck2uni becomes a key in uni2buck, and vice + # versa for the keys. + uni2buck[value] = key + + if not reverse: + for k,v in buck2uni.iteritems(): + string = string.replace(v,k) + + else: + for k,v in buck2uni.iteritems(): + string = string.replace(k,v) + + return string + + diff --git a/Madamira-Arapy/arapy/thesaurus.py b/Madamira-Arapy/arapy/thesaurus.py new file mode 100644 index 0000000..b1426db --- /dev/null +++ b/Madamira-Arapy/arapy/thesaurus.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +# coding: utf-8 + +### Purpose: Arabic word thesaurus simulation tools + +from __future__ import absolute_import +from __future__ import print_function + +import arapy.translate as trans +import logging +import sys +import json +import requests +import urllib2 + + +API_KEY = '80901dbb851efc07b4bd747ba3ead0ae' # API key is available from here - http://words.bighugelabs.com/getkey.php +URL_MASK = 'http://words.bighugelabs.com/api/2/{1}/{0}/json' +RELATIONSHIP_ABBR = {'syn':'Synonyms','ant':'Antonyms','rel':'Related terms','sim':'Similar terms','usr':'User suggestions', None:'All'} + +def thesaurus(word, relation=None, ngram=0, ar=False, target_result_count=0): + """ + Uses bighugelabs thesaurus API + requires the API key available http://words.bighugelabs.com/getkey.php + + Translation is done with google translate in the translate module. 
Requires google api key + + Takes in a word and retreives a list of related words where + the relationship is given by one key from {'syn':'Synonyms','ant':'Antonyms','rel':'Related terms','sim':'Similar terms','usr':'User suggestions', None:'All'} + the words are filtered by ngram, 0 for all + if ar = 0, the word is translated before and after from arabic + target_result_count is the number of words to return with + + returns a dictionary where keys are the requested relationships, and values are lists of ngrams matching that relationship + returns empty dictionary if the thesaurus didn't have any results + """ + + gs = None + if ar: + translations = trans.translate_list([word], 'en', 'ar') + if len(translations) > 0: + word = translations[0] + else: + logging.info("Couldn't translate word: "+str(word)+" to english") + return {} + + if not word: + logging.info("Translated word is empty.") + return {} + + # format and make the request + url = URL_MASK.format(urllib2.quote(word.encode('utf-8')), API_KEY) + result = requests.get(url) + + if not result.text: + logging.info("Thesaurus had no info for word:"+word.encode('utf-8')) + return {} + + json_result = json.loads(result.text) + + # our relationship dictionary + words = {} + word_count = 0 + + # for each sense of the word + for pos in json_result: + + # we want only the requested relations + for rel in json_result[pos]: + if relation == None or relation == rel: + + # each word matching the relationship + for w in json_result[pos][rel]: + + candidate = w + + # we only want so many results + if target_result_count == 0 or word_count < target_result_count: + + if ar == 1: + translations = trans.translate_list([candidate],'ar','en') + if len(translations) > 0: + candidate = translations[0] + else: + logging.info("Couldn't translate word: "+str(candidate)+" to arabic") + return {} + + if ngram == 0 or len(candidate.split(" ")) == ngram: + if not rel in words: + words[rel] = [] + words[rel].append(candidate) + word_count+=1 + + else: + # we have enough results + return words + return words + + + diff --git a/Madamira-Arapy/arapy/translate.py b/Madamira-Arapy/arapy/translate.py new file mode 100644 index 0000000..85ca3b1 --- /dev/null +++ b/Madamira-Arapy/arapy/translate.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# coding: utf-8 + +### Purpose: Arabic word translation tools for google translate api: https://cloud.google.com/translate/v2/pricing + +from __future__ import absolute_import +from __future__ import print_function + +import logging +import sys +import json +import requests +import urllib2 + + +GOOGLE_API_KEY = 'AIzaSyAAScZ3-Ut-1sxn5gsSLzxzXJgzn3jGsN4' +URL_MASK = 'https://www.googleapis.com/language/translate/v2?key={0}{1}&source={2}&target={3}' + +def translate_list(words, target='ar', source='en'): + """ + Translates a word with the google translate api + Requires an API key from the google developers console + Target is the language target, ie 'ar' + """ + + # format the words for the url + formatted_words="" + for word in words: + formatted_words += "&q=" + urllib2.quote(word) + + # format the url for the get + url = URL_MASK.format(GOOGLE_API_KEY, formatted_words, source, target) + result = requests.get(url) + + if not result.text: + logging.info("Google responded with no translations, check api key.") + return [] + + json_result = json.loads(result.text) + + if not 'data' in json_result: + logging.info("Google result had no data element, check api key.") + return [] + + # parse the result + translations = [] + for translation in 
json_result['data']['translations']: + trans_word = translation['translatedText'] + logging.info("Translated "+str(word)+ " to "+trans_word.encode('utf-8')) + translations.append(trans_word) + + return translations \ No newline at end of file diff --git a/Madamira-Arapy/arapy/word2vec.py b/Madamira-Arapy/arapy/word2vec.py new file mode 100644 index 0000000..f65898d --- /dev/null +++ b/Madamira-Arapy/arapy/word2vec.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python +# coding: utf-8 + +### Purpose: Arabic word embedding tools + +from __future__ import absolute_import +from __future__ import print_function + +from gensim.models import Word2Vec +from gensim.models.word2vec import LineSentence +import logging +import sys + +def train_embeddings(infile, outfile = "embedding.txt", sg=1, size=100, seed = 0, window=8, min_count=5, + sample=1e-4, hs=0, negative=25, iterations=15): + """ + Saves the model to a file with the parameters in the name. + All of these functions work on any language of corpora + Uses gensim's training parameters: + + Initialize the model from an iterable of `sentences`. Each sentence is a + list of words (unicode strings) that will be used for training. + + The `sentences` iterable can be simply a list, but for larger corpora, + consider an iterable that streams the sentences directly from disk/network. + See :class:`BrownCorpus`, :class:`Text8Corpus` or :class:`LineSentence` in + this module for such examples. + + If you don't supply `sentences`, the model is left uninitialized -- use if + you plan to initialize it in some other way. + + `sg` defines the training algorithm. By default (`sg=1`), skip-gram is used. Otherwise, `cbow` is employed. + + `size` is the dimensionality of the feature vectors. + + `window` is the maximum distance between the current and predicted word within a sentence. + + `alpha` is the initial learning rate (will linearly drop to zero as training progresses). + + `seed` = for the random number generator. Initial vectors for each + word are seeded with a hash of the concatenation of word + str(seed). + + `min_count` = ignore all words with total frequency lower than this. + + `sample` = threshold for configuring which higher-frequency words are randomly downsampled; + default is 0 (off), useful value is 1e-5. + + `workers` = use this many worker threads to train the model (=faster training with multicore machines). + + `hs` = if 1 (default), hierarchical sampling will be used for model training (else set to 0). + + `negative` = if > 0, negative sampling will be used, the int for negative + specifies how many "noise words" should be drawn (usually between 5-20). + + `cbow_mean` = if 0 (default), use the sum of the context word vectors. If 1, use the mean. + Only applies when cbow is used. + + `hashfxn` = hash function to use to randomly initialize weights, for increased + training reproducibility. Default is Python's rudimentary built in hash function. + + `iter` = number of iterations (epochs) over the corpus. 
+ """ + # set up logging + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO) + + # files are iterated over with this object + # class MySentences(object): + # def __init__(self, fname): + # self.fname = fname + # self.errors = 0 + + # def __iter__(self): + # for line in open(self.fname): + # yield line.split() + + # sentences = MySentences(infile) + sentences = LineSentence(infile) + + + model = Word2Vec(sentences, + sg = sg, + size = size, + window = window, + min_count = min_count, + hs = hs, + workers = 4, + sample = sample, + seed = seed, + negative = negative, + iter = iterations) + + model.save_word2vec_format(outfile, binary = True) + + return outfile + +def start_interactive_test_suite(): + """ + Loads a model, then allows interactive tests of: + ac - not interactive, rather loads an analogy file and outputs the results + one word most similar queries + two word similarity measures + three word analogy queries + four+ word odd one out queries + """ + + output_spacing = 25 + + modelfile = raw_input('Please enter the binary model file path: ')# (or gn/en/ar): ') + modelfile = modelfile.strip().strip('\'') + + # if modelfile == 'gn': + # modelfile = '/Users/king96/Documents/Word2Vec/Models/google_news_vecs.bin' + # elif modelfile == 'ar': + # modelfile = '/Users/king96/Documents/Word2Vec/Models/ar_wiki_seg_vecs.bin' + # elif modelfile == 'en': + # modelfile = '/Users/king96/Documents/Word2Vec/Models/en_wiki_vecs.bin' + + # set up logging + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', + level=logging.INFO) + + # load model + model = Word2Vec.load_word2vec_format(modelfile, binary=True) + + while True: + + # offer the menu + print('\n') + print('Type ac to run accuracy tests.') + print('Enter one word for neighbors, two for distance,') + print('three for analogy, more for matching, q to quit.') + words = raw_input('Word: ') + + words = words.decode('UTF-8', 'replace') + + if words == 'q': + break + + if words == 'ac': + print('Please enter the questions file to test on:') + + questions = raw_input('File: ').strip() + + model.accuracy(questions, restrict_vocab = 30000, tries = 5) + continue + + # the remaining options take 0 < n query words + words = words.split(' ') + + if len(words) == 0: + continue + + # top 10 words + elif len(words) == 1: + try: + candidates = model.most_similar(words[0], topn=10) + print('Candidates'.rjust(output_spacing), 'Cos Distance'.rjust(output_spacing)) + for word in candidates: + print(str(word[0].encode('UTF-8','replace')).rjust(output_spacing), + str(word[1]).rjust(output_spacing)) + except KeyError as ke: + print(ke.message.encode('utf-8','replace')) + + + # pair similarity + elif len(words) == 2: + try: + print('Similarity is : ' + str(model.similarity(words[0],words[1]))) + except KeyError as ke: + print(ke.message.encode('utf-8','replace')) + + # analogy + elif len(words) == 3: + try: + candidates = model.most_similar(positive=[words[2], words[1]], + negative = [words[0]], + topn=10) + + print('Candidates'.rjust(output_spacing), 'Cos Distance'.rjust(output_spacing)) + for word in candidates: + print(str(word[0].encode('UTF-8', 'replace')).rjust(output_spacing), + str(word[1]).rjust(output_spacing)) + except KeyError as ke: + print(ke.message.encode('utf-8','replace')) + + # odd one out + else: + try: + print('Odd one out: ' + str(model.doesnt_match(words).encode('utf-8', 'replace'))) + except KeyError as ke: + print(ke.message.encode('utf-8','replace')) + +def start_query_expander(): 
+    modelfile = raw_input('Please enter the binary model file path: ')
+    modelfile = modelfile.strip()
+
+    # if modelfile == 'gn':
+    #     modelfile = '/Users/king96/Documents/Word2Vec/Models/google_news_vecs.bin'
+
+    # set up logging
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
+                        level=logging.INFO)
+
+    # load model
+    model = Word2Vec.load_word2vec_format(modelfile, binary=True)
+
+    while True:
+
+        words = raw_input('\nEnter words to expand, q to quit: ')
+
+        words = words.decode('UTF-8', 'replace')
+
+        if words == 'q':
+            break
+
+        words = words.split(' ')
+
+        if len(words) == 0:
+            continue
+
+        # top 10 words
+        else:
+            expansion = set()
+
+            for word in words:
+                try:
+                    expansion = expansion | set([x[0] for x in model.most_similar(word, topn=10)])
+                except KeyError as ke:
+                    print(ke.message.encode('utf-8','replace'))
+
+            print('Expansion')
+            for word in expansion:
+                print(str(word.encode('UTF-8','replace')))
\ No newline at end of file
diff --git a/Madamira-Arapy/test_madamira.py b/Madamira-Arapy/test_madamira.py
new file mode 100644
index 0000000..8981ed3
--- /dev/null
+++ b/Madamira-Arapy/test_madamira.py
@@ -0,0 +1,17 @@
+#%%
+from arapy.madamira import Madamira
+
+#%%
+text = "ما هي صفات السبعين ألفا الذين يدخلون الجنة بغير حساب"
+
+with Madamira() as m:
+    out = m.process([text])
+
+# %%
+for doc in out.docs():
+    for sent in doc.sentences():
+        for word in sent.words():
+            print(word.get_orig_word(), ": ", word.pos(), "--", word.get_attribute('gen'), "--", word.get_attribute('per'))
+
+
+# %%
diff --git a/README.md b/README.md
index 5d67b67..4b65523 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,6 @@
 # misc
 misc code and stuff
+
+`pdf_renamer`: automatically extracts the title from a PDF file (with a special case for files from the ACL Anthology)
+
+`Madamira-Arapy`: forked from https://github.com/jordanking/arapy; adds compatibility with Python 3.6+ and new functions that return morphological attributes (only the Madamira files were updated)
diff --git a/pdf_renamer/monitor.ps1 b/pdf_renamer/monitor.ps1
index 07993a8..1ff965c 100644
--- a/pdf_renamer/monitor.ps1
+++ b/pdf_renamer/monitor.ps1
@@ -1,7 +1,7 @@
 ### SET FOLDER TO WATCH + FILES TO WATCH + SUBFOLDERS YES/NO
 $watcher = New-Object System.IO.FileSystemWatcher
 $watcher.Path = "C:\Users\WISSAM-PC\Downloads\Documents\"
-$watcher.Filter = "*.*"
+$watcher.Filter = "*.pdf"
 $watcher.IncludeSubdirectories = $true
 $watcher.EnableRaisingEvents = $true
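
Taken together, the modules added in this diff form a pipeline: arwiki.py extracts sentences from a dump, normalization.py cleans them, madamira.py lemmatizes them through a local MADAMIRA server, and word2vec.py trains embeddings with the settings recommended in ResultsSummary.txt (CBOW, window=5, dim=200, negative=25, sample=1e-4, 15+ iterations on lemmatized words). The sketch below, which is not part of the diff, shows how the last two steps are intended to chain together. The file paths are placeholders; it assumes a MADAMIRA release unpacked under arapy/resources/ (as Madamira.start_server expects) plus the legacy gensim API that word2vec.py targets, and some of the I/O in these modules still follows Python 2 conventions that may need adapting.

```python
# Hypothetical usage sketch for the modules added in this diff; paths are placeholders.
from arapy.madamira import transform_sentence_file
from arapy.word2vec import train_embeddings

# 1) Lemmatize a file with one Arabic sentence per line via the local MADAMIRA
#    server (started and stopped by the Madamira context manager inside
#    transform_sentence_file).
lemma_file, pos_file, token_file = transform_sentence_file(
    "arwiki_sentences.txt",          # hypothetical input file
    lemmaout="arwiki_lemmas.txt",
    posout="arwiki_pos.txt",
    tokenout="arwiki_tokens.txt",
    lemmas=True, pos=True, tokens=True)

# 2) Train embeddings on the lemmas with the ResultsSummary.txt parameters:
#    CBOW (sg=0), 200 dimensions, window 5, negative=25, sample=1e-4, 15 iterations.
train_embeddings(lemma_file,
                 outfile="arwiki_cbow_d200.bin",
                 sg=0, size=200, window=5, min_count=5,
                 sample=1e-4, hs=0, negative=25, iterations=15)
```

In gensim, sg=0 selects CBOW, which is the configuration ResultsSummary.txt recommends over skip-gram for this data.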