diff --git a/.gitignore b/.gitignore
index eb62ac7..33d89ef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,9 @@
 /XML
 /CSV
 *.csv
+/model/.R*
+/embeddings/embeddings
+/embeddings/.env
+/embeddings/eval/questions-words.txt
+/data/
+/sentiment/*.csv
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..5e320a5
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 UU Centre for Digital Humanities - Research Software Lab
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index 02ca6b4..e0f612d 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,30 @@
-# Reader-responses-to-translated-literature
+# Reader responses to translated literature
-This repo contains the work we do in the Research-IT project with Haidee Kotze and Gys-Walt van Egdom.
+This repository contains code for the [DIOPTRA-L](https://cdh.uu.nl/portfolio/digital-opinions-on-translated-literature-dioptra-l-2/) project by Haidee Kotze, Gys-Walt van Egdom, Corina Koolen and Utrecht University's Research Software Lab. It can be used to reproduce the following publication:
+Kotze, Haidee; Janssen, Berit; Koolen, Corina; Plas, Luka & Egdom, Gys-Walt van (2021). Norms, affect and evaluation in the reception of literary translations in multilingual online reading communities: Deriving cognitive-evaluative templates from big data. *Translation, Cognition & Behavior* 4. https://doi.org/10.1075/tcb.00060.kot
+
+## Prerequisites
+### Python
+Most of the scripts require Python 3.6. To install dependencies, run
+`pip install -r requirements.txt`
+
+### R
+The statistical analysis and visualization were performed in R, using the following libraries:
+- coin
+- dplyr
+- ggplot2
+- Hmisc
+- irr
+- lme4
+- reshape2
+- rstatix
+
+## Steps to reproduce
+1. scrapers: Python scripts used to scrape reviews from Goodreads. See that folder's README for usage.
+2. preprocessing: Python scripts used to clean the data, specifically for tokenization.
+3. embeddings: Python scripts and a Jupyter notebook for training, evaluating and visualizing word2vec embeddings. As the dataset is relatively small, the resulting embeddings were not informative enough for further research.
+4. analysis: Python scripts to collect and count translation lemmas, based on human annotations.
+5. collocations: Python scripts for finding collocations surrounding translation lemmas.
+6. sentiment: Python scripts to count positive/negative and hedge terms in collocations.
+7. model: R scripts used to generate statistics and visualizations of the data.
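+
+For example, to rebuild the two tables consumed by the R scripts (a sketch, assuming the commands are run from the repository root, the I-Analyzer export is saved as `data/goodreads.csv`, and the annotated word list as `data/word_classes.csv`, as expected by the scripts):
+
+```bash
+python -m preprocessing.tokenise_data
+python -m analysis.write_table
+python -m analysis.write_review_table
+```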
-For now, it contains a GoodReads scraper, in the 'scrapers' folder. See the README over there if you want to do some scraping.
diff --git a/analysis/__init__.py b/analysis/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/analysis/count_data.py b/analysis/count_data.py
new file mode 100644
index 0000000..75df477
--- /dev/null
+++ b/analysis/count_data.py
@@ -0,0 +1,130 @@
+import csv
+import re
+import pandas as pd
+from typing import List, Dict
+from collections import defaultdict
+from collocations.patterns import LANGUAGES_PATTERNS
+
+WORDS_PATH = './data/word_classes.csv'
+INPUT_FACTORS = ['original_language', 'edition_language',
+                 'book_title', 'language', 'age_category', 'book_genre', 'rating_no']
+WINDOW_SIZE = 4
+
+
+def read_categories(words_path):
+    with open(words_path) as words_file:
+        reader = csv.DictReader(words_file)
+        # unannotated words fall back to the category 'other'
+        word_categories = {language: defaultdict(
+            lambda: 'other') for language in LANGUAGES_PATTERNS}
+
+        for row in reader:
+            language = row['language']
+            word = row['word']
+            cat = row['category']
+            word_categories[language][word] = cat
+
+    categories_generator = (
+        value for lang_data in word_categories.values() for value in lang_data.values())
+    # include the fallback category, so output_count never looks up a
+    # category that is missing from the counts
+    categories = sorted(set(categories_generator) | {'other'})
+
+    return word_categories, categories
+
+
+def output_count(words: List[str], word_categories: Dict, categories: List):
+    counts = {cat: 0 for cat in categories}
+
+    for word in words:
+        cat = word_categories[word]
+        counts[cat] += 1
+
+    return counts
+
+
+def count_data(reviews_path):
+    '''Create a table with one row for each mention of "translation". Includes
+    some info about the review and the categories of words in the context window.'''
+    # import word categories
+    word_categories, categories = read_categories(WORDS_PATH)
+
+    # import reviews
+    with open(reviews_path) as reviews_file:
+        reader = csv.DictReader(reviews_file)
+
+        all_data = []
+
+        for row in reader:
+            text = row['tokenised_text']
+            language = row['language'].lower()
+            if language in LANGUAGES_PATTERNS:
+                pattern = LANGUAGES_PATTERNS[language]
+                words = text.split()
+                input_data = {factor: row[factor] for factor in INPUT_FACTORS}
+
+                for i, word in enumerate(words):
+                    if re.search(pattern, word):
+                        preceding = [words[j]
+                                     for j in range(i - WINDOW_SIZE, i) if j >= 0]
+                        following = [words[j] for j in range(
+                            i + 1, i + 1 + WINDOW_SIZE) if j < len(words)]
+                        window = preceding + following
+                        output_data = output_count(
+                            window, word_categories[language], categories)
+
+                        data = {**input_data, **output_data}
+                        all_data.append(data)
+
+    df = pd.DataFrame(all_data, columns=INPUT_FACTORS + categories)
+    return df
+
+
+def mentions_translation(text, language):
+    if language in LANGUAGES_PATTERNS:
+        pattern = LANGUAGES_PATTERNS[language]
+        words = text.split()
+        return str(int(any(re.search(pattern, word) for word in words)))
+    else:
+        return None
+
+
+def count_translation_mentions(text, language):
+    if language in LANGUAGES_PATTERNS:
+        pattern = LANGUAGES_PATTERNS[language]
+        words = text.split()
+        return str(sum(1 for word in words if re.search(pattern, word)))
+    else:
+        return None
+
+
+def count_data_per_review(reviews_path):
+    '''Create table with one row for each review. Similar to the
+    I-Analyzer output, but with some extra info. We also ignore some
+    columns like the full text.'''
+
+    with open(reviews_path) as reviews_file:
+        reader = csv.DictReader(reviews_file)
+
+        def review_data(row):
+            input_data = {factor: row[factor] for factor in INPUT_FACTORS}
+            is_translated = int(row['original_language']
+                                != row['edition_language'])
+            mentions = mentions_translation(
+                row['tokenised_text'], row['language'].lower())
+            mention_count = count_translation_mentions(
+                row['tokenised_text'], row['language'].lower())
+            words = len(row['text'].split())
+            cleaned_words = len(row['tokenised_text'].split())
+            data = {
+                **input_data,
+                'is_translated': is_translated,
+                'mentions_translation': mentions,
+                'mention_count': mention_count,
+                'words': words,
+                'cleaned_words': cleaned_words
+            }
+            return data
+
+        all_data = [review_data(row) for row in reader]
+
+    df = pd.DataFrame(all_data, columns=INPUT_FACTORS +
+                      ['is_translated', 'mentions_translation', 'mention_count', 'words', 'cleaned_words'])
+    return df
diff --git a/analysis/write_review_table.py b/analysis/write_review_table.py
new file mode 100644
index 0000000..4ed5b36
--- /dev/null
+++ b/analysis/write_review_table.py
@@ -0,0 +1,10 @@
+# write table with 1 row per review
+
+from analysis.count_data import count_data_per_review
+
+reviews_path = './data/goodreads_tokenised.csv'
+export_path = './data/goodreads_review_data.csv'
+
+data = count_data_per_review(reviews_path)
+data.to_csv(export_path, na_rep='NA')
\ No newline at end of file
diff --git a/analysis/write_table.py b/analysis/write_table.py
new file mode 100644
index 0000000..4047430
--- /dev/null
+++ b/analysis/write_table.py
@@ -0,0 +1,10 @@
+# write table with 1 row per mention of translation
+
+from analysis.count_data import count_data
+
+reviews_path = './data/goodreads_tokenised.csv'
+export_path = './data/goodreads_formatted.csv'
+
+data = count_data(reviews_path)
+data.to_csv(export_path)
\ No newline at end of file
diff --git a/collocations/__init__.py b/collocations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/collocations/__main__.py b/collocations/__main__.py
new file mode 100644
index 0000000..d356dce
--- /dev/null
+++ b/collocations/__main__.py
@@ -0,0 +1,6 @@
+from collocations.collocations import collocate
+from collocations.patterns import LANGUAGES_PATTERNS
+
+for language, pattern in LANGUAGES_PATTERNS.items():
+    if pattern:
+        collocate(language, pattern)
\ No newline at end of file
diff --git a/collocations/collocations.py b/collocations/collocations.py
new file mode 100644
index 0000000..da7cc2f
--- /dev/null
+++ b/collocations/collocations.py
@@ -0,0 +1,70 @@
+import csv
+import re
+from math import log
+
+def collocate(language, target_pattern, reviews_path = './data/goodreads_tokenised.csv', out_name = 'collocations', out_dir = './data'):
+    '''Write the top collocates of the target pattern for one language to a text file.'''
+
+    out_path = '{}/{}_{}.txt'.format(out_dir, out_name, language)
+
+    # import reviews
+
+    with open(reviews_path) as csvfile:
+        reader = csv.DictReader(csvfile)
+        reviews_text = (row["tokenised_text"] for row in reader if row["language"].lower() == language)
+        reviews = [review.split() for review in reviews_text]
+
+    # frequencies
+
+    vocab = set(word for review in reviews for word in review)
+
+    def count_words():
+        counts_general = {word: 0 for word in vocab}
+        counts_translat = {word: 0 for word in vocab}
+        window = 4
+
+        for review in reviews:
+            for i, word in enumerate(review):
+                counts_general[word] += 1
+
+                if re.search(target_pattern, word):
+                    preceding = [review[j] for j in range(i - window, i) if j >= 0]
+                    following = [review[j] for j in range(i + 1, i + 1 + window) if j < len(review)]
+
+                    for neighbour in preceding + following:
+                        counts_translat[neighbour] += 1
+
+        return counts_translat, counts_general
+
+    def filter_counts(target, general):
+        # drop hapaxes: keep only words that occur more than once overall
+        filtered_vocab = set(word for word, count in general.items() if count > 1)
+        filtered_target = {word: count for word, count in target.items() if word in filtered_vocab}
+        filtered_general = {word: count for word, count in general.items() if word in filtered_vocab}
+
+        return filtered_target, filtered_general
+
+    def relative_counts(counts):
+        total = sum(counts.values())
+        return {word: count / total for word, count in counts.items()}
+
+    def counts_log(counts):
+        return {word: log(count + 1) for word, count in counts.items()}
+
+    def relative_frequencies(target, general):
+        # log-ratio of a word's frequency near the target pattern to its
+        # frequency in the reviews overall
+        target_log = counts_log(relative_counts(target))
+        general_log = counts_log(relative_counts(general))
+
+        return {word: target_log[word] - general_log[word] for word in target}
+
+    counts_translat, counts_general = filter_counts(*count_words())
+    rel_freq = relative_frequencies(counts_translat, counts_general)
+
+    def sort_by_frequency(counts):
+        return sorted(counts, key = lambda w : counts[w], reverse = True)
+
+    ranking = sort_by_frequency(rel_freq)
+
+    #export the 100 highest-ranked collocates (or all of them, if fewer)
+    with open(out_path, 'w') as outfile:
+        for word in ranking[:100]:
+            outfile.write(word)
+            outfile.write('\n')
diff --git a/collocations/patterns.py b/collocations/patterns.py
new file mode 100644
index 0000000..44bbcaa
--- /dev/null
+++ b/collocations/patterns.py
@@ -0,0 +1,9 @@
+LANGUAGES_PATTERNS = {
+    'dutch' : r'^vertaa?l',
+    'english' : r'^translat',
+    'french' : r'^tradu',
+    'german' : r'[uü]bersetz',
+    'italian' : r'^tradu',
+    'portuguese' : r'^tradu',
+    'spanish' : r'^tradu',
+}
\ No newline at end of file
diff --git a/collocations/test_patterns.py b/collocations/test_patterns.py
new file mode 100644
index 0000000..5511894
--- /dev/null
+++ b/collocations/test_patterns.py
@@ -0,0 +1,26 @@
+# print the set of words that match the pattern for each language
+
+import csv
+import re
+from collocations.patterns import LANGUAGES_PATTERNS
+
+reviews_path = './data/goodreads_tokenised.csv'
+
+matching_words = {language: set() for language in LANGUAGES_PATTERNS}
+
+with open(reviews_path) as csvfile:
+    reader = csv.DictReader(csvfile)
+    for row in reader:
+        language = row['language'].lower()
+        # skip reviews in languages we have no pattern for
+        if language not in LANGUAGES_PATTERNS:
+            continue
+        words = row['tokenised_text'].split()
+
+        is_match = lambda word: re.search(LANGUAGES_PATTERNS[language], word)
+        matches = filter(is_match, words)
+        for match in matches:
+            matching_words[language].add(match)
+
+for language in matching_words:
+    print(language.upper())
+    print()
+    print(matching_words[language])
+    print()
\ No newline at end of file
diff --git a/embeddings/eval/__init__.py b/embeddings/eval/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/embeddings/eval/__main__.py b/embeddings/eval/__main__.py
new file mode 100644
index 0000000..f23bbec
--- /dev/null
+++ b/embeddings/eval/__main__.py
@@ -0,0 +1,25 @@
+from gensim import models
+
+EMBEDDINGS_PATH = "embeddings/with_pretraining"
+
+model = models.Word2Vec.load(EMBEDDINGS_PATH)
+
+#automatic evaluation on the analogy dataset
+print('QUESTION ANSWER DATASET')
+accuracy = model.accuracy("eval/questions-words.txt")
+
+for section in accuracy:
+    name = section['section']
+    correct = len(section['correct'])
+    incorrect = len(section['incorrect'])
+    print('{}: {} out of {}'.format(name, correct, correct + incorrect))
+
+
+#manual inspection
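+# (the loop below prints, for each probe word, its nearest neighbours by
+# cosine similarity and reports the top three; informative embeddings
+# should rank semantically related words first)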
+print()
+words = ['book', 'author', 'genre', 'boring', 'recommend', 'translation']
+
+print('WORD SIMILARITIES')
+for word in words:
+    neighbours = [neighbour for neighbour, score in model.wv.most_similar(word)]
+    print("'{}' is most similar to '{}', '{}', or '{}'".format(word, neighbours[0], neighbours[1], neighbours[2]))
diff --git a/embeddings/readme.md b/embeddings/readme.md
new file mode 100644
index 0000000..31b511e
--- /dev/null
+++ b/embeddings/readme.md
@@ -0,0 +1,32 @@
+English embeddings for Goodreads reviews.
+
+Setup for a Python 3.6 environment:
+```bash
+pip install gensim==3.8
+pip install nltk
+python -m train.download
+```
+The train.download module downloads the Brown corpus for pretraining and a question-answer dataset for evaluation. The reviews have to be downloaded manually from I-Analyzer.
+
+Embeddings can be trained from the command line:
+```bash
+python -m train
+```
+There are no command line options, so edit `train/__main__.py` to configure hyperparameters.
+
+To evaluate:
+```bash
+python -m eval
+```
+Check that the code in `eval/__main__.py` uses the right embeddings.
+
+
+For the visualisation of results, set up with
+```bash
+pip install jupyter
+pip install matplotlib
+pip install sklearn
+```
+
+Open `visualise/plot.ipynb` as a Jupyter notebook.
\ No newline at end of file
diff --git a/embeddings/train/__init__.py b/embeddings/train/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/embeddings/train/__main__.py b/embeddings/train/__main__.py
new file mode 100644
index 0000000..1a78589
--- /dev/null
+++ b/embeddings/train/__main__.py
@@ -0,0 +1,43 @@
+import os
+from train.corpus_reader import ReviewCorpus, BrownCorpus
+from train.train_embeddings import train
+from gensim import models
+
+CORPUS_PATH = "../data/reviews_english.csv" #corpus location
+DST_DIR = "embeddings" #directory for embeddings
+FILENAME = "with_pretraining" #name of embeddings file
+
+PRETRAIN = True #pretrain on Brown corpus
+SPLIT_SENTS = True #split reviews into sentences
+WINDOW = 5 #window size for words
+
+#initialise corpus
+print("Initialising corpus...")
+sentences = ReviewCorpus(CORPUS_PATH, split_sentences=SPLIT_SENTS)
+
+#train embeddings
+if PRETRAIN:
+    pretrained_name = 'pretrained'
+    if WINDOW != 5:
+        pretrained_name += '_w{}'.format(WINDOW)
+
+    if pretrained_name not in os.listdir(DST_DIR):
+        print('No pretrained embeddings found. Pretraining...')
+        pretrain_sentences = BrownCorpus()
+        pretrained = train(pretrain_sentences, window=WINDOW)
+        pretrained.save(os.path.join(DST_DIR, pretrained_name))
+    else:
+        print('Found pretrained embeddings.')
+        pretrained = models.Word2Vec.load(os.path.join(DST_DIR, pretrained_name))
+
+    print('Training model...')
+    model = train(sentences, pretrained = pretrained)
+else:
+    print('Training model...')
+    model = train(sentences, window=WINDOW)
+
+#export
+out_path = os.path.join(DST_DIR, FILENAME)
+
+print("Saving model to '{}'...".format(out_path))
+model.save(out_path)
\ No newline at end of file
diff --git a/embeddings/train/corpus_reader.py b/embeddings/train/corpus_reader.py
new file mode 100644
index 0000000..3a68e3d
--- /dev/null
+++ b/embeddings/train/corpus_reader.py
@@ -0,0 +1,55 @@
+import csv
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.corpus import brown
+import re
+
+def token(word):
+    '''Normalise a word: drop numbers and pure punctuation, lowercase the rest.'''
+    if re.match(r'^\d+$', word):
+        return ''
+    if not re.search(r'\w', word):
+        return ''
+    return word.lower()
+
+class ReviewCorpus():
+    def __init__(self, path, split_sentences = True):
+        self.split_sents = split_sentences
+        with open(path) as csvfile:
+            reader = csv.DictReader(csvfile)
+            reviews = [row["text"] for row in reader]
+
+        self.reviews = [self.preprocess(review) for review in reviews]
+
+    def __len__(self):
+        if self.split_sents:
+            return sum(len(review) for review in self.reviews)
+        else:
+            return len(self.reviews)
+
+    def __iter__(self):
+        for review in self.reviews:
+            if self.split_sents:
+                for sent in review:
+                    yield sent
+            else:
+                yield review
+
+    def preprocess(self, review):
+        def tokens(string):
+            # filter out the empty strings that token() returns for numbers
+            # and punctuation, so they do not enter the vocabulary
+            return [t for t in (token(word) for word in word_tokenize(string)) if t]
+
+        if self.split_sents:
+            sents = sent_tokenize(review)
+            return [tokens(sent) for sent in sents]
+        else:
+            return tokens(review)
+
+class BrownCorpus():
+    def __iter__(self):
+        for sent in brown.sents():
+            words = [token(word) for word in sent]
+            yield [word for word in words if word]
+
diff --git a/embeddings/train/download.py b/embeddings/train/download.py
new file mode 100644
index 0000000..dbcd16d
--- /dev/null
+++ b/embeddings/train/download.py
@@ -0,0 +1,14 @@
+import nltk
+import requests
+
+#download brown corpus
+nltk.download('brown')
+
+#download question answer dataset
+url = 'https://raw.githubusercontent.com/RaRe-Technologies/gensim/develop/docs/notebooks/datasets/questions-words.txt'
+r = requests.get(url)
+with open('eval/questions-words.txt', 'wb') as file:
+    file.write(r.content)
\ No newline at end of file
diff --git a/embeddings/train/train_embeddings.py b/embeddings/train/train_embeddings.py
new file mode 100644
index 0000000..62d7b1d
--- /dev/null
+++ b/embeddings/train/train_embeddings.py
@@ -0,0 +1,17 @@
+from gensim import models
+
+def train(sentences, pretrained = None, window = 5):
+    if pretrained:
+        #continue training the pretrained model on the new sentences
+        pretrained.build_vocab(sentences, update=True)
+        pretrained.train(
+            sentences,
+            total_examples=len(sentences), epochs=pretrained.iter
+        )
+        return pretrained
+    else:
+        model = models.Word2Vec(
+            sentences = sentences,
+            size=100, window=window
+        )
+        return model
\ No newline at end of file
diff --git a/embeddings/visualise/plot.ipynb b/embeddings/visualise/plot.ipynb
new file mode 100644
index 0000000..f1ab7b5
--- /dev/null
+++ b/embeddings/visualise/plot.ipynb
@@ -0,0 +1,156 @@
+{
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "Python 3.6.9 64-bit ('.env': venv)", + "display_name": "Python 3.6.9 64-bit ('.env': venv)", + "metadata": { + "interpreter": { + "hash": "a9fae5ed66804ddaf38242a1dfb5371fa275897ea3faaaa95a90d47a5a4c7d75" + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from gensim import models\n", + "from sklearn.manifold import TSNE\n", + "\n", + "EMBEDDINGS_PATH = \"../embeddings/with_pretrained_per_review_w15\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "model = models.Word2Vec.load(EMBEDDINGS_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def neighbours(word, n=10):\n", + " return [w for w, score in model.wv.most_similar(word, topn=n)]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def reduce_dimensions(words, vectors):\n", + " tsne = TSNE(n_components = 2)\n", + " reduced = tsne.fit_transform(vectors)\n", + " x_values = [row[0] for row in reduced]\n", + " y_values = [row[1] for row in reduced]\n", + " return x_values, y_values\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "target ='translation'\n", + "nwords = 15\n", + "words = [target] + neighbours(target, nwords)\n", + "w2i = {word: index for index, word in enumerate(words)}\n", + "vectors = [model.wv[word] for word in words]\n", + "x_values, y_values = reduce_dimensions(words, vectors)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 6 + }, + { + "output_type": "display_data", + "data": { + "text/plain": "
", + "image/svg+xml": "\n\n\n\n \n \n \n \n 2020-11-11T10:42:47.240049\n image/svg+xml\n \n \n Matplotlib v3.3.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZ4AAAD4CAYAAADcpoD8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAA8VElEQVR4nO3deVxVdf7A/9dHQgQpl6TGsFIbU9l3WdLUFjQNyTRbxsBqbNGxaX7hYE5qar+cr/5abCrHJrXSEk0zLWdMM8pMcwXFLURxEB1zJRdUwPfvj3u4XRDcgHsB38/H4z4493POPed9rsj7nuW+30ZEUEoppZylgasDUEopdXXRxKOUUsqpNPEopZRyKk08SimlnEoTj1JKKae6xtUBXIoWLVpI69atXR2GUkrVKevXrz8kIj6ujqO8OpF4Wrduzbp161wdhlJK1SnGmD2ujqEieqpNKaWUU2niUUop5VSaeJQqp7i42NUhKFWv1YlrPEpdiXHjxjFz5kx8fHy4+eabCQ8P54EHHmDIkCEcPHgQLy8v3n//fTp06EBycjKNGjVi48aNxMXFceTIETw9Pdm4cSO//PIL06ZN46OPPmLVqlV06tSJGTNmAPDss8+ydu1aCgsL6devH6+88gpguy6ZlJTEokWLKCoqYu7cuXTo0MGF74ZStYce8ah6ae3atcybN4/MzEz+/e9/229OGTx4MG+//Tbr169n0qRJPPfcc/bX7N27lx9//JHXX38dgKNHj7Jq1SreeOMNEhISeOGFF9iyZQubN28mIyMDgFdffZV169axadMmvvvuOzZt2mRfX4sWLdiwYQPPPvsskyZNct7OK1XL6RGPqjcWbMxn4pId7DtWCFmLiYrqRqNGjWjUqBH3338/p0+f5scff6R///7215w5c8Y+3b9/f9zc3OzP77//fowxBAYGcuONNxIYGAiAv78/ubm5hISEMGfOHKZOnUpxcTH79+9n69atBAUFAdC3b18AwsPDmT9/vjPeAqXqBE08ql5YsDGfEfM3U1hUAsCvhUV8s/0YCzbmkxjqC8C5c+do2rSp/WilvMaNG5d57uHhAUCDBg3s06XPi4uL2b17N5MmTWLt2rU0a9aM5ORkTp8+fd7r3dzc9LqRUg70VJuqFyYu2WFPOgAerTpy/Oef+PuXmzlx4gRffvklXl5etGnThrlz5wIgImRmZl7xNn/99VcaN25MkyZNOHDgAP/+97+rvB9KXQ008ah6Yd+xwjLPPVrejufvo1j3xpP07NmTwMBAmjRpwqxZs/jggw8IDg7G39+fL7744oq3GRwcTGhoKB06dODRRx8lLi6uqruh1FXB1IVGcBEREaKVC9SFxE1YTn655HPubCE339CcpcOi6dKlC1OnTiUsLMxFESrlfMaY9SIS4eo4ytMjHlUvpMS3x9PdrcxYwdfvsG/6nwgLC+PBBx/UpKNULaE3F6h6ofQGgtK72m5q6smbH8+0jyulag9NPKreSAz11USjVB2gp9qUUko5lSYepZRSTqWJRylV7Y4dO8a7775bo9tIT0+nd+/eF1wmIyODxYsX12gc6vJVOfEYYxoZY9YYYzKNMVuMMa9Y422MMT8ZY3YaY9KMMQ2tcQ/r+U5rfuuqxqCUql0qSzzOruBwJYlHq0zUvOo44jkDdBeRYCAE6GGMiQb+DrwhIr8HjgJPWss/CRy1xt+wllNK1SOpqank5OQQEhJCZGQknTt3JiEhAT8/PwASExMJDw/H39+fqVOn2l/n7e3NyJEjCQ4OJjo6mgMH
DgAwd+5cAgICCA4OpkuXLudtb82aNcTExBAaGkpsbCw7duzg7NmzjBo1irS0NEJCQkhLS+PIkSMkJiYSFBREdHS0vajrmDFjGDhwIHFxcQwcONAJ79BVTkSq7QF4ARuATsAh4BprPAZYYk0vAWKs6Wus5cyF1hseHi5Kqbpj9+7d4u/vLyIi3377rXh5ecmuXbvs8w8fPiwiIqdOnRJ/f385dOiQiIgAsnDhQhERSUlJkXHjxomISEBAgOzdu1dERI4ePWpfb69evUREpKCgQIqKikREZOnSpdK3b18REZk+fboMGTLEvt2hQ4fKmDFjRETkm2++keDgYBERGT16tISFhcmpU6eq941wMWCdVOPf+Op6VMvt1MYYN2A98HvgHSAHOCYipcese4HS+1x9gTwr6RUbYwqA660E5LjOwcBggFtuuaU6wlRK1bDSCuF79uRy5NBJFmzMpykQFRVFmzZt7MtNnjyZzz//HIC8vDyys7O5/vrradiwof26TXh4OEuXLgUgLi6O5ORkHnroIXvVb0cFBQUkJSWRnZ2NMYaioqIK4/vhhx+YN28eAN27d+fw4cP8+uuvACQkJODp6Vldb4W6gGq5uUBESkQkBGgFRAFV7nglIlNFJEJEInx8fKq6OqVUDSutEF5auqi45Bwj5m/mh+yDZSp/p6ens2zZMlatWkVmZiahoaH2qt7u7u4YY4CyVb2nTJnC+PHjycvLIzw8nMOHD5fZ9ssvv0y3bt3Iyspi0aJFZaqEX6ry1clVzanWu9pE5BjwLbZTa02NMaVHVK2AfGs6H7gZwJrfBCj7W6SUqnMcK4Sbhp6cO1tIYVEJs9fmlVmuoKCAZs2a4eXlxfbt21m9evVF152Tk0OnTp0YO3YsPj4+5OWdv05fX9tJldLusADXXnstx48ftz/v3Lkzs2bNAmwJsEWLFlx33XVXtL/qylXHXW0+xpim1rQncA+wDVsC6mctlgSUlgFeaD3Hmr/cOheplKrDHCuEu3leh4evH/s+eI7sRVPKLNejRw+Ki4vp2LEjqampREdHX3TdKSkpBAYGEhAQQGxsLMHBwWXmDx8+nBEjRhAaGlrmrrRu3bqxdetW+80FY8aMYf369QQFBZGamsqHH35Yxb1WV6LK1amNMUHAh4AbtkQ2R0TGGmPaArOB5sBG4A8icsYY0wj4GAgFjgAPi8iuC21Dq1MrVftVVCEcwLepJytTu7sgIlVbq1NX+eYCEdmELYmUH9+F7XpP+fHTQP/y40qpui0lvn2ZLrAAnu5upMS3d2FUqjbSIqFKqWpRUYXwlPj2WrhVnUcTj1Kq2miFcHUptFabUkopp9LEo5RSyqk08SillHIqTTxKKaWcShOPUkopp9LEo5RSyqk08SillHIqTTxKKaWcShOPUkopp9LEU8tV1ru+OqWnp9ubb1WmfO/6hQsXMmHChBqNSylVP2niqeUqSzyOpd+doXziSUhIIDU11akxKKXqB008tVxqaio5OTmEhIQQGRlJ586dSUhIwM/PD4DExETCw8Px9/dn6tSp9td5e3szcuRIgoODiY6O5sCBAwDMnTuXgIAAgoOD6dKly3nbW7NmDTExMYSGhhIbG8uOHTs4e/Yso0aNIi0tzd7XZMaMGQwdOhSA3NxcunfvTlBQEHfddRf//e9/AUhOTmbYsGHExsbStm1bPvvss5p+u5RSdYGI1PpHeHi4XK12794t/v7+IiLy7bffipeXl+zatcs+//DhwyIicurUKfH395dDhw6JiAggCxcuFBGRlJQUGTdunIiIBAQEyN69e0VE5OjRo/b19urVS0RECgoKpKioSEREli5dKn379hURkenTp8uQIUPs23V83rt3b5kxY4aIiHzwwQfSp08fERFJSkqSfv36SUlJiWzZskVuu+22anxnlFIXA6yTWvA3vPxDq1PXUgs25jNxyQ727MnlyKGTLNiYT1MgKiqKNm3a2JebPHkyn3/+OQB5eXlkZ2dz/fXX07BhQ/t1m/DwcJYuXQpAXFwcycnJPPTQQ/Tt2/e87RYUFJCUlER2djbGGIqKii4a66pVq5g/fz4AAwcOZPjw4fZ5iYmJNGjQAD8/P/tRl1Lq6qan2mqhBRvzGTF/s72bY3HJOUbM38wP2Qdp3Lixfbn09HSWLVvGqlWryMzMJDQ0lNOnTwPg7u6OMQYANzc3+zWhKVOmMH78ePLy8ggPD+fw4cNltv3yyy/TrVs3srKyWLRokX19V8rDw8M+LdrhXCmFJp5aaeKSHfYujqahJ+fOFlJYVMLstXlllisoKKBZs2Z4eXmxfft2Vq9efdF15+Tk0KlTJ8aOHYuPjw95eeev09fX1k9lxowZ9vFrr72W48ePV7jO2NhYZs+eDcCsWbPo3LnzJe+rUurqo4mnFtrn0LfezfM6PHz92PfBc2QvmlJmuR49elBcXEzHjh1JTU0lOjr6outOSUkhMDCQgIAAYmNjCQ4OLjN/+PDhjBgxgtDQ0DJ3znXr1o2tW7faby5w9PbbbzN9+nSCgoL4+OOPeeutt65kt5VSVwlTF05/REREyLp161wdhtPETVhuP83myLepJytTu7sgIqVUXWSMWS8iEa6Oozw94qmFUuLb4+nuVmbM092NlPj2LopIKaWqT5UTjzHmZmPMt8aYrcaYLcaY563x5saYpcaYbOtnM2vcGGMmG2N2GmM2GWPCqhpDfZMY6strfQPxbeqJwXak81rfQO1lr5SqF6rjdupi4P8RkQ3GmGuB9caYpUAy8I2ITDDGpAKpwF+BnkA769EJeM/6qRwkhvpqolFK1UtVPuIRkf0issGaPg5sA3yBPsCH1mIfAonWdB/gI+v7TauBpsaYllWNQymlVN1Qrdd4jDGtgVDgJ+BGEdlvzfofcKM17Qs43sO71xorv67Bxph1xph1Bw8erM4wlVJKuVC1JR5jjDcwD/iziPzqOM8q3XBZt8+JyFQRiRCRCB8fn+oKUymllItVS+IxxrhjSzqzRGS+NXyg9BSa9fMXazwfuNnh5a2sMaWUUleB6rirzQAfANtE5HWHWQuBJGs6CfjCYfxx6+62aKDA4ZScUnXOfffdx7Fjxy64zKhRo1i2bNkVrf9S+iUpVZdUx11tccBAYLMxJsMaewmYAMwxxjwJ7AEesuYtBu4DdgKngEHVEINSTldaadexT1Flxo4d64SIlKobquOuth9ExIhIkIiEWI/FInJYRO4SkXYicreIHLGWFxEZIiK3iUigiFw9JQlUnfP6668TEBBAQEAAb775Jrm5ubRv357HH3+cgIAA8vLyaN26NYcOHQJg3LhxtG/fnjvuuINHHnmESZMmAbbeRKX9iFq3bs3o0aMJCwsjMDCQ7du3AxX3QlKqPtK2CEpVYv369UyfPp2ffvoJEaFTp07ceeedZGdn8+GHH55XG2/t2rXMmzePzMxMioqKCAsLIzw8vMJ1t2jRgg0bNvDuu+8yadIk/vWvf9GhQwdWrFjBNddcw7Jly3jppZeYN2+eM3ZVKafSxKNUOaW9kLYvm43XDSEs/fkYiaG
+9O3blxUrVnDrrbdWWJB15cqV9OnTh0aNGtGoUSPuv//+SrdR2gspPDzc3svoSnohKVUXaeJRykFpL6TCohIEOH66mBHzN5dZxrEn0pUq7VPk2CuptBfS559/Tm5uLl27dq3ydpSqjbRIqFIOHHshebTy51T2ak6eOsmERRl8/vnnF+w1FBcXZ2+ed+LECb788svL2nZlvZCUqm808SjlwLEXksfvfo93wF3876O/sOHt53jqqado1qxZpa+NjIwkISGBoKAgevbsSWBgIE2aNLnkbVfWC0mp+kb78SjloKq9kE6cOIG3tzenTp2iS5cuTJ06lbAwLcCuXEP78ShVB1S1F9LgwYMJCQkhLCyMBx98UJOOUhXQmwuUclDaimLikh3sO1bITU09SYlvf8ktKj755JOaDE+pekETj1LlaC8kpWqWnmpTSql64lLqBhpjxhpj7r6S9RtjuhpjvrSmk40x/7iS9egRj1JK1XGXUzdQREY5IaQL0iMepZSqA66kbiAQYIz5wRjzqTHmRQBjzAxjTD9rOtcY84oxZoMxZrMxpoM1HmWMWWWM2WiM+dEYU+ndNcaYa40xu632OBhjrnN8XhE94lFKqVruSusGAluAnsAGYH0lqz8kImHGmOeAF4GngO1AZxEptk7L/b/AgxW9WESOG2PSgV7AAuBhYL6IVFrzSY94lFK13rFjx3j33XerdZ1V6ZHkDAs25hM3YTltUr+i35hpdIy5i8aNG+Pt7X3JdQOxNQQ4Diy6wKZKm3euB1pb002AucaYLOANwP8i4f6L31rcDAKmX2hhTTxKqVrvShKPiHDu3LlK548dO5a7776ia+w1rrRmYP6xQgQoKCxi+bZfWLCxbLPm6qgbCJyxfpbw21mwccC3IhIA3A80utAKRGQl0NoY0xVwE5GsCy2viUcpVe1yc3Pp0KEDycnJ3H777Tz22GMsW7aMuLg42rVrx5o1ayrtP7RlyxaioqIICQkhKCiI7OxsUlNTycnJISQkhJSUFAAmTpxIZGQkQUFBjB492r7d8tc9vL29eeGFF/D39+euu+7i4MGDQNkeSevXr+fOO+8kPDyc+Ph49u+3NUXeuXMnd999N8HBwYSFhZGTk1PptquTY81AsNUN/HXHKiYsyuTkyZOXXDcQW5Nob+ByW9g2AUqzXPIlvuYj4BMucrQD/HY3RG1+hIeHi1Kq7ti9e7e4ubnJpk2bpKSkRMLCwmTQoEFy7tw5WbBggfTp00cKCgqkqKhIRESWLl0qffv2FRGRoUOHysyZM0VE5MyZM3Lq1CnZvXu3+Pv729e/ZMkS+eMf/yjnzp2TkpIS6dWrl3z33Xeye/duMcbIqlWr7MsC9vW98sorMmTIEBERSUpKkrlz58rZs2clJiZGfvnlFxERmT17tgwaNEhERKKiomT+/PkiIlJYWCgnT56sdNvVqfVfv5Rbyz2adXtS3FvcIv7+/vLGG2+c956IiNx6661y8OBBEREZPXq0AKeBFcA84I+2t4MZQD9rOhdoYU1HAOnWdAzwM7ARGA/kWuNdgS+t6WTgH2L9nQZ+BxQCTeUif9P15gKlVLUo7WO071ghzaWAG266mcDAQAD70YYxhsDAQHJzcyvtPxQTE8Orr77K3r176du3L+3atTtvW19//TVff/01oaGhgK1GXnZ2Nrfccst51z0aNGjAgAEDAPjDH/5g74VUaseOHWRlZXHPPfcAUFJSQsuWLTl+/Dj5+fk88MADADRq1OiC2+7SpUu1vZc3NfU8r2bgdVEP0PHeR8vUDMzKKntGKzc31z794osv8sorr2QB8cD3WDcXiEhy6TIi0tpheh22xIKIrAJud1j136zxdCDdmp6BLYmVugP4TESOXWz/NPEoparMsY8RwIFfT3P4tLBgYz6Job40aNDA3oOoQYMGFBcXV9p/6NFHH6VTp0589dVX3Hffffzzn/+kbdu2ZbYnIowYMYKnn366zHhubu5Fr3sYY85bl7+/P6tWrSozfvz48QpfX9m2q1NKfPsy7ydcXs1AsNUNBPyw3dH2oYhsqO44Sxlj3sZ299x9l7K8XuNRSlVZ+WsSYPsDPXHJjkpfU1n/oV27dtG2bVuGDRtGnz592LRpE9dee22ZRBAfH8+0adM4ceIEAPn5+fzyyy8VbufcuXP2azmffPIJd9xxR5n57du35+DBg/bEU1RUxJYtW7j22mtp1aoVCxYsAODMmTOcOnXqsrZ9pRJDfXmtbyC+TT0x2Kqjv9Y38LJKOVl1A7eKSAcRea1aAyxHRP4kIr8XkZ8vZXk94lFKVdm+ClpJXGgcbP2HkpKSGD9+PL169bKPz5kzh48//hh3d3d+97vf8dJLL9G8eXPi4uIICAigZ8+eTJw4kW3bthETEwOAt7c3M2fOxM3N7bztNG7cmDVr1jB+/HhuuOEG0tLSysxv2LAhn332GcOGDaOgoIDi4mL+/Oc/4+/vz8cff8zTTz/NqFGjcHd3Z+7cudx7770VbvuGG2647PftQupzzcBq6cdjjJmG7a6JX8R2+x3GmOZAGrb7wnOBh0TkqLEd576F7ZDsFJB8sUNA7cejVO1W1T5GNcnb29t+dHK1qe/9eGYAPcqNpQLfiEg74BvrOdjOA7azHoOB96opBqWUi1S1j5G6ulRL4hGR74Ej5Yb7AB9a0x8CiQ7jH1l3/60GmhpjWlZHHEop16iOaxI15Wo92qnNavIaz40ist+a/h9wozXtC+Q5LLfXGtvvMIYxZjC2IyJuueWWGgxTKVUd6vM1CVW9nHJXm9guJF3WxSQRmSoiESIS4ePjU0ORKaWUcraaTDwHSk+hWT9L7zfMB252WK4Vv5VmUEopVc/VZOJZCCRZ00nAFw7jjxubaKDA4ZScUkqpeq5arvEYYz7FVmqhhTFmLzAamADMMcY8CewBHrIWX4ztVuqd2G6nHnTeCpVSStVb1ZJ4ROSRSmbdVcGyAgypju0qpZSqe7RkjlJKKafSxKOqbMyYMUyaNKnS+RkZGSxevPii6/H29q7OsJRStZQmHlXjLjXxKKWuDpp41BV59dVXuf3227njjjvsnSO7du1KaU29Q4cO0bp1a86ePcuoUaNIS0sjJCSEtLQ0Tpw4waBBgwgMDCQoKIh58+bZ1zty5EiCg4OJjo7mwIEDLtk3pVTNuioSz5X0a79c6enp9O594e6yV/rJ3/EPem2wfv16Zs+ebd+ftWvXVrpsw4YNGTt2LAMGDCAjI4MBAwYwbtw4mjRpwubNm9m0aRPdu9uKSJ48eZLo6GgyMzPp0qUL77//vrN2SSnlRFd14ikuLnZqHHX5lNOCjfnETVhOm9Sv6Dd6Gh1j7sLLy4vrrruOhISEy1rXsmXLGDLktxsbmzVrBtiSVGnyDg8PL9NNUSlVf1wViSc1NZWcnBxCQkKIjIykc+fOJCQk4OfnB0BiYiLh4eH4+/szdepU++u8vb0rPPUzd+5cAgICCA4OrrDd7Zo1a4iJiSE0NJTY2Fh27NhR4SmnkydP8sQTTxAVFUVoaChffGH7jm1hYSEPP/wwHT
t25IEHHqCwsPKeJs5Q2l0y/1ghAhQUFrF82y8s2Fi24MQ111zDuXPnADh9+vRlb8fd3d3eHdLNzc3pHwyUUs5xVSSeCRMmcNttt5GRkcHEiRPZsGEDb731Fj//bGuWN23aNNavX8+6deuYPHkyhw8fBio/9TN27FiWLFlCZmYmCxcuPG97HTp0YMWKFWzcuJGxY8fy0ksvVXjK6dVXX6V79+6sWbOGb7/9lpSUFE6ePMl7772Hl5cX27Zt45VXXmH9+vXOe7MqUL67pMfN/vy6YxUTvtzE8ePHWbRoEQCtW7e2x1ra8RE4r3vkPffcwzvvvGN/fvTo0ZreBaVULVKvE0/p6aE7/r6cXYdO2j+hR0VF0aZNG/tykydPth/V5OXlkZ2dDVR+6icuLo7k5GTef/99SkrKtvsFW0vf/v37ExAQwAsvvMCWLVsqjO/rr79mwoQJhISE0LVrV06fPs1///tfvv/+e/7whz8AEBQURFBQULW9J1eifBdJj9/9nsYdOrP+jafo2bMnkZGRALz44ou89957hIaGcujQIfvy3bp1Y+vWrfYjvb/97W8cPXrUftT47bffOnV/lFKuVW9bX5eeHir9pF5cco4R8zfz2C3Hady4sX259PR0li1bxqpVq/Dy8rInAKj81M+UKVP46aef+OqrrwgPDz/viOTll1+mW7dufP755+Tm5tK1a9cKYxQR5s2bR/v2tbtZ1k1NPc/rLtkkdgB+9yXzQ7nukps2bbJPjx8/HoDmzZufdwPChx9+SHmOfVP69etHv379qhy7Uqr2qbdHPI6nh0xDT86dLaSwqITZa/PKLFdQUECzZs3w8vJi+/btrF69+qLrzsnJoVOnTowdOxYfHx/y8s5fp6+vrS/JjBkz7OPlTznFx8fz9ttvU9p+fOPGjQB06dKFTz75BICsrKwyf8xdQbtLKqWqU71NPI6nh9w8r8PD1499HzxH9qIpZZbr0aMHxcXFdOzYkdTUVKKjoy+67pSUFAIDAwkICCA2Npbg4OAy84cPH86IESMIDQ0tc4G8/Cmnl19+maKiIoKCgvD39+fll18G4Nlnn+XEiRN07NiRUaNGER4eXpW3ospqc3dJpVTdY0o/bddmERERcrnfY4mbsPy800Ng+6O5stzpIaWUqo+MMetFJMLVcZRXb4949PSQUkrVTvX25oLS00ATl+xg37FCbmrqSUp8ez09pJRSLlZvEw/Yko8mGqWUql3q7ak2pa42o0aNYtmyZa4OQ6mLqtdHPErVNyKCiNCgwfmfGceOHeuCiJS6fHrEo5QLpKamlikbVNpMb+LEiURGRhIUFMTo0aMByM3NpX379jz++OMEBASQl5dHcnIyAQEBBAYG8sYbbwCQnJxsL1X0zTffEBoaSmBgIE888QRnzpwBbGWNRo8eTVhYGIGBgWzfvt3Je66UJh6lXGLAgAHMmTPH/nzOnDn4+PiQnZ3NmjVryMjIYP369Xz//fcAZGdn89xzz7FlyxYOHTpEfn4+WVlZbN68mUGDBpVZ9+nTp0lOTiYtLY3NmzdTXFzMe++9Z5/fokULNmzYwLPPPnvBzrFK1RRNPEo5UWn9wL5p+1i7LZdpX68nMzOTZs2asXnzZr7++mtCQ0MJCwtj+/bt9rqBt956q/3LzW3btmXXrl386U9/4j//+Q/XXXddmW3s2LGDNm3acPvttwOQlJRkT2AAffv2BbT1hHIdl13jMcb0AN4C3IB/icgEV8WilDOUrx/YsF0sI/6/f9HZ9xoGDBjAnj17GDFiBE8//XSZ1+Xm5papL9isWTMyMzNZsmQJU6ZMYc6cOUybNu2S4/Dw8AC09YRyHZcc8Rhj3IB3gJ6AH/CIMcbPFbEo5Szl20t4dehMQVY6Xy38nP79+xMfH8+0adPsxVLz8/P55ZdfzlvPoUOHOHfuHA8++CDjx49nw4YNZea3b9+e3Nxcdu7cCcDHH3/MnXfeWYN7ptTlcdURTxSwU0R2ARhjZgN9gK0uikepGle+vURDn1s5d7YQt8bNadmyJS1btmTbtm3ExMQAtkaEM2fOxM2tbAWO/Px8Bg0aZG+699prr5WZ36hRI6ZPn07//v0pLi4mMjKSZ555pgb3TKnL45JabcaYfkAPEXnKej4Q6CQiQx2WGQwMBrjlllvC9+zZ4/Q4lapOWj9QOZvWartMIjJVRCJEJMLHx8fV4ShVZVo/UCkbV51qywdudnjeyhpTqt7S+oFK2bgq8awF2hlj2mBLOA8Dj7ooFqWcRusHKuWixCMixcaYocASbLdTTxORLa6IRSmllHO57Hs8IrIYWOyq7SullHKNWntzgVJK1YRjx47x7rvv1ug20tPT6d279wWXycjIYPHi3z57L1y4kAkTro7v0WviUUpdVSpLPM6u4lA+8SQkJJCamurUGFxFE49S6qqSmppKTk4OISEhREZG0rlzZxISEvDzsxVPSUxMJDw8HH9/f6ZOnWp/nbe3NyNHjiQ4OJjo6GgOHDgAwNy5cwkICCA4OJguXbqct701a9YQExNDaGgosbGx7Nixg7NnzzJq1CjS0tIICQkhLS2NGTNmMHSo7auMubm5dO/enaCgIO666y7++9//ArYK5MOGDSM2Npa2bdvaq5Hv37+fLl26EBISQkBAACtWrKjR97DKSvt71OZHeHi4KKVUddi9e7f4+/uLiMi3334rXl5esmvXLvv8w4cPi4jIqVOnxN/fXw4dOiQiIoAsXLhQRERSUlJk3LhxIiISEBAge/fuFRGRo0eP2tfbq1cvEREpKCiQoqIiERFZunSp9O3bV0REpk+fLkOGDLFv1/F57969ZcaMGSIi8sEHH0ifPn1ERCQpKUn69esnJSUlsmXLFrnttttERGTSpEkyfvx4EREpLi6WX3/9tTTmdVIL/oaXf2gjOKXUVWHBxnwmLtnBnj25HDl0kgUb82kKREVF0aZNG/tykydP5vPPPwcgLy+P7Oxsrr/+eho2bGi/bhMeHs7SpUsBiIuLIzk5mYceeshe+dtRQUEBSUlJZGdnY4yhqKjoorGuWrWK+fPnAzBw4ECGDx9un5eYmEiDBg3w8/OzH3VFRkbyxBNPUFRURGJiIiEhIVfyFjmNnmpTStV7pZXBS0sWFZecY8T8zfyQfbBM5e/09HSWLVvGqlWryMzMJDQ0lNOnTwPg7u6OMQYoW9l7ypQpjB8/nry8PMLDwzl8+HCZbb/88st069aNrKwsFi1aZF/flSqtLg62M1YAXbp04fvvv8fX15fk5GQ++uijKm2jpmniUUrVe46VwU1DT86dLaSwqITZa/PKLFdQUECzZs3w8vJi+/btrF69+qLrzsnJoVOnTowdOxYfHx/y8s5fp6+v7UvDM2bMsI9fe+21HD9+vMJ1xsbGMnv2bABmzZpF586dLxjDnj17uPHGG/njH//IU089dV7F8tpGE49Sqt5zrAzu5nkdHr5+7PvgObIXTSmzXI8ePSguLqZjx46kpqbam+9dSEpKCoGBgQQEBBAbG0twcHCZ+cOHD2fEiBGEhoaWuXOuW7dubN261
X5zgaO3336b6dOnExQUxMcff8xbb711wRjS09MJDg4mNDSUtLQ0nn/++YvG7UouqU59uSIiImTdunWuDkMpVUddrZXBtTq1Ukq5iFYGr130rjalVL2nlcFrF008SqmrglYGrz30VJtSSimn0sSjlFLKqTTxKKWUcipNPEoppZxKE49SSimn0sSjlFLKqTTxqFplzJgxTJo0qdrWFxsbW23rUkpVD008ql778ccfXR2CUqocTTzKpT766COCgoIIDg5m4MCBZea9//77REZGEhwczIMPPsipU6eAijs+btmyhaioKEJCQggKCiI7OxuwdY0s9fe//53AwECCg4OvmhbDStVKVekiB/QHtgDngIhy80YAO4EdQLzDeA9rbCeQeinb0Q6k9VNWVpa0a9dODh48KCK2zo+jR4+WiRMniojYOz+KiIwcOVImT54sIhV3fBw6dKjMnDlTRETOnDkjp06dEhGRxo0bi4jI4sWLJSYmRk6ePGnfllL1HfW0A2kW0Bf4p+OgMcYPeBjwB24ClhljbrdmvwPcA+wF1hpjForI1irGoeqI0i6Q+44VYrb+h7DOPWjRogUAzZs3L7NsVlYWf/vb3zh27BgnTpwgPj4eqLjjY0xMDK+++ip79+6lb9++tGvXrsy6li1bxqBBg/Dy8qpwW0op56nSqTYR2SYiOyqY1QeYLSJnRGQ3tqObKOuxU0R2ichZYLa1rLoKOHaBFOBYYRHpO35hwcb8CpdPTk7mH//4B5s3b2b06NH2zo0VdXx89NFHWbhwIZ6entx3330sX77ciXumlLocNXWNxxdwbMO31xqrbPw8xpjBxph1xph1Bw8erKEwlTM5doEEaHRLEAVbV/D/zl8DwJEjR8osf/z4cVq2bElRURGzZs2yj1fU8XHXrl20bduWYcOG0adPHzZt2lRmXffccw/Tp0+3Xycqvy2llPNcNPEYY5YZY7IqeNTokYqITBWRCBGJ8PHxqclNKSfZV64RV0OfW2kSM4CMKX8mODiYv/zlL2Xmjxs3jk6dOhEXF0eHDh3s4xV1fJwzZw4BAQGEhISQlZXF448/XmZdPXr0ICEhgYiICEJCQqr1lm2l1OWplg6kxph04EURWWc9HwEgIq9Zz5cAY6zFx4hIfEXLVUY7kNYPV2sXSKVc5WrrQLoQeNgY42GMaQO0A9YAa4F2xpg2xpiG2G5AWFhDMahaRrtAKqWgio3gjDEPAG8DPsBXxpgMEYkXkS3GmDnAVqAYGCIiJdZrhgJLADdgmohsqdIeqDpDu0AqpaCaTrXVND3VppRSl+9qO9WmlFJKVUgTzyWaPHkyHTt25LHHHquR9ScnJ/PZZ5/VyLqVUqo20cRzid59912WLl1a5vskxcXFLoxIqUszY8YMhg4dCti+fPvRRx/Zx/ft22df7qmnnmLrVi0iomqeJp5L8Mwzz7Br1y569uxJkyZNGDhwIHFxcQwcOJCDBw/y4IMPEhkZSWRkJCtXrgRs5f2feOIJunbtStu2bZk8ebJ9fZUVxvz++++JjY2lbdu2evSjasQzzzxj/45T+cTzr3/9Cz8/P1eFpq4mri4WdymP2lAk9NZbb5WDBw/K6NGjJSwszF6E8pFHHpEVK1aIiMiePXukQ4cOIiIyevRoiYmJkdOnT8vBgwelefPmcvbs2QoLY4qIJCUlSb9+/aSkpES2bNkit912mwv2UtVFH3/8sURGRkpwcLAMHjxYiouLZdq0adKuXTuJjIyUp556SoYMGSIiYi/COnfuXGncuLHcfvvtEhwcLKdOnZI777xT1q5dKyIin3zyiQQEBIi/v78MHz7cvq3GjRvLSy+9JEFBQdKpUyf53//+55J9VpeGWlokVI94LmDBxnziJiynTepX/K/gNIs37QcgISEBT09PwFZ8cujQoYSEhJCQkMCvv/7KiRMnAOjVqxceHh60aNGCG264gQMHDrB8+XL69+9fYWHMxMREGjRogJ+fHwcOHHDy3qq6aNu2baSlpbFy5UoyMjJwc3Nj5syZjB49mpUrV/LDDz9UePqsX79+REREMGvWLDIyMuy/zwD79u3jr3/9K8uXLycjI4O1a9eyYMECAE6ePEl0dDSZmZl06dKF999/31m7quqRqlanrrdKC1qW1hYrPieM+2or4Sd+JaLdTfblzp07x+rVq2nUqNF56/Dw8LBPu7m5XfSakOPyUgduc1euUbbC9xKOrV5LZGQkAIWFhfz444907dqV0lJTAwYM4Oeff77k9a9du7bM6x977DG+//57EhMTadiwIb179wYgPDycpUuXVvPeqauBHvFUonxBS4DTRSWszDlcZuzee+/l7bfftj/PyMi44Hq7d+/O3LlzOXzYth4tVqkux/kVvs9ibr+TMdO/IiMjgx07djBmzJga2767uzvGGODSPkwpVRFNPJUoX9Cy1PHTRWWeT548mXXr1hEUFISfnx9Tpky54Hr9/f0ZOXIkd955Z4WFMZW6kPMqfN8azK/bVvDqvNWA7YNMaGgo3333HYcPH6aoqIi5c+dWuK5rr72W48ePnzceFRXFd999x6FDhygpKeHTTz/lzjvvrJkdUlclPdVWiZuaepYpaNnq2WkA+Pd+ihdf/K2gZYsWLUhLSzvv9eU/dWZlZdmnk5KSSEpKKjN/xowZZZ6XXidSytF5Fb5b3ELTzgPJfD+FoIWv4O7uzjvvvMOYMWOIiYmhadOmhISEVLiu5ORknnnmGTw9PVm1apV9vGXLlkyYMIFu3bohIvTq1Ys+fbRtlqo+WjKnEuWv8YCtoOVrfQO1tphyGa3wrS6HlsypYxJDfXmtbyC+TT0x2P5ja9JRrqYVvlV9oKfaLiAx1FcTjapVtMK3qg808ShVx+gHIlXX6ak2pZRSTqWJRymllFNp4lFKKeVUmniUUko5lSYepZRSTqWJRymllFNp4lFKKeVUVUo8xpiJxpjtxphNxpjPjTFNHeaNMMbsNMbsMMbEO4z3sMZ2GmNSq7J9pZRSdU9Vj3iWAgEiEgT8DIwAMMb4AQ8D/kAP4F1jjJsxxg14B+gJ+AGPWMsqpZS6SlQp8YjI1yJS2pBjNdDKmu4DzBaRMyKyG9gJRFmPnSKyS0TOArOtZZVSSl0lqvMazxPAv61pXyDPYd5ea6yy8fMYYwYbY9YZY9YdPHiwGsNUSinn0EZ5Fbto4jHGLDPGZFXw6OOwzEigGJhVXYGJyFQRiRCRiNIWvEopVZPGjRtH+/btueOOO3jkkUeYNGkSOTk59OjRg/DwcDp37sz27dsBWz+jYcOGERsbS9u2bfnss88ASE9Pp3PnziQkJODn50dJSQkpKSlERkYSFBTEP//5T1fuYq1w0SKhInL3heYbY5KB3sBd8ltzn3zgZofFWlljXGBcKaVcZu3atcybN4/MzEyKiooICwsjPDycwYMHM2XKFNq1a8dPP/3Ec889x/LlywHYv38/P/zwA9u3bychIYF+/foBsGHDBrKysmjTpg1Tp06lSZMmrF27ljNnzhAXF8e9995LmzZtXLm7LlWl6tTGmB7AcOBOETnl
MGsh8Ikx5nXgJqAdsAYwQDtjTBtsCedh4NGqxKCUUlWxYGM+E5fsYNvS2TS+Poj/bDtMYqgv999/P6dPn+bHH3+kf//+9uXPnDljn05MTKRBgwb4+flx4MAB+3hUVJQ9sXz99dds2rTJfkRUUFBAdna2Jp4q+AfgASw1xgCsFpFnRGSLMWYOsBXbKbghIlICYIwZCiwB3IBpIrKlijEopdQVKd9p+PjpYkbM32yff+7cOZo2bUpGRkaFr/fw8LBPO3Zzbty4cZnxt99+m/j4eJRNVe9q+72I3CwiIdbjGYd5r4rIbSLSXkT+7TC+WERut+a9WpXtK6VUVUxcssOedDxadaQwZw2nCguZsDCDL7/8Ei8vL9q0acPcuXMBWxLJzMy8rG3Ex8fz3nvvUVRUBMDPP//MyZMnq3dH6hhtBKeUumrtO1Zon/ZoeTuev49i37ShHGjclF5RgTRp0oRZs2bx7LPPMn78eIqKinj44YcJDg6+5G089dRT5ObmEhYWhojg4+PDggULamBv6g7jeHhYW0VERMi6detcHYZSqp6Jm7CcfIfkc+5sIQ0aevI7L0PxwlFMnTqVsLAwF0ZYNcaY9SIS4eo4ytMjHqXUVSslvn2ZazyH//MPSo7kUexpGDL4yTqddGozTTxKqatWYqjt++sTl+xg37FCQh4fRUp8e/u4qhmaeJRSV7XEUF9NNE6mbRGUUko5lSYepZRSTqWJRymllFNp4lFKKeVUmniUUko5lSYeVavFxsZecL63t7eTIlFKVRdNPKpW+/HHH10dglKqmmniUVVy8uRJevXqRXBwMAEBAaSlpdG6dWuGDx9OYGAgUVFR7Ny5E4BFixbRqVMnQkNDufvuu+1l5MeMGcMTTzxB165dadu2LZMnT7avv/SIZv/+/XTp0oWQkBACAgJYsWKFfZmRI0cSHBxMdHR0mdL0SqnaSROPqpL//Oc/3HTTTWRmZpKVlUWPHj0AaNKkCZs3b2bo0KH8+c9/BuCOO+5g9erVbNy4kYcffpj/+7//s69n+/btLFmyhDVr1vDKK6/YK/mW+uSTT4iPjycjI4PMzExCQkIAW+KLjo4mMzOTLl268P777ztlv5VSV04rF6grUto8a8+uwxz6bBGHi57jhScfoXPnzgA88sgj9p8vvPACAHv37mXAgAHs37+fs2fPlmmE1atXLzw8PPDw8OCGG27gwIEDtGrVyj4/MjKSJ554gqKiIhITE+2Jp2HDhvTu3RuA8PBwli5d6ozdV0pVgR7xqMtW2jwr/1gh1zT3xefxN1l9zJtn/pzC2LFjAbAaA5aZ/tOf/sTQoUPZvHkz//znPzl9+rR9GceGWm5ubhQXF5fZZpcuXfj+++/x9fUlOTmZjz76CAB3d3f7+it6nVKq9tHEoy6bY/Os4uOHaeDuQcMOd3Iu4H42bNgAQFpamv1nTEwMYGv56+trq4n14YcfXtY29+zZw4033sgf//hHnnrqKft2lFJ1j55qU5fNsXlW0cFcfkmfDsZgGlzDx4s+oV+/fhw9epSgoCA8PDz49NNPAdtNBP3796dZs2Z0796d3bt3X/I209PTmThxIu7u7nh7e9uPeJRSdY82glOXrXzzrFK+TT1Zmdqd1q1bs27dOlq0aOGC6JRSpWprIzg91aYuW0p8ezzd3cqMebq7kRLf3kURKaXqEj3Vpi5b+eZZNzX1LNM8Kzc314XRKaVquyolHmPMOKAPcA74BUgWkX3GdpvRW8B9wClrfIP1miTgb9YqxovI5V1lVrWCNs9SSl2pqp5qmygiQSISAnwJjLLGewLtrMdg4D0AY0xzYDTQCYgCRhtjmlUxBqWUUnVIlRKPiPzq8LQxUHqnQh/gI7FZDTQ1xrQE4oGlInJERI4CS4EeVYlBKaVU3VLlmwuMMa8aY/KAx/jtiMcXyHNYbK81Vtl4ResdbIxZZ4xZd/DgwaqGqdRVLzc3l4CAAADWrVvHsGHDKl02PT3dXhFCqep20cRjjFlmjMmq4NEHQERGisjNwCxgaHUFJiJTRSRCRCJ8fHyqa7VKKSAiIqJMMValnOmiiUdE7haRgAoeX5RbdBbwoDWdD9zsMK+VNVbZuFLqAmbOnElUVBQhISE8/fTTlJSU4O3tXWFl7pycHKKjowkMDORvf/tbhT2LHI9ovvvuO0JCQggJCSE0NJTjx48DcOLECfr160eHDh147LHHqAvf+VN1Q5VOtRlj2jk87QNst6YXAo8bm2igQET2A0uAe40xzaybCu61xpRSldi2bRtpaWmsXLmSjIwM3NzcmDVrVqWVuZ9//nmef/55Nm/eXKbQamUmTZrEO++8Q0ZGBitWrMDT0xOAjRs38uabb7J161Z27drFypUra3Q/1dWjqtd4Jlin3TZhSyLPW+OLgV3ATuB94DkAETkCjAPWWo+x1phSqpwFG/OJm7CcO4a9xZLvVnF7gO2o5JtvvmHXrl3nVeYu/f7UqlWr6N+/PwCPPvroRbcTFxfHX/7yFyZPnsyxY8e45hrbtyyioqJo1aoVDRo0ICQkRL+fpapNlb7HIyIPVjIuwJBK5k0DplVlu0rVd6UVwAuLShDA078bje5+kjF9A+3fn5o0aVK1VOZOTU2lV69eLF68mLi4OJYssZ2EuFjFcKWulJbMUaoWcqwA3ujWYE7tWMmJY4eZuGQHR44cYc+ePZW+Njo6mnnz5gEwe/bsi24rJyeHwMBA/vrXvxIZGcn27dsv+hqlqkITj1K1kGMF8IYtbqFp54EcmPMya19/knvuuYf9+/dX+to333yT119/naCgIHbu3EmTJk0uuK0333yTgIAAgoKCcHd3p2fPntW2H0pVRKtTK1ULXawC+IWcOnUKT09PjDHMnj2bTz/9lC++KH8Tqroa1Nbq1FokVKlaKCW+vf0aT6lLrQC+fv16hg4diojQtGlTpk3TS6qqdtHEo1QtdLEK4BfSuXNnMjMzazpEpa6YJh6laimtAK7qK725QCmllFNp4lFKKeVUmniUUko5lSYepZRSTqWJRymllFPViS+QGmMOApXXCKkdWgCHXB1EFWj8rlWX46/LsUP9jv9WEal1Dc3qROKpC4wx62rjN4QvlcbvWnU5/rocO2j8rqCn2pRSSjmVJh6llFJOpYmn+kx1dQBVpPG7Vl2Ovy7HDhq/0+k1HqWUUk6lRzxKKaWcShOPUkopp9LEc5mMMeOMMZuMMRnGmK+NMTdZ48YYM9kYs9OaH+bwmiRjTLb1SHJd9GCMmWiM2W7F+LkxpqnDvBFW/DuMMfEO4z2ssZ3GmFSXBP5bLP2NMVuMMeeMMRHl5tX6+MurzbGVMsZMM8b8YozJchhrboxZav1OLzXGNLPGK/1/4CrGmJuNMd8aY7ZavzvPW+N1Yh+MMY2MMWuMMZlW/K9Y422MMT9ZcaYZYxpa4x7W853W/NaujL9CIqKPy3gA1zlMDwOmWNP3Af8GDBAN/GSNNwd2WT+bWdPNXBj/vcA11vTfgb9b035AJuABtAFyADfrkQO0BRpay/i5MP6OQHsgHYhwGK8T8Zfbl1o
bW7k4uwBhQJbD2P8BqdZ0qsPvUYX/D1wcf0sgzJq+FvjZ+n2pE/tgxeFtTbsDP1lxzQEetsanAM9a0885/F16GEhz9b9B+Yce8VwmEfnV4WljoPTujD7AR2KzGmhqjGkJxANLReSIiBwFlgI9nBq0AxH5WkSKraergVbWdB9gtoicEZHdwE4gynrsFJFdInIWmG0t6xIisk1EdlQwq07EX05tjs1ORL4HjpQb7gN8aE1/CCQ6jFf0/8BlRGS/iGywpo8D2wBf6sg+WHGcsJ66Ww8BugOfWePl4y/dr8+Au4wxxjnRXhpNPFfAGPOqMSYPeAwYZQ37AnkOi+21xiobrw2ewPbJDupm/I7qYvy1ObaLuVFE9lvT/wNutKZr9T5Zp51CsR011Jl9MMa4GWMygF+wfXjNAY45fIh0jNEevzW/ALjeqQFfhCaeChhjlhljsip49AEQkZEicjMwCxjq2mjPd7H4rWVGAsXY9qFWuZT4Ve0htnM6tf57GcYYb2Ae8OdyZy5q/T6ISImIhGA7QxEFdHBtRFWjra8rICJ3X+Kis4DFwGggH7jZYV4raywf6FpuPL3KQV7AxeI3xiQDvYG7rP9wUHn8XGC8RlzG+++o1sR/GS4Uc213wBjTUkT2W6ehfrHGa+U+GWPcsSWdWSIy3xquU/sAICLHjDHfAjHYTgFeYx3VOMZYGv9eY8w1QBPgsEsCroQe8VwmY0w7h6d9gO3W9ELgceuOmGigwDqMXwLca4xpZt01c6815hLGmB7AcCBBRE45zFoIPGzdEdMGaAesAdYC7aw7aBpiu1i50NlxX4K6GH9tju1iFgKld2gmAV84jFf0/8BlrOsbHwDbROR1h1l1Yh+MMT7GuvvUGOMJ3IPtOtW3QD9rsfLxl+5XP2C5wwfM2sHVdzfUtQe2T01ZwCZgEeArv9158g62c6+bKXvH1RPYLnbvBAa5OP6d2M7/ZliPKQ7zRlrx7wB6Oozfh+1OoBxgpIvjfwDb+ewzwAFgSV2Kv4L9qbWxOcT4KbAfKLLe+yexXTP4BsgGlgHNrWUr/X/gwvjvwHYabZPD7/19dWUfgCBgoxV/FjDKGm+L7cPVTmAu4GGNN7Ke77Tmt3X1v0H5h5bMUUop5VR6qk0ppZRTaeJRSinlVJp4lFJKOZUmHqWUUk6liUcppZRTaeJRSinlVJp4lFJKOdX/D+sx21DPmnKqAAAAAElFTkSuQmCC\n" + }, + "metadata": { + "needs_background": "light" + } + } + ], + "source": [ + "for i, word in enumerate(words):\n", + " plt.annotate(word, (x_values[i], y_values[i]))\n", + "\n", + "plt.scatter(x_values, y_values)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "POSITIVE: wonderful, fantastic, fun, brilliant, perfect, fascinating, amazing, nice, lovely, full\n\nNEGATIVE: repetitive, dull, predictable, poorly, unusual, strangely, sparse, abrupt, intriguing, anticlimactic\n" + ] + } + ], + "source": [ + "def most_similar(positive, negative, n=10):\n", + " return [w for w, score in model.wv.most_similar(positive=positive, negative=negative, topn=n)]\n", + "\n", + "pos_words = most_similar(['good', 'great'], ['bad'])\n", + "neg_words = most_similar(['boring', 'stilted'], ['good'])\n", + "\n", + "print('POSITIVE:', ', '.join(pos_words))\n", + "print()\n", + "print('NEGATIVE:', ', '.join(neg_words))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +} \ No newline at end of file diff --git a/model/review-level.R b/model/review-level.R new file mode 100644 index 0000000..145527b --- /dev/null +++ b/model/review-level.R @@ -0,0 +1,263 @@ +library(lme4) +library(ggplot2) +library(reshape2) +library(dplyr) +library(rstatix) # requires "coin" library, too +library(Hmisc) + +# import data + +input_path = "../data/goodreads_review_data.csv" +data = read.csv(input_path) + +#### test if translated books have different star rating than non-translated +describe(subset(data, is_translated==1)$rating_no) +# mean 3.786, poprortion 1 0.058 2 0.101 3 0.192 4 0.296 5 0.354 +describe(subset(data, is_translated==0)$rating_no) +# mean 3.85 proportion 1 0.059 2 0.091 3 0.175 4 0.293 5 0.383 + + +wilcox.test(subset(data, is_translated==1)$rating_no, + subset(data, is_translated==0)$rating_no, + alternative = "two.sided") +# significant difference + +data %>% wilcox_effsize(rating_no ~ is_translated) +# but small effect size + +# filters + +min_review_length = function(data, min_length = 1) { + subset(data, data$words >= min_length) +} + +only_translated = function(data) { + subset(data, data$is_translated) +} + +# counting 
functions
+
+count_translation_mentions = function(data, absolute = FALSE) {
+  values = subset(data$mentions_translation, !is.na(data$mentions_translation))
+  if (absolute) {
+    return(sum(values))
+  }
+  else {
+    return(sum(values)/length(values))
+  }
+}
+
+translation_frequency = function(data) {
+  all_data = subset(data, data$words >= 10)
+  mention_data = subset(all_data, as.logical(all_data$mentions_translation))
+  total_mentions = sum(mention_data$mention_count) # mention_count = number of translation lemmas per review
+  total_words = sum(all_data$words)
+
+  total_mentions / total_words
+}
+
+count_reviews = function(data) {
+  nrow(data)
+}
+
+count_titles = function(data) {
+  length(unique(data$book_title))
+}
+
+#quick overview of how many reviews are from translated books and how many mention translation
+
+table(data[,c("mentions_translation", "is_translated")])
+
+# full overview of data per language
+
+original_languages = unique(data$original_language[as.character(data$original_language) != ""])
+edition_languages = unique(data$edition_language[as.character(data$edition_language) != ""])
+
+full_table = function(data) {
+  res = data.frame()
+
+  for (og_lang in original_languages) {
+    for (ed_lang in edition_languages) {
+      subdata = subset(data,
+                       data$original_language == og_lang & data$edition_language == ed_lang)
+      new_row = data.frame(original_language = og_lang,
+                           edition_language = ed_lang,
+                           is_translated = as.character(og_lang) != as.character(ed_lang),
+                           n_titles = count_titles(subdata),
+                           n_reviews = count_reviews(subdata),
+                           n_mention_translation = count_translation_mentions(subdata, absolute=TRUE),
+                           p_mention_translation = count_translation_mentions(subdata)
+      )
+      res = rbind(res, new_row)
+    }
+  }
+
+  return(res)
+}
+
+oglang_table = function(data) {
+  res = data.frame()
+
+  for (og_lang in original_languages) {
+    subdata = subset(data,
+                     data$original_language == og_lang & as.character(data$edition_language) != as.character(data$original_language))
+    new_row = data.frame(original_language = og_lang,
+                         n_titles = count_titles(subdata),
+                         n_reviews = count_reviews(subdata),
+                         n_mention_translation = count_translation_mentions(subdata, absolute=TRUE),
+                         p_mention_translation = count_translation_mentions(subdata)
+    )
+    res = rbind(res, new_row)
+  }
+
+  return(res)
+}
+
+edlang_table = function(data) {
+  res = data.frame()
+
+  for (ed_lang in edition_languages) {
+    subdata = subset(data,
+                     data$edition_language == ed_lang & as.character(data$edition_language) != as.character(data$original_language))
+    new_row = data.frame(edition_language = ed_lang,
+                         n_titles = count_titles(subdata),
+                         n_reviews = count_reviews(subdata),
+                         n_mention_translation = count_translation_mentions(subdata, absolute=TRUE),
+                         p_mention_translation = count_translation_mentions(subdata)
+    )
+    res = rbind(res, new_row)
+  }
+
+  return(res)
+}
+
+editions <- edlang_table(data)
+
+
+# rating vs mentioning of translation
+
+
+ratings = 1:5
+
+rating_data = rbind(
+  data.frame(
+    is_translated = rep("translated", length(ratings)),
+    rating = ratings,
+    n_reviews = sapply(ratings,
+                       function(r) {
+                         nrow(subset(data, data$rating_no == r & data$is_translated))
+                       }),
+    translation_freq = sapply(ratings,
+                              function (r) {
+                                translation_frequency(subset(data, data$rating_no == r & data$is_translated))
+                              })
+  ),
+  data.frame(
+    is_translated = rep("not translated", length(ratings)),
+    rating = ratings,
+    n_reviews = sapply(ratings,
+                       function(r) {
+                         nrow(subset(data, data$rating_no == r & !
data$is_translated)) + }), + translation_freq = sapply(ratings, + function (r) { + translation_frequency(subset(data, data$rating_no == r & ! data$is_translated)) + }) + ) +) +rating_data + +#plot + +ggplot(data = rating_data) + + geom_line(aes(x = rating, y = translation_freq, color = is_translated), size = 1) + + ylim(c(0, NA)) + + labs(x = "rating", y = "frequency of 'translation'", color = "edition") + +#model + +mention_model = glm(mentions_translation ~ rating_no * is_translated, data, family = binomial) +summary(mention_model) + +rating_data_test <- data %>% filter(!is.na(mentions_translation), !is.na(rating_no), words>10) %>% + group_by(rating_no, is_translated) %>% + mutate(is_translated = ifelse(is_translated == 0, "not translated", "translated")) %>% + add_tally() %>% + summarise_at(.vars=c('n', 'mention_count'), .funs=c(mean="mean",sum="sum")) %>% + select(-c('n_sum')) %>% rename(n=n_mean) + +ggplot(data = rating_data_test) + + geom_line(aes(x = rating_no, y = mention_count_mean, color = is_translated), size = 1) + + ylim(c(0, NA)) + + labs(title="Ungrouped reviews", x = "Goodreads rating", y = "Average count of translation lemma per review", color = "Translated") + + +by_edition <- data %>% filter(!is.na(mentions_translation), !is.na(rating_no), words>10) %>% + group_by(edition_language, rating_no, is_translated) %>% + mutate(is_translated = ifelse(is_translated == 0, "not translated", "translated")) %>% + add_tally() %>% + summarise_at(.vars=c('n', 'mention_count'), .funs=c(mean="mean",sum="sum")) %>% + select(-c('n_sum')) %>% rename(n=n_mean) + +ggplot(data=by_edition) + + geom_line(aes(x=rating_no, y=mention_count_mean, color=is_translated), size=1) + + ylim(c(0, NA)) + + labs(title="Edition language", x = "Goodreads rating", y = "Average count of translation lemma per review", color = "Translated") + + facet_grid(edition_language ~ .) + +by_original <- data %>% filter(!is.na(mentions_translation), !is.na(rating_no), words>10) %>% + group_by(original_language, rating_no, is_translated) %>% + mutate(is_translated = ifelse(is_translated == 0, "not translated", "translated")) %>% + add_tally() %>% + summarise_at(.vars=c('n', 'mention_count'), .funs=c(mean="mean",sum="sum")) %>% + select(-c('n_sum')) %>% rename(n=n_mean) + +ggplot(data=by_original) + + geom_line(aes(x=rating_no, y=mention_count_mean, color=is_translated), size=1) + + ylim(c(0, NA)) + + labs(title="Original language", x = "Goodreads rating", y = "Average count of translation lemma per review", color = "Translated") + + facet_grid(original_language ~ .) + +by_genre <- data %>% filter(!is.na(mentions_translation), !is.na(rating_no), !grepl('Non', book_genre)) %>% + mutate(book_genre = ifelse(grepl('Literary', book_genre), "Literary fiction", "Popular fiction")) %>% + group_by(book_genre, rating_no, is_translated) %>% + mutate(is_translated = ifelse(is_translated == 0, "not translated", "translated")) %>% + add_tally() %>% + summarise_at(.vars=c('n', 'mention_count'), .funs=c(mean="mean",sum="sum")) %>% + select(-c('n_sum')) %>% rename(n=n_mean) + +ggplot(data=by_genre) + + geom_line(aes(x=rating_no, y=mention_count_mean, color=is_translated), size=1) + + ylim(c(0, NA)) + + labs(title="Book genre", x = "Goodreads rating", y = "Average count of translation lemma per review", color = "Translated") + + facet_grid(book_genre ~ .) 
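+# note on the grouped pipelines above: each one drops rows with missing
+# values, relabels is_translated, and averages within groups, so
+# mention_count_mean is the mean number of translation lemmas per review
+# in that group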
+ +levels(by_genre$is_translated) + +numbers <- data %>% filter(!is.na(mentions_translation), !is.na(is_translated)) %>% + group_by(mentions_translation, is_translated) %>% + tally() + +from_english <- data %>% filter(!is.na(mentions_translation), !is.na(rating_no), grepl('English', original_language)) %>% + group_by(rating_no, is_translated) %>% + mutate(is_translated = ifelse(is_translated == 0, "not translated", "translated")) %>% + add_tally() %>% + summarise_at(.vars=c('n', 'mention_count'), .funs=c(mean="mean",sum="sum")) %>% + select(-c('n_sum')) %>% rename(n=n_mean) + +ggplot(data=from_english) + + geom_line(aes(x=rating_no, y=mention_count_mean, color=is_translated), size=1) + + ylim(c(0, NA)) + + labs(title="Books originally published in English", x = "Goodreads rating", y = "Average count of translation lemma per review", color = "Translated") + +to_english <- data %>% filter(!is.na(mentions_translation), !is.na(rating_no), grepl('English', edition_language)) %>% + group_by(rating_no, is_translated) %>% + mutate(is_translated = ifelse(is_translated == 0, "not translated", "translated")) %>% + add_tally() %>% + summarise_at(.vars=c('n', 'mention_count'), .funs=c(mean="mean",sum="sum")) %>% + select(-c('n_sum')) %>% rename(n=n_mean) + +ggplot(data=to_english) + + geom_line(aes(x=rating_no, y=mention_count_mean, color=is_translated), size=1) + + ylim(c(0, NA)) + + labs(title="Books published in English", x = "Goodreads rating", y = "Average count of translation lemma per review", color = "Translated") diff --git a/model/sentiment_colllocations.R b/model/sentiment_colllocations.R new file mode 100644 index 0000000..b69b6b3 --- /dev/null +++ b/model/sentiment_colllocations.R @@ -0,0 +1,57 @@ +library(irr) +library(dplyr) +library(ggplot2) +library(reshape2) + +english <- read.csv("../sentiment/English_ratings.csv") +dutch <- read.csv("../sentiment/Dutch_ratings.csv") +french <- read.csv("../sentiment/French_ratings.csv") +german <- read.csv("../sentiment/German_ratings.csv") +portuguese <- read.csv('../sentiment/Portuguese_ratings.csv') +spanish <- read.csv('../sentiment/Spanish_ratings.csv') + +kappam.fleiss(english[,2:6], exact=TRUE) +kappam.fleiss(dutch[,2:3], exact=TRUE) +kappam.fleiss(french[,2:3], exact=TRUE) +kappam.fleiss(german[,2:3], exact=TRUE) +kappam.fleiss(portuguese[,2:3], exact=TRUE) +kappam.fleiss(spanish[,2:3], exact=TRUE) + +reviews_phn <- read.csv("../sentiment/reviews_PHN.csv") + + +by_genre <- reviews_phn %>% filter(!grepl('Non', book_genre)) %>% + mutate(book_genre = ifelse(grepl('Literary', book_genre), "Literary fiction", "Popular fiction")) %>% + group_by(book_genre, is_translated) %>% + mutate(is_translated = ifelse(is_translated == 0, "not translated", "translated")) %>% + summarise_at(.vars=vars(P, H, N), .funs=c(mean="mean",sum="sum")) + +meltedGenre <- melt(by_genre, id=c('book_genre', 'is_translated'), measure=c('P_mean', 'H_mean', 'N_mean')) + +ggplot(meltedGenre, aes(x=book_genre,y=value)) + + geom_col(aes(fill=variable), position = 'dodge') + + labs(title="Book genre", x = "Genre", y = "Average count of positive, negative and hedge terms per review", fill="Term type") + + facet_grid(is_translated ~ .) 
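+# melt() above reshapes the per-group means (P_mean, H_mean, N_mean) from
+# wide to long form, so geom_col(position = 'dodge') can draw the three
+# term types side by side; the same reshape is used for the plots below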
+
+directions <- reviews_phn %>% filter(is_translated==1) %>%
+  mutate(direction = ifelse(
+    edition_language=='English', 'nE>E', 'nE>nE')) %>%
+  group_by(direction) %>%
+  summarise_at(.vars=vars(P, H, N), .funs=c(mean="mean",sum="sum"))
+
+meltedDirections <- melt(directions, id=c('direction'), measure=c('P_mean', 'H_mean', 'N_mean'))
+
+ggplot(meltedDirections, aes(x=direction,y=value)) +
+  geom_col(aes(fill=variable), position = 'dodge') +
+  labs(title="Translation direction", x = "Direction", y = "Average count of positive, negative and hedge terms per review", fill = "Term type")
+
+originals <- reviews_phn %>% filter(is_translated==0) %>%
+  mutate(edition_language = ifelse(original_language=='English', 'E', 'nE')) %>%
+  group_by(edition_language) %>%
+  summarise_at(.vars=vars(P, H, N), .funs=c(mean="mean",sum="sum"))
+
+meltedOriginals <- melt(originals, id=c('edition_language'), measure=c('P_mean', 'H_mean', 'N_mean'))
+
+ggplot(meltedOriginals, aes(x=edition_language,y=value)) +
+  geom_col(aes(fill=variable), position = 'dodge') +
+  labs(title="Originals", x = "Language", y = "Average count of positive, negative and hedge terms per review", fill = "Term type")
\ No newline at end of file
diff --git a/model/stats.R b/model/stats.R
new file mode 100644
index 0000000..5054f50
--- /dev/null
+++ b/model/stats.R
@@ -0,0 +1,23 @@
+library(dplyr)
+
+input_path = "../data/goodreads_review_data.csv"
+data = read.csv(input_path)
+
+data_words <- data %>% filter(cleaned_words>0)
+
+orig <- data_words %>% group_by(original_language)
+orig_reviews <- orig %>% tally()
+orig_words <- orig %>% summarise_at(.vars=vars(words), .funs=c(sum="sum"))
+
+
+edition <- data_words %>% group_by(edition_language)
+edit_reviews <- edition %>% tally()
+edit_words <- edition %>% summarise_at(.vars=vars(words), .funs=c(sum="sum"))
+
+review <- data_words %>% group_by(language)
+review_reviews <- review %>% tally()
+review_words <- review %>% summarise_at(.vars=vars(words), .funs=c(sum="sum"))
+
+genre <- data_words %>% group_by(book_genre)
+genre_reviews <- genre %>% tally()
+genre_words <- genre %>% summarise_at(.vars=vars(words), .funs=c(sum="sum"))
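
stats.R above tallies reviews and sums word counts per grouping factor. For readers who prefer to cross-check these corpus statistics in Python, a rough pandas equivalent might look as follows; this is an illustrative sketch, not part of the repository, and it assumes the same input CSV and column names used by stats.R.

    # Hypothetical pandas equivalent of stats.R (not part of the repository).
    import pandas as pd

    data = pd.read_csv("../data/goodreads_review_data.csv")
    data_words = data[data["cleaned_words"] > 0]

    for factor in ["original_language", "edition_language", "language", "book_genre"]:
        grouped = data_words.groupby(factor)
        print(grouped.size())          # number of reviews per group (cf. tally())
        print(grouped["words"].sum())  # total word count per group (cf. summarise_at)
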
diff --git a/model/term_level.R b/model/term_level.R
new file mode 100644
index 0000000..6969c4b
--- /dev/null
+++ b/model/term_level.R
@@ -0,0 +1,141 @@
+library(lme4)
+library(ggplot2)
+library(reshape2)
+
+# import data
+
+input_path = "../data/goodreads_formatted.csv"
+data = read.csv(input_path)
+
+# PREPROCESSING
+
+#simplified genre definition (just literary fiction and popular fiction)
+
+simple_genre = function(genre) {
+  if (genre == "Literary fiction") {
+    return(genre)
+  }
+
+  if (startsWith(genre, "Popular fiction")) {
+    return("Popular fiction")
+  }
+
+  return(NA)
+}
+
+
+simple_genres = as.factor(sapply(as.character(data[, "book_genre"]), simple_genre))
+data$book_simple_genre = simple_genres
+
+
+#format with melted term types - useful for testing effect on which terms are used more
+
+melted_term_data = melt(data,
+  id.vars = c("original_language", "edition_language", "language", "age_category", "book_genre", "book_simple_genre", "rating_no"),
+  measure.vars = c("positive", "negative", "hedge"),
+  variable.name = "term_type", value.name = "count")
+
+
+# correct for main effect of term type and rating
+model_term = lm(count ~ term_type + rating_no + term_type * rating_no, data = melted_term_data, na.action = na.exclude)
+melted_term_data$corrected_count = residuals(model_term)
+
+# LINEAR MODELS
+
+
+#terms and rating
+
+model_rating = lm(rating_no ~ negative + positive + hedge, data = data)
+summary(model_rating)
+
+#effect of (simplified) genre on rating
+
+model_rating_genre = lm(rating_no ~ book_simple_genre, data = data)
+summary(model_rating_genre)
+
+#interaction effect between simple genre and term frequency
+
+model_genre_term = lm(corrected_count ~ book_simple_genre * term_type, data = melted_term_data)
+summary(model_genre_term)
+
+# PLOTS
+
+# plot terms vs rating
+
+values_per_rating = function(rating, value, data) {
+  fdata = subset(data, rating_no == rating)
+  fdata[, value]
+}
+
+ratings = 1:5
+
+data_per_rating = data.frame(
+  rating = ratings,
+  hedge_mean = sapply(ratings, function(r) {mean(values_per_rating(r, "hedge", data))}),
+  pos_mean = sapply(ratings, function(r) {mean(values_per_rating(r, "positive", data))}),
+  neg_mean = sapply(ratings, function(r) {mean(values_per_rating(r, "negative", data))})
+)
+
+p = ggplot(data = data_per_rating) +
+  geom_line(aes(rating, neg_mean, color = "negative"), size=1) +
+  geom_line(aes(rating, pos_mean, color = "positive"), size=1) +
+  geom_line(aes(rating, hedge_mean, color = "hedge"), size=1) +
+  ylab("average frequency") +
+  labs(color = "term type") +
+  scale_colour_manual(values= c(
+    "positive" = "#00cc66",
+    "hedge" = "#3399ff",
+    "negative" = "#ff3333"
+  ))
+
+p
+
+# genre and term
+
+genres = levels(simple_genres)
+
+mean_per_genre_and_term = function(genre, term) {
+  genre_data = subset(melted_term_data, melted_term_data$book_simple_genre == genre & melted_term_data$term_type == term)
+  mean(genre_data$corrected_count, na.rm = TRUE)
+}
+
+
+results_per_term = function(term) {
+  values = sapply(genres, function(g) {mean_per_genre_and_term(g, term)})
+
+  values
+}
+
+results_per_genre = data.frame(
+  genre = genres,
+  positive = results_per_term("positive"),
+  negative = results_per_term("negative"),
+  hedge = results_per_term("hedge")
+)
+
+
+melted_results_per_genre = melt(results_per_genre, id=c("genre"))
+
+p = ggplot(data = melted_results_per_genre) +
+  geom_line(aes(x = variable, y = value, group = genre, color = genre), size = 1) +
+  theme(legend.position = "top") +
+  labs(y="frequency", x="term type")
+
+
+p
+
+# plot rating per genre
+
+
+
+rating_per_genre = function(genre) {
+  genre_data = subset(data, data$book_simple_genre == genre)
+  df = data.frame(table(genre_data$rating_no))
+  names(df) = c("rating", "frequency")
+  df$genre = rep(genre, nrow(df))
+
+  df
+}
+
+rbind(rating_per_genre(genres[1]), rating_per_genre(genres[2]))
+
diff --git a/preprocessing/__init__.py b/preprocessing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/preprocessing/tokenise_data.py b/preprocessing/tokenise_data.py
new file mode 100644
index 0000000..7574804
--- /dev/null
+++ b/preprocessing/tokenise_data.py
@@ -0,0 +1,42 @@
+import csv
+from preprocessing.tokeniser import Tokeniser
+from tqdm import tqdm
+
+REVIEWS_FILE = './data/goodreads.csv'
+
+with open(REVIEWS_FILE) as infile:
+    outpath = REVIEWS_FILE[:-4] + '_tokenised.csv'
+    with open(outpath, 'w') as outfile:
+        reader = csv.DictReader(infile)
+        fieldnames_in = reader.fieldnames
+
+        fieldnames_out = fieldnames_in + ['tokenised_text']
+        writer = csv.DictWriter(outfile, fieldnames_out)
+        writer.writeheader()
+
+        tokenisers = {}
+        available_languages = Tokeniser.available_languages()
+
+        for row in tqdm(reader):
+            #check language and initialise tokeniser if needed
+            language = row['language'].lower()
+            if language in tokenisers:
+                t = tokenisers[language]
+            elif language in available_languages:
+                t = Tokeniser(language)
+                tokenisers[language] = t
+            else:
+                t = None
+
+            #if there is a tokeniser...
+            if t:
+                #process the review
+                text = row['text']
+                tokens = t.process(text)
+                tokenised_text = ' '.join(tokens)
+                row['tokenised_text'] = tokenised_text
+            else:
+                row['tokenised_text'] = ''
+
+            #write
+            writer.writerow(row)
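
The script above writes its output next to the input file (`REVIEWS_FILE[:-4] + '_tokenised.csv'`, i.e. `./data/goodreads_tokenised.csv`). A quick spot check of the result could look like the following sketch; it is not part of the repository and assumes the script has already been run from the repository root.

    # Minimal spot check of the tokenised output (illustrative only).
    import csv

    with open('./data/goodreads_tokenised.csv') as f:
        for row in csv.DictReader(f):
            if row['tokenised_text']:
                print(row['language'], '->', row['tokenised_text'][:60])
                break
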
diff --git a/preprocessing/tokeniser.py b/preprocessing/tokeniser.py
new file mode 100644
index 0000000..a6fd752
--- /dev/null
+++ b/preprocessing/tokeniser.py
@@ -0,0 +1,53 @@
+import spacy
+
+class Tokeniser:
+    @staticmethod
+    def models():
+        models = {
+            "english" : "en_core_web_sm",
+            "dutch" : "nl_core_news_sm",
+            "french" : "fr_core_news_sm",
+            "german" : "de_core_news_sm",
+            "italian" : "it_core_news_sm",
+            "portuguese" : "pt_core_news_sm",
+            "spanish" : "es_core_news_sm"
+        }
+        return models
+
+    @staticmethod
+    def available_languages():
+        return set(Tokeniser.models().keys())
+
+    def __init__(self, language):
+        models = Tokeniser.models()
+        self.nlp = spacy.load(models[language])
+
+    def process(self, review: str, lemmatise = True, filter_stopwords = True, filter_ne = True):
+        doc = self.nlp(review)
+
+        # filter punctuation and digits
+        is_alpha = lambda token: token.is_alpha
+
+        # filter named entities
+        # make an exception for language and nationality names
+        accepted_ent_types = ['', 'LANGUAGE', 'NORP']
+        is_not_NE = lambda token: token.ent_type_ in accepted_ent_types
+
+        # filter stopwords
+        is_not_stopword = lambda token: not token.is_stop
+
+        # apply all filters
+        filters = [is_alpha]
+        if filter_ne:
+            filters.append(is_not_NE)
+        if filter_stopwords:
+            filters.append(is_not_stopword)
+        filtered_tokens = [token for token in doc if all(f(token) for f in filters)]
+
+        # convert tokens to lemmas or text
+        if lemmatise:
+            words = [token.lemma_.lower() for token in filtered_tokens]
+        else:
+            words = [token.text.lower() for token in filtered_tokens]
+
+        return words
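
A minimal usage sketch for the Tokeniser class above; the example sentence is made up, and the exact tokens returned depend on the installed spaCy model version.

    # Illustrative only: requires the relevant spaCy model to be installed,
    # e.g. via `python -m spacy download en_core_web_sm`.
    from preprocessing.tokeniser import Tokeniser

    t = Tokeniser('english')
    print(t.process("The translator captured the rhythm of the original."))
    # something like: ['translator', 'capture', 'rhythm', 'original']
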
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..6cc3e72
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,89 @@
+appnope==0.1.2
+backcall==0.1.0
+blis==0.7.4
+boto==2.49.0
+boto3==1.9.101
+botocore==1.12.101
+bz2file==0.98
+catalogue==2.0.1
+certifi==2018.11.29
+chardet==3.0.4
+click==7.1.2
+contextvars==2.4
+cymem==2.0.5
+dataclasses==0.8
+decorator==4.3.2
+docopt==0.6.2
+docutils==0.14
+elasticsearch==7.7.1
+en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl
+en-core-web-trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.0.0/en_core_web_trf-3.0.0-py3-none-any.whl
+et-xmlfile==1.0.1
+filelock==3.0.12
+ftfy==5.8
+gensim==3.7.1
+idna==2.8
+immutables==0.15
+importlib-metadata==3.3.0
+ipython==6.5.0
+ipython-genutils==0.2.0
+jedi==0.13.3
+Jinja2==2.11.3
+jmespath==0.9.4
+joblib==1.0.1
+MarkupSafe==1.1.1
+murmurhash==1.0.5
+nltk==3.4
+numpy==1.16.1
+openpyxl==3.0.7
+packaging==20.9
+pandas==1.1.5
+parso==0.3.4
+pathy==0.3.6
+pep517==0.10.0
+pexpect==4.6.0
+pickleshare==0.7.5
+pip-tools==6.0.1
+plac==1.1.3
+preshed==3.0.5
+prompt-toolkit==1.0.15
+ptyprocess==0.6.0
+pydantic==1.7.3
+Pygments==2.3.1
+pykwalify==1.7.0
+pyparsing==2.4.7
+python-dateutil==2.8.0
+pytz==2021.1
+regex==2020.11.13
+requests==2.21.0
+ruamel.yaml==0.15.100
+s3transfer==0.2.0
+sacremoses==0.0.43
+scikit-learn==0.24.1
+scipy==1.2.1
+simplegeneric==0.8.1
+singledispatch==3.4.0.3
+six==1.12.0
+sklearn==0.0
+smart-open==1.8.0
+spacy==3.0.1
+spacy-alignments==0.7.2
+spacy-legacy==3.0.1
+spacy-transformers==1.0.1
+srsly==2.4.0
+thinc==8.0.1
+threadpoolctl==2.1.0
+tokenizers==0.9.4
+toml==0.10.2
+torch==1.7.1
+torchcontrib==0.0.2
+tqdm==4.54.1
+traitlets==4.3.2
+transformers==4.2.2
+typer==0.3.2
+typing==3.6.6
+typing-extensions==3.7.4.3
+urllib3==1.24.1
+wasabi==0.8.2
+wcwidth==0.1.7
+zipp==3.4.0
\ No newline at end of file
diff --git a/sentiment/__init__.py b/sentiment/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sentiment/sentiment_analysis.py b/sentiment/sentiment_analysis.py
new file mode 100644
index 0000000..e0b952f
--- /dev/null
+++ b/sentiment/sentiment_analysis.py
@@ -0,0 +1,65 @@
+from transformers import pipeline
+import pandas as pd
+from os.path import isfile
+
+languages = ['English', 'Dutch', 'German', 'French', 'Italian', 'Spanish']
+selected_columns = ['id', 'language', 'rating_no', 'sentiment']
+out_csv = 'reviews_sentiment_text.csv'
+
+def analyze_sentiment(review_file):
+    reviews = pd.read_csv(review_file)
+    start_from = None
+    data = reviews[(reviews['text'].notna()) & (reviews['language'].isin(languages))].sample(10000, random_state=21)
+    # reset the index so that i below counts row positions; .sample() otherwise
+    # keeps the original (non-sequential) index, which breaks the resume logic
+    data = data.reset_index(drop=True)
+    data['sentiment'] = data.apply(lambda x: sentiment_classification(x['rating_no']), axis=1)
+    classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")
+    if isfile(out_csv):
+        done_records = pd.read_csv(out_csv)
+        start_from = len(done_records.index)
+    for i, row in data.iterrows():
+        # only write the header when starting a fresh output file
+        write_header = i == 0 and start_from is None
+        output = pd.DataFrame.from_dict({key: [row[key]] for key in selected_columns})
+        if start_from:
+            if i < start_from:
+                continue
+        try:
+            analysis = classifier(row['tokenised_text'][:512])
+            output['prediction'] = int(analysis[0]['label'][:1])
+        except Exception:
+            # classification can fail, e.g. on missing tokenised text
+            output['prediction'] = None
+
+        output.to_csv(out_csv, mode='a', header=write_header, index=False)
+
+def sentiment_classification(rating_no):
+    if rating_no >= 4:
+        sentiment = 'P'
+    elif rating_no <= 2:
+        sentiment = 'N'
+    else:
+        sentiment = '-'
+    return sentiment
+
+def calculate_accuracy(sentiment_file=out_csv):
+    data = pd.read_csv(sentiment_file)
+    data['pred_sentiment'] = data.apply(lambda x: sentiment_classification(x['prediction']), axis=1)
+    data['diff'] = abs(data['rating_no']-data['prediction'])
+    print('percentage exact label correct: ', len(data[data['rating_no']==data['prediction']])/len(data))
+    print('percentage sentiment label correct: ', len(data[data['sentiment']==data['pred_sentiment']])/len(data))
+    print('one off accuracy: ', len(data[data['diff']<2])/len(data))
+
+"""
+exact: 0.34
+correct category: 0.53
+one-off: 0.64
+"""
+
+"""
+Full text scores (sample of 10000 reviews)
+percentage exact label correct: 0.3316
+percentage sentiment label correct: 0.5308
+one off accuracy: 0.6458
+"""
\ No newline at end of file
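
The sentiment analysis script above is resumable: rerunning analyze_sentiment skips the rows already present in the output CSV. A hypothetical driver is sketched below; the review file path is an assumption, and any CSV with text, tokenised_text, language and rating_no columns should work.

    # Hypothetical usage (paths are assumptions, not fixed by the repository).
    from sentiment.sentiment_analysis import analyze_sentiment, calculate_accuracy

    analyze_sentiment('../data/goodreads_tokenised.csv')
    calculate_accuracy()  # prints exact, category-level, and one-off accuracy
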
diff --git a/sentiment/sentiment_classification.py b/sentiment/sentiment_classification.py
new file mode 100644
index 0000000..e8ea788
--- /dev/null
+++ b/sentiment/sentiment_classification.py
@@ -0,0 +1,39 @@
+import openpyxl
+import pandas as pd
+
+LANGUAGES = ['English', 'Dutch', 'French',
+             'German', 'Italian', 'Portuguese', 'Spanish']
+
+
+def create_dataframes(infile):
+    wb = openpyxl.load_workbook(filename=infile)
+    sheet_names = wb.sheetnames
+    for lang in LANGUAGES:
+        # find all sheets of the language, but don't use the non-annotated ones
+        lang_sheets = [s for s in sheet_names if s.startswith(
+            lang) and len(s) > len(lang)]
+        out_df = pd.DataFrame()
+        for i, key in enumerate(lang_sheets):
+            sheet = wb[key]
+            values = sheet.values
+            # consume the first row as the header before building the frame
+            header = next(values)
+            df = pd.DataFrame(values, columns=header).head(100)
+            if i == 0:
+                out_df['word'] = df['Word']
+            out_df[key] = df.apply(
+                lambda x: sentiment_classification(x['Category']), axis=1)
+        out_df.to_csv('{}_ratings.csv'.format(lang), index=False)
+
+
+def sentiment_classification(label):
+    if not label:
+        return None
+    elif label.lower().startswith('p'):
+        return 'P'
+    elif label.lower().startswith('n'):
+        return 'N'
+    elif label.lower().startswith('h') or label.lower().startswith('c'):
+        return 'H'
+    else:
+        return None
diff --git a/sentiment/sentiment_collocations.py b/sentiment/sentiment_collocations.py
new file mode 100644
index 0000000..d8d34aa
--- /dev/null
+++ b/sentiment/sentiment_collocations.py
@@ -0,0 +1,63 @@
+import pandas as pd
+from collections import Counter
+import re
+from os.path import isfile
+from os import remove
+
+LANGUAGES_PATTERNS = {
+    'dutch': r'^vertaa?l',
+    'english': r'^translat',
+    'french': r'^tradu',
+    'german': r'[uü]bersetz',
+    'italian': r'^tradu',
+    'portuguese': r'^tradu',
+    'spanish': r'^tradu',
+}
+
+LANGUAGES = ['English', 'Dutch', 'German', 'French', 'Spanish']
+SENTIMENTS_FILE = 'collocations_sentiments.csv'
+WINDOW_SIZE = 4
+INPUT_FACTORS = ['id', 'original_language', 'edition_language', 'book_title', 'language',
+                 'age_category', 'book_genre', 'rating_no', 'is_translated', 'mention_count']
+OUTPUT_FILE = 'reviews_PHN.csv'
+
+
+def create_lemma_valence_list():
+    output_list = []
+    for lang in LANGUAGES:
+        df = pd.read_csv('{}_ratings.csv'.format(lang), dtype='category')
+        for i, row in df.iterrows():
+            cats = Counter(row[1:]).most_common(1)
+            # if at least two annotators agree and the category is not NaN
+            if cats[0][1] >= 2 and isinstance(cats[0][0], str):
+                output_list.append(
+                    {'word': row['word'], 'language': lang, 'category': cats[0][0]})
+    output_df = pd.DataFrame(output_list)
+    output_df.to_csv(SENTIMENTS_FILE, index=False)
+
+
+def count_sentiments(reviews_file):
+    reviews = pd.read_csv(reviews_file)
+    sentiments = pd.read_csv(SENTIMENTS_FILE)
+    if isfile(OUTPUT_FILE):
+        # remove earlier file, since we write out in append mode
+        remove(OUTPUT_FILE)
+    write_header = True
+    for i, row in reviews.iterrows():
+        if row['mentions_translation'] and row['language'] in LANGUAGES:
+            words = row['tokenised_text'].split(" ")
+            pattern = LANGUAGES_PATTERNS[row['language'].lower()]
+            data = {factor: row[factor] for factor in INPUT_FACTORS}
+            data.update({'P': 0, 'H': 0, 'N': 0})
+            relevant_sentiments = sentiments[sentiments['language']
+                                             == row['language']]
+            for k, word in enumerate(words):
+                if re.search(pattern, word):
+                    relevant_words = [words[j] for j in range(
+                        k - WINDOW_SIZE, k + WINDOW_SIZE + 1) if 0 <= j < len(words)]
+                    for m, sen in relevant_sentiments.iterrows():
+                        if sen['word'] in relevant_words:
+                            data[sen['category']] += 1
+            output = pd.DataFrame(data, index=[i])
+            output.to_csv(OUTPUT_FILE, mode='a', header=write_header)
+            write_header = False
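
The two functions above form a small pipeline: the annotated ratings CSVs feed the agreed valence list, which is then applied to the collocation windows around translation lemmas. A hypothetical end-to-end invocation is sketched below; the reviews path is an assumption, and the file needs the mentions_translation, tokenised_text and INPUT_FACTORS columns.

    # Hypothetical driver (not part of the repository). The *_ratings.csv files
    # must be in the working directory, as in the module's relative paths.
    from sentiment.sentiment_collocations import create_lemma_valence_list, count_sentiments

    create_lemma_valence_list()                            # writes collocations_sentiments.csv
    count_sentiments('../data/goodreads_review_data.csv')  # writes reviews_PHN.csv
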