diff --git a/.gitignore b/.gitignore index eb62ac7..33d89ef 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,9 @@ /XML /CSV *.csv +/model/.R* +/embeddings/embeddings +/embeddings/.env +/embeddings/eval/questions-words.txt +/data/ +/sentiment/*.csv \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..5e320a5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 UU Centre for Digital Humanities - Research Software Lab + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 02ca6b4..e0f612d 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,30 @@ -# Reader-responses-to-translated-literature +# Reader responses to translated literature -This repo contains the work we do in the Research-IT project with Haidee Kotze and Gys-Walt van Egdom. 
+This repository contains code for the [DIOPTRA-L](https://cdh.uu.nl/portfolio/digital-opinions-on-translated-literature-dioptra-l-2/) project by Haidee Kotze, Gys-Walt van Egdom, Corina Koolen and Utrecht University's Research Software Lab, and can be used to reproduce the publication: +Kotze, Haidee & Janssen, Berit & Koolen, Corina & Plas, Luka & Egdom, Gys-Walt. (2021). Norms, affect and evaluation in the reception of literary translations in multilingual online reading communities: Deriving cognitive-evaluative templates from big data. Translation, Cognition & Behavior. 4. 10.1075/tcb.00060.kot. + +## Prerequisites +### Python +Most of the scripts require Python 3.6. To install dependencies, run +`pip install -r requirements.txt` + +### R +The statistical analysis and visualization were performed in R, using the following libraries: +- coin +- dplyr +- ggplot2 +- Hmisc +- irr +- lme4 +- reshape2 +- rstatix + +## Steps to reproduce +1. scrapers: Python scripts used to scrape reviews from Goodreads. Usage is documented in that folder's README. +2. preprocessing: Python scripts used to clean the data and, more specifically, to tokenize it. +3. embeddings: Python scripts for training and evaluating word embeddings using word2vec, plus a Jupyter notebook for visualising them. As the dataset is relatively small, the resulting embeddings were not informative for further research. +4. analysis: Python scripts to collect and count translation lemmas, based on human annotations. +5. collocations: Python scripts for finding collocations surrounding translation lemmas. +6. sentiment: Python scripts to count positive / negative and hedge terms in collocations. +7. model: R scripts used to generate statistics and visualizations of the data. -For now, it contains a GoodReads scraper, in the 'scrapers' folder. See the README over there if you want to do some scraping. 
diff --git a/analysis/__init__.py b/analysis/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/analysis/count_data.py b/analysis/count_data.py new file mode 100644 index 0000000..75df477 --- /dev/null +++ b/analysis/count_data.py @@ -0,0 +1,130 @@ +import csv +import re +import pandas as pd +from typing import List, Dict +from collections import defaultdict +from collocations.patterns import LANGUAGES_PATTERNS + +WORDS_PATH = './data/word_classes.csv' +INPUT_FACTORS = ['original_language', 'edition_language', + 'book_title', 'language', 'age_category', 'book_genre', 'rating_no'] +WINDOW_SIZE = 4 + + +def read_categories(words_path): + with open(words_path) as words_file:  # read the caller-supplied file, not the WORDS_PATH constant + reader = csv.DictReader(words_file) + word_categories = {language: defaultdict( + lambda: 'other') for language in LANGUAGES_PATTERNS} + + for row in reader: + language = row['language'] + word = row['word'] + cat = row['category'] + word_categories[language][word] = cat + + categories_generator = ( + value for lang_data in word_categories.values() for value in lang_data.values()) + categories = sorted(set(categories_generator) | {'other'})  # 'other' is the defaultdict fallback, so it must always have a counter + + return word_categories, categories + + +def output_count(words: List[str], word_categories: Dict, categories: List): + counts = {cat: 0 for cat in categories} + + for word in words: + cat = word_categories[word] + counts[cat] += 1 + + return counts + + +def count_data(reviews_path, language='english'): + '''Create a table with one row for each mention of "translation". 
Includes + some info about the review and the categories of words in the context window.''' + # import word categories + word_categories, categories = read_categories(WORDS_PATH) + + # import reviews + with open(reviews_path) as reviews_file: + reader = csv.DictReader(reviews_file) + + all_data = [] + + for row in reader: + text = row['tokenised_text'] + language = row['language'].lower() + if language in LANGUAGES_PATTERNS: + pattern = LANGUAGES_PATTERNS[language] + words = text.split() + input_data = {factor: row[factor] for factor in INPUT_FACTORS} + + for i, word in enumerate(words): + if re.search(pattern, word): + preceding = [words[j] + for j in range(i - WINDOW_SIZE, i) if j >= 0] + following = [words[j] for j in range( + i + 1, i + 1 + WINDOW_SIZE) if j < len(words)] + window = preceding + following + output_data = output_count( + window, word_categories[language], categories) + + data = {**input_data, **output_data} + all_data.append(data) + + df = pd.DataFrame(all_data, columns=INPUT_FACTORS + categories) + return df + + +def mentions_translation(text, language): + if language in LANGUAGES_PATTERNS: + pattern = LANGUAGES_PATTERNS[language] + words = text.split() + return str(int(any(re.search(pattern, word) for word in words))) + else: + return None + + +def count_translation_mentions(text, language): + if language in LANGUAGES_PATTERNS: + pattern = LANGUAGES_PATTERNS[language] + words = text.split() + return str(sum(1 for word in words if re.search(pattern, word))) + else: + return None + + +def count_data_per_review(reviews_path): + '''Create table with one row for each review. Similar to the + I-Analyzer output, but with some extra info. 
We also ignore some + columns like the full text.''' + + with open(reviews_path) as reviews_file: + reader = csv.DictReader(reviews_file) + + def review_data(row): + input_data = {factor: row[factor] for factor in INPUT_FACTORS} + is_translated = int(row['original_language'] + != row['edition_language']) + mentions = mentions_translation( + row['tokenised_text'], row['language'].lower()) + mention_count = count_translation_mentions( + row['tokenised_text'], row['language'].lower()) + words = len(row['text'].split()) + cleaned_words = len(row['tokenised_text'].split()) + data = { + **input_data, + 'is_translated': is_translated, + 'mentions_translation': mentions, + 'mention_count': mention_count, + 'words': words, + 'cleaned_words': cleaned_words + } + return data + + all_data = [review_data(row) for row in reader] + + df = pd.DataFrame(all_data, columns=INPUT_FACTORS + + ['is_translated', 'mentions_translation', 'mention_count', 'words', 'cleaned_words']) + return df diff --git a/analysis/write_review_table.py b/analysis/write_review_table.py new file mode 100644 index 0000000..4ed5b36 --- /dev/null +++ b/analysis/write_review_table.py @@ -0,0 +1,10 @@ +# write table with 1 row per review + +import pandas as pd +from analysis.count_data import count_data_per_review + +reviews_path = './data/goodreads_tokenised.csv' +export_path = './data/goodreads_review_data.csv' + +data = count_data_per_review(reviews_path) +data.to_csv(export_path, na_rep='NA') \ No newline at end of file diff --git a/analysis/write_table.py b/analysis/write_table.py new file mode 100644 index 0000000..4047430 --- /dev/null +++ b/analysis/write_table.py @@ -0,0 +1,10 @@ +# write table with 1 row per mention of translation + +import pandas as pd +from analysis.count_data import count_data + +reviews_path = './data/goodreads_tokenised.csv' +export_path = './data/goodreads_formatted.csv' + +data = count_data(reviews_path) +data.to_csv(export_path) \ No newline at end of file diff --git 
a/collocations/__init__.py b/collocations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/collocations/__main__.py b/collocations/__main__.py new file mode 100644 index 0000000..d356dce --- /dev/null +++ b/collocations/__main__.py @@ -0,0 +1,6 @@ +from collocations.collocations import collocate +from collocations.patterns import LANGUAGES_PATTERNS + +for language, pattern in LANGUAGES_PATTERNS.items(): + if pattern: + collocate(language, pattern) \ No newline at end of file diff --git a/collocations/collocations.py b/collocations/collocations.py new file mode 100644 index 0000000..da7cc2f --- /dev/null +++ b/collocations/collocations.py @@ -0,0 +1,70 @@ +import csv +import re +from math import log + +def collocate(language, target_pattern, reviews_path = './data/goodreads_tokenised.csv', out_name = 'collocations', out_dir = './data'): + + out_path = '{}/{}_{}.txt'.format(out_dir, out_name, language) + + # import reviews + + with open(reviews_path) as csvfile: + reader = csv.DictReader(csvfile) + reviews_text = (row["tokenised_text"] for row in reader if row["language"].lower() == language) + reviews = [review.split() for review in reviews_text] + + # frequencies + + vocab = set(word for review in reviews for word in review) + + def count_words(): + counts_general = {word: 0 for word in vocab} + counts_translat = {word: 0 for word in vocab} + window = 4 + + for review in reviews: + for i, word in enumerate(review): + counts_general[word] += 1 + + if re.search(target_pattern, word): + preceding = [review[j] for j in range(i - window, i) if j >= 0] + following = [review[j] for j in range(i + 1, i + 1 + window) if j < len(review)] + + for neighbour in preceding + following: + counts_translat[neighbour] += 1 + + return counts_translat, counts_general + + def filter_counts(target, general): + filtered_vocab = set(word for word, count in general.items() if count > 1) + filtered_target = {word: count for word, count in target.items() if word in 
filtered_vocab} + filtered_general = {word: count for word, count in general.items() if word in filtered_vocab} + + return filtered_target, filtered_general + + def relative_counts(counts): + total = sum(counts.values()) + return {word: (count / total if total else 0.0) for word, count in counts.items()}  # total is 0 when the pattern never matched + + def counts_log(counts): + return {word: log(count + 1) for word, count in counts.items()} + + def relative_frequencies(target, general): + target_log = counts_log(relative_counts(target)) + general_log = counts_log(relative_counts(general)) + + return {word: target_log[word] - general_log[word] for word in target} + + counts_translat, counts_general = filter_counts(*count_words()) + rel_freq = relative_frequencies(counts_translat, counts_general) + + def sort_by_frequency(counts): + return sorted(counts, key = lambda w : counts[w], reverse = True) + + ranking = sort_by_frequency(rel_freq) + + #export + with open(out_path, 'w') as outfile: + for word in ranking[:100]:  # slicing tolerates vocabularies smaller than 100 words + outfile.write(word) + outfile.write('\n') diff --git a/collocations/patterns.py b/collocations/patterns.py new file mode 100644 index 0000000..44bbcaa --- /dev/null +++ b/collocations/patterns.py @@ -0,0 +1,9 @@ +LANGUAGES_PATTERNS = { + 'dutch' : r'^vertaa?l', + 'english' : r'^translat', + 'french' : r'^tradu', + 'german' : r'[uü]bersetz', + 'italian' : r'^tradu', + 'portuguese' : r'^tradu', + 'spanish' : r'^tradu', +} \ No newline at end of file diff --git a/collocations/test_patterns.py b/collocations/test_patterns.py new file mode 100644 index 0000000..5511894 --- /dev/null +++ b/collocations/test_patterns.py @@ -0,0 +1,26 @@ +# print the set of words that match the pattern for each language + +import csv +import re +from collocations.patterns import LANGUAGES_PATTERNS + +reviews_path = './data/goodreads_tokenised.csv' + +matching_words = {language: set() for language in LANGUAGES_PATTERNS} + +with open(reviews_path) as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + language = 
row['language'].lower() + words = row['tokenised_text'].split() + + is_match = lambda word: re.search(LANGUAGES_PATTERNS[language], word) + matches = filter(is_match, words) + for match in matches: + matching_words[language].add(match) + +for language in matching_words: + print(language.upper()) + print() + print(matching_words[language]) + print() \ No newline at end of file diff --git a/embeddings/eval/__init__.py b/embeddings/eval/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/embeddings/eval/__main__.py b/embeddings/eval/__main__.py new file mode 100644 index 0000000..f23bbec --- /dev/null +++ b/embeddings/eval/__main__.py @@ -0,0 +1,25 @@ +from gensim import models + +EMBEDDINGS_PATH = "embeddings/with_pretraining" + +model = models.Word2Vec.load(EMBEDDINGS_PATH) + +#automatic evaluation +print('QUESTION ANSWER DATASET') +accuracy = model.accuracy("eval/questions-words.txt") + +for section in accuracy: + name = section['section'] + correct = len(section['correct']) + incorrect = len(section['incorrect']) + print('{}: {} out of {}'.format(name, correct, correct + incorrect))  # report hits against the section total, not against the misses + + +#manual inspection +print() +words = ['book', 'author', 'genre', 'boring', 'recommend', 'translation'] + +print('WORD SIMILARITES') +for word in words: + neighbours = [neighbour for neighbour, score in model.most_similar(word)] + print("'{}' is most similar to '{}', '{}', or '{}'".format(word, neighbours[0], neighbours[1], neighbours[2])) diff --git a/embeddings/readme.md b/embeddings/readme.md new file mode 100644 index 0000000..31b511e --- /dev/null +++ b/embeddings/readme.md @@ -0,0 +1,32 @@ +English embeddings for goodreads reviews. + +Setup for a python 3.6 environment: +```bash +pip install gensim==3.8 +pip install nltk +python -m train.download +``` +The train.download module downloads the Brown corpus for pretraining and a question-answer dataset for evaluation. The reviews have to be downloaded manually from I-analyzer. 
+ +Embeddings can be trained from the command line: +```bash +python -m train +``` +There are no command line options, so edit `train/__main__.py` to configure hyperparameters. + +To evaluate: +```bash +python -m eval +``` +Check that the code in `eval/__main__.py` uses the right embeddings. + + +For the visualisation of results: +Set up with +```bash +pip install jupyter +pip install matplotlib +pip install scikit-learn +``` + +Open `visualise/plot.ipynb` as a Jupyter notebook. \ No newline at end of file diff --git a/embeddings/train/__init__.py b/embeddings/train/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/embeddings/train/__main__.py b/embeddings/train/__main__.py new file mode 100644 index 0000000..1a78589 --- /dev/null +++ b/embeddings/train/__main__.py @@ -0,0 +1,43 @@ +import os +from train.corpus_reader import ReviewCorpus, BrownCorpus +from train.train_embeddings import train +from gensim import models + +CORPUS_PATH = "../data/reviews_english.csv" #corpus location +DST_DIR = "embeddings" #directory for embeddings +FILENAME = "with_pretraining" #name of embeddings file + +PRETRAIN = True #pretrain on Brown corpus +SPLIT_SENTS = True #split reviews into sentences +WINDOW = 5 #window size for words + +#initialise corpus +print("Initialising corpus...") +sentences = ReviewCorpus(CORPUS_PATH, split_sentences=SPLIT_SENTS) + +#train embeddings +if PRETRAIN: + pretrained_name = 'pretrained' + if WINDOW != 5: + pretrained_name += '_w{}'.format(WINDOW) + + if not pretrained_name in os.listdir(DST_DIR): + print('No pretrained embeddings found. 
Pretraining...') + pretrain_sentences = BrownCorpus() + pretrained = train(pretrain_sentences, window=WINDOW) + pretrained.save(os.path.join(DST_DIR, pretrained_name)) + else: + print('Found pretrained embeddings.') + pretrained = models.Word2Vec.load(os.path.join(DST_DIR, pretrained_name)) + + print('Training model...') + model = train(sentences, pretrained = pretrained) +else: + print('Training model...') + model = train(sentences, window=WINDOW) + +#export +out_path = os.path.join(DST_DIR, FILENAME) + +print("Saving model to '{}'...".format(out_path)) +model.save(out_path) \ No newline at end of file diff --git a/embeddings/train/corpus_reader.py b/embeddings/train/corpus_reader.py new file mode 100644 index 0000000..3a68e3d --- /dev/null +++ b/embeddings/train/corpus_reader.py @@ -0,0 +1,55 @@ +import csv +from gensim import utils +from nltk.tokenize import sent_tokenize, word_tokenize +from nltk.corpus import brown +import re + +def token(word): + if re.match(r'^(\d)+$', word):  # raw string: a backslash-d in a plain literal is an invalid escape sequence + return '' + if not re.search(r'\w', word):  # raw string, same reason + return '' + return word.lower() + +class ReviewCorpus(): + def __init__(self, path, split_sentences = True): + self.split_sents = split_sentences + with open(path) as csvfile: + reader = csv.DictReader(csvfile) + reviews = [row["text"] for row in reader] + + self.reviews = [self.preprocess(review) for review in reviews] + + def __len__(self): + if self.split_sents: + return sum(len(review) for review in self.reviews) + else: + return len(self.reviews) + + def __iter__(self): + for review in self.reviews: + if self.split_sents: + for sent in review: + yield sent + else: + yield review + + def preprocess(self, review): + def tokens(string): + return [token(word) for word in word_tokenize(string)] + + if self.split_sents: + sents = sent_tokenize(review) + return [tokens(sent) for sent in sents] + else: + return tokens(review) + +class BrownCorpus(): + def __init__(self): + pass + + def __iter__(self): + for sent in brown.sents(): + words = 
[token(word) for word in sent] + yield words + diff --git a/embeddings/train/download.py b/embeddings/train/download.py new file mode 100644 index 0000000..dbcd16d --- /dev/null +++ b/embeddings/train/download.py @@ -0,0 +1,14 @@ +#download brown corpus + +import nltk + +nltk.download('brown') + +#download question answer dataset + +import requests + +url = 'https://raw.githubusercontent.com/RaRe-Technologies/gensim/develop/docs/notebooks/datasets/questions-words.txt' +r = requests.get(url) +with open('eval/questions-words.txt', 'wb') as file: + file.write(r.content) \ No newline at end of file diff --git a/embeddings/train/train_embeddings.py b/embeddings/train/train_embeddings.py new file mode 100644 index 0000000..62d7b1d --- /dev/null +++ b/embeddings/train/train_embeddings.py @@ -0,0 +1,17 @@ +from gensim import models + +def train(sentences, pretrained = None, window = 5): + if pretrained: + #train from new sentences + pretrained.build_vocab(sentences, update=True) + pretrained.train( + sentences, + total_examples=len(sentences), epochs=pretrained.iter + ) + return pretrained + else: + model = models.Word2Vec( + sentences = sentences, + size=100, window=window + ) + return model \ No newline at end of file diff --git a/embeddings/visualise/plot.ipynb b/embeddings/visualise/plot.ipynb new file mode 100644 index 0000000..f1ab7b5 --- /dev/null +++ b/embeddings/visualise/plot.ipynb @@ -0,0 +1,156 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "Python 3.6.9 64-bit ('.env': venv)", + "display_name": "Python 3.6.9 64-bit ('.env': venv)", + "metadata": { + "interpreter": { + "hash": "a9fae5ed66804ddaf38242a1dfb5371fa275897ea3faaaa95a90d47a5a4c7d75" + } + } + } + }, + "nbformat": 4, + 
"nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from gensim import models\n", + "from sklearn.manifold import TSNE\n", + "\n", + "EMBEDDINGS_PATH = \"../embeddings/with_pretrained_per_review_w15\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "model = models.Word2Vec.load(EMBEDDINGS_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def neighbours(word, n=10):\n", + " return [w for w, score in model.wv.most_similar(word, topn=n)]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def reduce_dimensions(words, vectors):\n", + " tsne = TSNE(n_components = 2)\n", + " reduced = tsne.fit_transform(vectors)\n", + " x_values = [row[0] for row in reduced]\n", + " y_values = [row[1] for row in reduced]\n", + " return x_values, y_values\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "target ='translation'\n", + "nwords = 15\n", + "words = [target] + neighbours(target, nwords)\n", + "w2i = {word: index for index, word in enumerate(words)}\n", + "vectors = [model.wv[word] for word in words]\n", + "x_values, y_values = reduce_dimensions(words, vectors)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 6 + }, + { + "output_type": "display_data", + "data": { + "text/plain": "
", + "image/svg+xml": "\n\n\n\n \n \n \n \n 2020-11-11T10:42:47.240049\n image/svg+xml\n \n \n Matplotlib v3.3.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + } + } + ], + "source": [ + "for i, word in enumerate(words):\n", + " plt.annotate(word, (x_values[i], y_values[i]))\n", + "\n", + "plt.scatter(x_values, y_values)" + ] + 
}, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "POSITIVE: wonderful, fantastic, fun, brilliant, perfect, fascinating, amazing, nice, lovely, full\n\nNEGATIVE: repetitive, dull, predictable, poorly, unusual, strangely, sparse, abrupt, intriguing, anticlimactic\n" + ] + } + ], + "source": [ + "def most_similar(positive, negative, n=10):\n", + " return [w for w, score in model.wv.most_similar(positive=positive, negative=negative, topn=n)]\n", + "\n", + "pos_words = most_similar(['good', 'great'], ['bad'])\n", + "neg_words = most_similar(['boring', 'stilted'], ['good'])\n", + "\n", + "print('POSITIVE:', ', '.join(pos_words))\n", + "print()\n", + "print('NEGATIVE:', ', '.join(neg_words))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +} \ No newline at end of file diff --git a/model/review-level.R b/model/review-level.R new file mode 100644 index 0000000..145527b --- /dev/null +++ b/model/review-level.R @@ -0,0 +1,263 @@ +library(lme4) +library(ggplot2) +library(reshape2) +library(dplyr) +library(rstatix) # requires "coin" library, too +library(Hmisc) + +# import data + +input_path = "../data/goodreads_review_data.csv" +data = read.csv(input_path) + +#### test if translated books have different star rating than non-translated +describe(subset(data, is_translated==1)$rating_no) +# mean 3.786, poprortion 1 0.058 2 0.101 3 0.192 4 0.296 5 0.354 +describe(subset(data, is_translated==0)$rating_no) +# mean 3.85 proportion 1 0.059 2 0.091 3 0.175 4 0.293 5 0.383 + + +wilcox.test(subset(data, is_translated==1)$rating_no, + subset(data, is_translated==0)$rating_no, + alternative = "two.sided") +# significant difference + +data %>% wilcox_effsize(rating_no ~ is_translated) +# but small effect size + +# filters + +min_review_length = function(data, min_length = 1) { + subset(data, data$words >= 
min_length) +} + +only_translated = function(data) { + subset(data, data$is_translated) +} + +# counting functions + +count_translation_mentions = function(data, absolute = FALSE) { + values = subset(data$mentions_translation, !is.na(data$mentions_translation)) + if (absolute) { + return(sum(values)) + } + else { + return(sum(values)/length(values)) + } +} + +translation_frequency = function(data) { + all_data = subset(data, data$words >= 10) + mention_data = subset(all_data, as.logical(all_data$mentions_translation)) + total_mentions = sum(mention_data$mention_count) #to do: add column with count instead of bool + total_words = sum(all_data$words) + + total_mentions / total_words +} + +count_reviews = function(data) { + nrow(data) +} + +count_titles = function(data) { + length(unique(data$book_title)) +} + +#quick overview of how many reviews are from translated books and how many mention translation + +table(data[,c("mentions_translation", "is_translated")]) + +# full overview of data per language + +original_languages = unique(data$original_language[as.character(data$original_language) != ""]) +edition_languages = unique(data$edition_language[as.character(data$edition_language) != ""]) + +full_table = function(data) { + res = data.frame() + + for (og_lang in original_languages) { + for (ed_lang in edition_languages) { + subdata = subset(data, + data$original_language == og_lang & data$edition_language == ed_lang) + new_row = data.frame(original_language = og_lang, + edition_language = ed_lang, + is_translated = as.character(og_lang) != as.character(ed_lang), + n_titles = count_titles(subdata), + n_reviews = count_reviews(subdata), + n_mention_translation = count_translation_mentions(subdata, absolute=TRUE), + p_mention_translation = count_translation_mentions(subdata) + ) + res = rbind(res, new_row) + } + } + + return(res) +} + +oglang_table = function(data) { + res = data.frame() + + for (og_lang in original_languages) { + subdata = subset(data, + 
data$original_language == og_lang & as.character(data$edition_language) != as.character(data$original_language)) + new_row = data.frame(original_language = og_lang, + n_titles = count_titles(subdata), + n_reviews = count_reviews(subdata), + n_mention_translation = count_translation_mentions(subdata, absolute=TRUE), + p_mention_translation = count_translation_mentions(subdata) + ) + res = rbind(res, new_row) + } + + return(res) +} + +edlang_table = function(data) { + res = data.frame() + + for (ed_lang in edition_languages) { + subdata = subset(data, + data$edition_language == ed_lang & as.character(data$edition_language) != as.character(data$original_language)) + new_row = data.frame(edition_language = ed_lang, + n_titles = count_titles(subdata), + n_reviews = count_reviews(subdata), + n_mention_translation = count_translation_mentions(subdata, absolute=TRUE), + p_mention_translation = count_translation_mentions(subdata) + ) + res = rbind(res, new_row) + } + + return(res) +} + +editions <- edlang_table(data)  # pass the review data frame; `table` is the base R function, not a dataset + + +# rating vs mentioning of translation + + +ratings = 1:5 + +rating_data = rbind( + data.frame( + is_translated = rep("translated", length(ratings)), + rating = ratings, + n_reviews = sapply(ratings, + function(r) { + nrow(subset(data, data$rating_no == r & data$is_translated)) + }), + translation_freq = sapply(ratings, + function (r) { + translation_frequency(subset(data, data$rating_no == r & data$is_translated)) + }) + ), + data.frame( + is_translated = rep("not translated", length(ratings)), + rating = ratings, + n_reviews = sapply(ratings, + function(r) { + nrow(subset(data, data$rating_no == r & ! data$is_translated)) + }), + translation_freq = sapply(ratings, + function (r) { + translation_frequency(subset(data, data$rating_no == r & ! 
data$is_translated)) + }) + ) +) +rating_data + +#plot + +ggplot(data = rating_data) + + geom_line(aes(x = rating, y = translation_freq, color = is_translated), size = 1) + + ylim(c(0, NA)) + + labs(x = "rating", y = "frequency of 'translation'", color = "edition") + +#model + +mention_model = glm(mentions_translation ~ rating_no * is_translated, data, family = binomial) +summary(mention_model) + +rating_data_test <- data %>% filter(!is.na(mentions_translation), !is.na(rating_no), words>10) %>% + group_by(rating_no, is_translated) %>% + mutate(is_translated = ifelse(is_translated == 0, "not translated", "translated")) %>% + add_tally() %>% + summarise_at(.vars=c('n', 'mention_count'), .funs=c(mean="mean",sum="sum")) %>% + select(-c('n_sum')) %>% rename(n=n_mean) + +ggplot(data = rating_data_test) + + geom_line(aes(x = rating_no, y = mention_count_mean, color = is_translated), size = 1) + + ylim(c(0, NA)) + + labs(title="Ungrouped reviews", x = "Goodreads rating", y = "Average count of translation lemma per review", color = "Translated") + + +by_edition <- data %>% filter(!is.na(mentions_translation), !is.na(rating_no), words>10) %>% + group_by(edition_language, rating_no, is_translated) %>% + mutate(is_translated = ifelse(is_translated == 0, "not translated", "translated")) %>% + add_tally() %>% + summarise_at(.vars=c('n', 'mention_count'), .funs=c(mean="mean",sum="sum")) %>% + select(-c('n_sum')) %>% rename(n=n_mean) + +ggplot(data=by_edition) + + geom_line(aes(x=rating_no, y=mention_count_mean, color=is_translated), size=1) + + ylim(c(0, NA)) + + labs(title="Edition language", x = "Goodreads rating", y = "Average count of translation lemma per review", color = "Translated") + + facet_grid(edition_language ~ .) 
+ +by_original <- data %>% filter(!is.na(mentions_translation), !is.na(rating_no), words>10) %>% + group_by(original_language, rating_no, is_translated) %>% + mutate(is_translated = ifelse(is_translated == 0, "not translated", "translated")) %>% + add_tally() %>% + summarise_at(.vars=c('n', 'mention_count'), .funs=c(mean="mean",sum="sum")) %>% + select(-c('n_sum')) %>% rename(n=n_mean) + +ggplot(data=by_original) + + geom_line(aes(x=rating_no, y=mention_count_mean, color=is_translated), size=1) + + ylim(c(0, NA)) + + labs(title="Original language", x = "Goodreads rating", y = "Average count of translation lemma per review", color = "Translated") + + facet_grid(original_language ~ .) + +by_genre <- data %>% filter(!is.na(mentions_translation), !is.na(rating_no), !grepl('Non', book_genre)) %>% + mutate(book_genre = ifelse(grepl('Literary', book_genre), "Literary fiction", "Popular fiction")) %>% + group_by(book_genre, rating_no, is_translated) %>% + mutate(is_translated = ifelse(is_translated == 0, "not translated", "translated")) %>% + add_tally() %>% + summarise_at(.vars=c('n', 'mention_count'), .funs=c(mean="mean",sum="sum")) %>% + select(-c('n_sum')) %>% rename(n=n_mean) + +ggplot(data=by_genre) + + geom_line(aes(x=rating_no, y=mention_count_mean, color=is_translated), size=1) + + ylim(c(0, NA)) + + labs(title="Book genre", x = "Goodreads rating", y = "Average count of translation lemma per review", color = "Translated") + + facet_grid(book_genre ~ .) 
+ +levels(by_genre$is_translated) + +numbers <- data %>% filter(!is.na(mentions_translation), !is.na(is_translated)) %>% + group_by(mentions_translation, is_translated) %>% + tally() + +from_english <- data %>% filter(!is.na(mentions_translation), !is.na(rating_no), grepl('English', original_language)) %>% + group_by(rating_no, is_translated) %>% + mutate(is_translated = ifelse(is_translated == 0, "not translated", "translated")) %>% + add_tally() %>% + summarise_at(.vars=c('n', 'mention_count'), .funs=c(mean="mean",sum="sum")) %>% + select(-c('n_sum')) %>% rename(n=n_mean) + +ggplot(data=from_english) + + geom_line(aes(x=rating_no, y=mention_count_mean, color=is_translated), size=1) + + ylim(c(0, NA)) + + labs(title="Books originally published in English", x = "Goodreads rating", y = "Average count of translation lemma per review", color = "Translated") + +to_english <- data %>% filter(!is.na(mentions_translation), !is.na(rating_no), grepl('English', edition_language)) %>% + group_by(rating_no, is_translated) %>% + mutate(is_translated = ifelse(is_translated == 0, "not translated", "translated")) %>% + add_tally() %>% + summarise_at(.vars=c('n', 'mention_count'), .funs=c(mean="mean",sum="sum")) %>% + select(-c('n_sum')) %>% rename(n=n_mean) + +ggplot(data=to_english) + + geom_line(aes(x=rating_no, y=mention_count_mean, color=is_translated), size=1) + + ylim(c(0, NA)) + + labs(title="Books published in English", x = "Goodreads rating", y = "Average count of translation lemma per review", color = "Translated") diff --git a/model/sentiment_colllocations.R b/model/sentiment_colllocations.R new file mode 100644 index 0000000..b69b6b3 --- /dev/null +++ b/model/sentiment_colllocations.R @@ -0,0 +1,57 @@ +library(irr) +library(dplyr) +library(ggplot2) +library(reshape2) + +english <- read.csv("../sentiment/English_ratings.csv") +dutch <- read.csv("../sentiment/Dutch_ratings.csv") +french <- read.csv("../sentiment/French_ratings.csv") +german <- 
read.csv("../sentiment/German_ratings.csv") +portuguese <- read.csv('../sentiment/Portuguese_ratings.csv') +spanish <- read.csv('../sentiment/Spanish_ratings.csv') + +kappam.fleiss(english[,2:6], exact=TRUE) +kappam.fleiss(dutch[,2:3], exact=TRUE) +kappam.fleiss(french[,2:3], exact=TRUE) +kappam.fleiss(german[,2:3], exact=TRUE) +kappam.fleiss(portuguese[,2:3], exact=TRUE) +kappam.fleiss(spanish[,2:3], exact=TRUE) + +reviews_phn <- read.csv("../sentiment/reviews_PHN.csv") + + +by_genre <- reviews_phn %>% filter(!grepl('Non', book_genre)) %>% + mutate(book_genre = ifelse(grepl('Literary', book_genre), "Literary fiction", "Popular fiction")) %>% + group_by(book_genre, is_translated) %>% + mutate(is_translated = ifelse(is_translated == 0, "not translated", "translated")) %>% + summarise_at(.vars=vars(P, H, N), .funs=c(mean="mean",sum="sum")) + +meltedGenre <- melt(by_genre, id=c('book_genre', 'is_translated'), measure=c('P_mean', 'H_mean', 'N_mean')) + +ggplot(meltedGenre, aes(x=book_genre,y=value)) + + geom_col(aes(fill=variable), position = 'dodge') + + labs(title="Book genre", x = "Genre", y = "Average count of positive, negative and hedge terms per review", fill="Term type") + + facet_grid(is_translated ~ .) 
+ +directions <- reviews_phn %>% filter(is_translated==1) %>% + mutate(direction = ifelse( + edition_language=='English', 'nE>E', ifelse(original_language=='English', 'E>nE', 'nE>nE'))) %>% + group_by(direction) %>% + summarise_at(.vars=vars(P, H, N), .funs=c(mean="mean",sum="sum")) + +meltedDirections <- melt(directions, id=c('direction'), measure=c('P_mean', 'H_mean', 'N_mean')) + +ggplot(meltedDirections, aes(x=direction,y=value)) + + geom_col(aes(fill=variable), position = 'dodge') + + labs(title="Translation direction", x = "Direction", y = "Average count of positive, negative and hedge terms per review", color = "Translated") + +originals <- reviews_phn %>% filter(is_translated==0) %>% + mutate(edition_language = ifelse(original_language=='English', 'E', 'nE')) %>% + group_by(edition_language) %>% + summarise_at(.vars=vars(P, H, N), .funs=c(mean="mean",sum="sum")) + +meltedOriginals <- melt(originals, id=c('edition_language'), measure=c('P_mean', 'H_mean', 'N_mean')) + +ggplot(meltedOriginals, aes(x=edition_language,y=value)) + + geom_col(aes(fill=variable), position = 'dodge') + + labs(title="Originals", x = "Language", y = "Average count of positive, negative and hedge terms per review", color = "Translated") \ No newline at end of file diff --git a/model/stats.R b/model/stats.R new file mode 100644 index 0000000..5054f50 --- /dev/null +++ b/model/stats.R @@ -0,0 +1,23 @@ +library(dplyr) + +input_path = "../data/goodreads_review_data.csv" +data = read.csv(input_path) + +data_words <- data %>% filter(cleaned_words>0) + +orig <- data_words %>% group_by(original_language) +orig_reviews <- orig %>% tally() +orig_words <- orig %>% summarise_at(.vars=vars(words), .funs=c(sum="sum")) + + +edition <- data_words %>% group_by(edition_language) +edit_reviews <- edition %>% tally() +edit_words <- edition %>% summarise_at(.vars=vars(words), .funs=c(sum="sum")) + +review <- data_words %>% group_by(language) +review_reviews <- review %>% tally() +review_words <- review %>% summarise_at(.vars=vars(words), 
.funs=c(sum="sum")) + +genre <- data_words %>% group_by(book_genre) +genre_reviews <- genre %>% tally() +genre_words <- genre %>% summarise_at(.vars=vars(words), .funs=c(sum="sum")) diff --git a/model/term_level.R b/model/term_level.R new file mode 100644 index 0000000..6969c4b --- /dev/null +++ b/model/term_level.R @@ -0,0 +1,141 @@ +library(lme4) +library(ggplot2) +library(reshape2) + +# import data + +input_path = "../data/goodreads_formatted.csv" +data = read.csv(input_path) + +# PREPROCESSING + +#simplified genre definition (just literary fiction and popular fiction) + +simple_genre = function(genre) { + if (genre == "Literary fiction") { + return(genre) + } + + if (startsWith(genre, "Popular fiction")) { + return("Popular fiction") + } + + return (NA) +} + + +simple_genres = as.factor(sapply(as.character(data[, "book_genre"]), simple_genre)) +data$book_simple_genre = simple_genres + + +#format with melted term types - useful for testing effect on which terms are used more + +melted_term_data = melt(data, + id.vars = c("original_language", "edition_language", "language", "age_category", "book_genre", "book_simple_genre", "rating_no"), + measure.vars = c("positive", "negative", "hedge"), + variable.name = "term_type", value.name = "count") + + +# correct for main effect of term type and rating +model_term = lm(count ~ term_type + rating_no + term_type * rating_no, data = melted_term_data, na.action = na.exclude) +melted_term_data$corrected_count = residuals(model_term) + +# LINEAR MODELS + + +#terms and rating + +model_rating = lm(rating_no ~ negative + positive + hedge, data = data) +summary(model_rating) + +#effect of (simplified) genre on rating + +model_rating_genre = lm(rating_no ~ book_simple_genre, data = data) +summary(model_rating_genre) + +#interaction effect between simple genre and term frequency + +model_genre_term = lm(corrected_count ~ book_simple_genre * term_type, data = melted_term_data) +summary(model_genre_term) + +# PLOTS + +# plot terms vs 
rating + +values_per_rating = function(rating, value, data) { + fdata = subset(data, rating_no == rating) + fdata[, value] +} + +ratings = 1:5 + +data_per_rating = data.frame( + rating = ratings, + hedge_mean = sapply(ratings, function(r) {mean(values_per_rating(r, "hedge", data))}), + pos_mean = sapply(ratings, function(r) {mean(values_per_rating(r, "positive", data))}), + neg_mean = sapply(ratings, function(r) {mean(values_per_rating(r, "negative", data))}) +) + +p = ggplot(data = data_per_rating) + + geom_line(aes(rating, neg_mean, color = "negative"), size=1) + + geom_line(aes(rating, pos_mean, color = "positive"), size=1) + + geom_line(aes(rating, hedge_mean, color = "hedge"), size=1) + + ylab("average frequency") + + labs(color = "term type") + + scale_colour_manual(values= c( + "positive" = "#00cc66", + "hedge" = "#3399ff", + "negative" = "#ff3333" + )) + +p + +# genre and term + +genres = levels(simple_genres) + +mean_per_genre_and_term = function(genre, term) { + genre_data = subset(melted_term_data, melted_term_data$book_simple_genre == genre & melted_term_data$term_type == term) + mean(genre_data$corrected_count, na.rm = TRUE) +} + + +results_per_term = function(term) { + values = sapply(genres, function(g) {mean_per_genre_and_term(g, term)}) + + values +} + +results_per_genre = data.frame( + genre = genres, + positive = results_per_term("positive"), + negative = results_per_term("negative"), + hedge = results_per_term("hedge") +) + + +melted_results_per_genre = melt(results_per_genre, id=c("genre")) + +p = ggplot(data = melted_results_per_genre) + + geom_line(aes(x = variable, y = value, group = genre, color = genre), size =1) + + theme(legend.position ="top") + + labs(y="frequency", x="term type") + + +p + +# plot rating per genre + + + +rating_per_genre = function(genre) { + genre_data = subset(data, data$book_simple_genre == genre) + df = data.frame(table(genre_data$rating_no)) + names(df) = c("rating", "frequency") + df$genre = rep(genre, nrow(df)) 
+ + df +} + +rbind(rating_per_genre(genres[1]), rating_per_genre(genres[2])) + diff --git a/preprocessing/__init__.py b/preprocessing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/preprocessing/tokenise_data.py b/preprocessing/tokenise_data.py new file mode 100644 index 0000000..7574804 --- /dev/null +++ b/preprocessing/tokenise_data.py @@ -0,0 +1,42 @@ +import csv +from preprocessing.tokeniser import Tokeniser +from tqdm import tqdm + +REVIEWS_FILE = './data/goodreads.csv' + +with open(REVIEWS_FILE) as infile: + outpath = REVIEWS_FILE[:-4] + '_tokenised.csv' + with open(outpath, 'w') as outfile: + reader = csv.DictReader(infile) + fieldnames_in = reader.fieldnames + + fieldnames_out = fieldnames_in + ['tokenised_text'] + writer = csv.DictWriter(outfile, fieldnames_out) + writer.writeheader() + + tokenisers = {} + available_languages = Tokeniser.available_languages() + + for row in tqdm(reader): + #check language and initialise tokeniser if needed + language = row['language'].lower() + if language in tokenisers: + t = tokenisers[language] + elif language in available_languages: + t = Tokeniser(language) + tokenisers[language] = t + else: + t = None + + #if there is a tokeniser... 
+ if t: + #process the review + text = row['text'] + tokens = t.process(text) + tokenised_text = ' '.join(tokens) + row['tokenised_text'] = tokenised_text + else: + row['tokenised_text'] = '' + + #write + writer.writerow(row) diff --git a/preprocessing/tokeniser.py b/preprocessing/tokeniser.py new file mode 100644 index 0000000..a6fd752 --- /dev/null +++ b/preprocessing/tokeniser.py @@ -0,0 +1,52 @@ +import spacy + +class Tokeniser: + def models(): + models = { + "english" : "en_core_web_sm", + "dutch" : "nl_core_news_sm", + "french" : "fr_core_news_sm", + "german" : "de_core_news_sm", + "italian" : "it_core_news_sm", + "portuguese" : "pt_core_news_sm", + "spanish" : "es_core_news_sm" + } + return models + + def available_languages(): + return set(Tokeniser.models().keys()) + + def __init__(self, language): + models = Tokeniser.models() + self.nlp = spacy.load(models[language]) + + def process(self, review: str, lemmatise = True, filter_stopwords = True, filter_ne = True): + doc = self.nlp(review) + + # filter punctuation and digits + is_alpha = lambda token : token.is_alpha + filtered_tokens = [token for token in doc if token.is_alpha] + + # filter named entities + # make an exception for language and nationality names + accepted_ent_types = ['', 'LANGUAGE', 'NORP'] + is_not_NE = lambda token: token.ent_type_ in accepted_ent_types + + # filter stopwords + is_not_stopword = lambda token: token.is_stop == False + + # apply all filters + filters = [is_alpha] + if filter_ne: + filters.append(is_not_NE) + if filter_stopwords: + filters.append(is_not_stopword) + filtered_tokens = [token for token in doc if all(f(token) for f in filters)] + + # convert tokens to lemmas or text + if lemmatise: + words = [token.lemma_.lower() for token in filtered_tokens] + else: + words = [token.text.lower() for token in filtered_tokens] + + return words diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6cc3e72 --- /dev/null +++ b/requirements.txt @@ 
-0,0 +1,89 @@ +appnope==0.1.2 +backcall==0.1.0 +blis==0.7.4 +boto==2.49.0 +boto3==1.9.101 +botocore==1.12.101 +bz2file==0.98 +catalogue==2.0.1 +certifi==2018.11.29 +chardet==3.0.4 +click==7.1.2 +contextvars==2.4 +cymem==2.0.5 +dataclasses==0.8 +decorator==4.3.2 +docopt==0.6.2 +docutils==0.14 +elasticsearch==7.7.1 +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl +en-core-web-trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.0.0/en_core_web_trf-3.0.0-py3-none-any.whl +et-xmlfile==1.0.1 +filelock==3.0.12 +ftfy==5.8 +gensim==3.7.1 +idna==2.8 +immutables==0.15 +importlib-metadata==3.3.0 +ipython==6.5.0 +ipython-genutils==0.2.0 +jedi==0.13.3 +Jinja2==2.11.3 +jmespath==0.9.4 +joblib==1.0.1 +MarkupSafe==1.1.1 +murmurhash==1.0.5 +nltk==3.4 +numpy==1.16.1 +openpyxl==3.0.7 +packaging==20.9 +pandas==1.1.5 +parso==0.3.4 +pathy==0.3.6 +pep517==0.10.0 +pexpect==4.6.0 +pickleshare==0.7.5 +pip-tools==6.0.1 +plac==1.1.3 +preshed==3.0.5 +prompt-toolkit==1.0.15 +ptyprocess==0.6.0 +pydantic==1.7.3 +Pygments==2.3.1 +pykwalify==1.7.0 +pyparsing==2.4.7 +python-dateutil==2.8.0 +pytz==2021.1 +regex==2020.11.13 +requests==2.21.0 +ruamel.yaml==0.15.100 +s3transfer==0.2.0 +sacremoses==0.0.43 +scikit-learn==0.24.1 +scipy==1.2.1 +simplegeneric==0.8.1 +singledispatch==3.4.0.3 +six==1.12.0 +sklearn==0.0 +smart-open==1.8.0 +spacy==3.0.1 +spacy-alignments==0.7.2 +spacy-legacy==3.0.1 +spacy-transformers==1.0.1 +srsly==2.4.0 +thinc==8.0.1 +threadpoolctl==2.1.0 +tokenizers==0.9.4 +toml==0.10.2 +torch==1.7.1 +torchcontrib==0.0.2 +tqdm==4.54.1 +traitlets==4.3.2 +transformers==4.2.2 +typer==0.3.2 +typing==3.6.6 +typing-extensions==3.7.4.3 +urllib3==1.24.1 +wasabi==0.8.2 +wcwidth==0.1.7 +zipp==3.4.0 \ No newline at end of file diff --git a/sentiment/__init__.py b/sentiment/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sentiment/sentiment_analysis.py 
b/sentiment/sentiment_analysis.py new file mode 100644 index 0000000..e0b952f --- /dev/null +++ b/sentiment/sentiment_analysis.py @@ -0,0 +1,60 @@ +from transformers import pipeline +import pandas as pd +from os.path import isfile + +languages = ['English', 'Dutch', 'German', 'French', 'Italian', 'Spanish'] +selected_columns = ['id', 'language', 'rating_no', 'sentiment'] +out_csv = 'reviews_sentiment_text.csv' + +def analyze_sentiment(review_file): + reviews = pd.read_csv(review_file) + start_from = None + data = reviews[(reviews['text'].notna()) & (reviews['language'].isin(languages))].sample(10000, random_state=21) + data['sentiment'] = data.apply(lambda x: sentiment_classification(x['rating_no']), axis=1) + classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment") + if isfile(out_csv): + done_records = pd.read_csv(out_csv) + start_from = len(done_records.index) + for i, row in data.iterrows(): + write_header = True if i==0 else False + output = pd.DataFrame.from_dict({key: [row[key]] for key in selected_columns}) + if start_from: + if i < start_from: + continue + try: + analysis = classifier(row['tokenised_text'][:512]) + output['prediction'] = int(analysis[0]['label'][:1]) + except: + output['prediction'] = None + + output.to_csv(out_csv, mode='a', header=write_header, index=False) + +def sentiment_classification(rating_no): + if rating_no >= 4: + sentiment = 'P' + elif rating_no <= 2: + sentiment = 'N' + else: + sentiment = '-' + return sentiment + +def calculate_accuracy(sentiment_file=out_csv): + data = pd.read_csv(sentiment_file) + data['pred_sentiment'] = data.apply(lambda x: sentiment_classification(x['prediction']), axis=1) + data['diff'] = abs(data['rating_no']-data['prediction']) + print('percentage exact label correct: ', len(data[data['rating_no']==data['prediction']])/len(data)) + print('percentage sentiment label correct: ', len(data[data['sentiment']==data['pred_sentiment']])/len(data)) + print('one off 
accuracy: ', len(data[data['diff']<2])/len(data)) + +""" +exact: 0.34 +correct category: 0.53 +one-off: 0.64 +""" + +""" +Full text scores (sample of 10000 reviews) +percentage exact label correct: 0.3316 +percentage sentiment label correct: 0.5308 +one off accuracy: 0.6458 +""" \ No newline at end of file diff --git a/sentiment/sentiment_classification.py b/sentiment/sentiment_classification.py new file mode 100644 index 0000000..e8ea788 --- /dev/null +++ b/sentiment/sentiment_classification.py @@ -0,0 +1,37 @@ +import openpyxl +import pandas as pd + +LANGUAGES = ['English', 'Dutch', 'French', + 'German', 'Italian', 'Portuguese', 'Spanish'] + + +def create_dataframes(infile): + wb = openpyxl.load_workbook(filename=infile) + sheet_names = wb.sheetnames + for lang in LANGUAGES: + # find all sheets of the language, but don't use the non-annotated ones + lang_sheets = [s for s in sheet_names if s.startswith( + lang) and len(s) > len(lang)] + out_df = pd.DataFrame() + for i, key in enumerate(lang_sheets): + sheet = wb[key] + values = sheet.values + df = pd.DataFrame(values, columns=next(values)).head(100) + if i == 0: + out_df['word'] = df['Word'] + out_df[key] = df.apply( + lambda x: sentiment_classification(x['Category']), axis=1) + out_df.to_csv('{}_ratings.csv'.format(lang), index=False) + + +def sentiment_classification(label): + if not label: + return None + elif label.lower().startswith('p'): + return 'P' + elif label.lower().startswith('n'): + return 'N' + elif label.lower().startswith('h') or label.lower().startswith('c'): + return 'H' + else: + return None diff --git a/sentiment/sentiment_collocations.py b/sentiment/sentiment_collocations.py new file mode 100644 index 0000000..d8d34aa --- /dev/null +++ b/sentiment/sentiment_collocations.py @@ -0,0 +1,63 @@ +import pandas as pd +from collections import Counter +import re +from os.path import isfile +from os import remove + +LANGUAGES_PATTERNS = { + 'dutch': r'^vertaa?l', + 'english': r'^translat', + 'french': 
r'^tradu', + 'german': r'[uü]bersetz', + 'italian': r'^tradu', + 'portuguese': r'^tradu', + 'spanish': r'^tradu', +} + +LANGUAGES = ['English', 'Dutch', 'German', 'French', 'Spanish'] +SENTIMENTS_FILE = 'collocations_sentiments.csv' +WINDOW_SIZE = 4 +INPUT_FACTORS = ['id', 'original_language', 'edition_language', 'book_title', 'language', + 'age_category', 'book_genre', 'rating_no', 'is_translated', 'mention_count'] +OUTPUT_FILE = 'reviews_PHN.csv' + + +def create_lemma_valence_list(): + output_list = [] + for lang in LANGUAGES: + df = pd.read_csv('{}_ratings.csv'.format(lang), dtype='category') + for i, row in df.iterrows(): + cats = Counter(row[1:]).most_common(1) + # if at least two annotators agree and the category is not NaN + if cats[0][1] >= 2 and isinstance(cats[0][0], str): + output_list.append( + {'word': row['word'], 'language': lang, 'category': cats[0][0]}) + output_df = pd.DataFrame(output_list) + output_df.to_csv(SENTIMENTS_FILE, index=False) + + +def count_sentiments(reviews_file): + reviews = pd.read_csv(reviews_file) + sentiments = pd.read_csv(SENTIMENTS_FILE) + if isfile(OUTPUT_FILE): + # remove earlier file, since we write out in append mode + remove(OUTPUT_FILE) + write_header = True + for i, row in reviews.iterrows(): + if row['mentions_translation'] and row['language'] in LANGUAGES: + words = row['tokenised_text'].split(" ") + pattern = LANGUAGES_PATTERNS[row['language'].lower()] + data = {factor: row[factor] for factor in INPUT_FACTORS} + data.update({'P': 0, 'H': 0, 'N': 0}) + relevant_sentiments = sentiments[sentiments['language'] + == row['language']] + for k, word in enumerate(words): + if re.search(pattern, word): + relevant_words = [words[j] for j in range( + k - WINDOW_SIZE, k + WINDOW_SIZE + 1) if 0 <= j < len(words)] + for m, sen in relevant_sentiments.iterrows(): + if sen['word'] in relevant_words: + data[sen['category']] += 1 + output = pd.DataFrame(data, index=[i]) + output.to_csv(OUTPUT_FILE, mode='a', header=write_header) + 
write_header = False