Merge pull request #21 from UUDigitalHumanitieslab/develop
Update master branch
BeritJanssen authored Mar 25, 2024
2 parents 3c0f32f + 8120290 commit a81b7c3
Showing 33 changed files with 1,485 additions and 3 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -7,3 +7,9 @@
/XML
/CSV
*.csv
/model/.R*
/embeddings/embeddings
/embeddings/.env
/embeddings/eval/questions-words.txt
/data/
/sentiment/*.csv
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 UU Centre for Digital Humanities - Research Software Lab

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
31 changes: 28 additions & 3 deletions README.md
@@ -1,5 +1,30 @@
# Reader-responses-to-translated-literature
# Reader responses to translated literature

This repo contains the work we do in the Research-IT project with Haidee Kotze and Gys-Walt van Egdom.
This repository contains code for the [DIOPTRA-L](https://cdh.uu.nl/portfolio/digital-opinions-on-translated-literature-dioptra-l-2/) project by Haidee Kotze, Gys-Walt van Egdom, Corina Koolen and Utrecht University's Research Software Lab. It can be used to reproduce the following publication:
Kotze, Haidee; Janssen, Berit; Koolen, Corina; Plas, Luka & Egdom, Gys-Walt (2021). Norms, affect and evaluation in the reception of literary translations in multilingual online reading communities: Deriving cognitive-evaluative templates from big data. *Translation, Cognition & Behavior*, 4. https://doi.org/10.1075/tcb.00060.kot

## Prerequisites
### Python
Most of the scripts require Python 3.6. To install dependencies, run
`pip install -r requirements.txt`

### R
The statistical analysis and visualization were performed in R, using the following libraries:
- coin
- dplyr
- ggplot2
- Hmisc
- irr
- lme4
- reshape2
- rstatix

## Steps to reproduce
1. scrapers: Python scripts used to scrape reviews from Goodreads. Documentation on usage is in that folder's README.
2. preprocessing: Python scripts used to clean the data; more specifically, tokenization.
3. embeddings: Jupyter notebooks for training and evaluating word embeddings using word2vec. As the dataset is relatively small, the resulting embeddings were not informative for further research.
4. analysis: Python scripts to collect and count translation lemmas, based on human annotations.
5. collocations: Python scripts for finding collocations surrounding translation lemmas.
6. sentiment: Python scripts to count positive/negative and hedge terms in collocations.
7. model: R scripts used to generate statistics and visualizations of the data.

For now, it contains a GoodReads scraper, in the 'scrapers' folder. See the README over there if you want to do some scraping.
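The numbered steps above correspond to modules added in this commit. As a minimal, hedged sketch of how the tabular exports might be produced (assuming the tokenised reviews have been downloaded to `./data/goodreads_tokenised.csv`, the default path used by the scripts in `analysis/`, and that the code is run from the repository root):

```python
# Hypothetical driver, mirroring analysis/write_table.py and
# analysis/write_review_table.py; paths are the defaults used in this commit.
from analysis.count_data import count_data, count_data_per_review

reviews_path = './data/goodreads_tokenised.csv'  # tokenised reviews, downloaded manually

# one row per mention of "translation": input factors plus word-category counts
count_data(reviews_path).to_csv('./data/goodreads_formatted.csv')

# one row per review: translation status, mention counts, word counts
count_data_per_review(reviews_path).to_csv('./data/goodreads_review_data.csv', na_rep='NA')
```

The resulting CSV files would then serve as input for the R scripts in `model/`.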
Empty file added analysis/__init__.py
Empty file.
130 changes: 130 additions & 0 deletions analysis/count_data.py
@@ -0,0 +1,130 @@
import csv
import re
import pandas as pd
from typing import List, Dict
from collections import defaultdict
from collocations.patterns import LANGUAGES_PATTERNS

WORDS_PATH = './data/word_classes.csv'
INPUT_FACTORS = ['original_language', 'edition_language',
                 'book_title', 'language', 'age_category', 'book_genre', 'rating_no']
WINDOW_SIZE = 4


def read_categories(words_path):
    with open(words_path) as words_file:
        reader = csv.DictReader(words_file)
        # words not listed in the CSV fall into the 'other' category
        word_categories = {language: defaultdict(
            lambda: 'other') for language in LANGUAGES_PATTERNS}

        for row in reader:
            language = row['language']
            word = row['word']
            cat = row['category']
            word_categories[language][word] = cat

    categories_generator = (
        value for lang_data in word_categories.values() for value in lang_data.values())
    categories = list(sorted(set(categories_generator)))

    return word_categories, categories


def output_count(words: List[str], word_categories: Dict, categories: List):
    counts = {cat: 0 for cat in categories}

    for word in words:
        cat = word_categories[word]
        counts[cat] += 1

    return counts


def count_data(reviews_path, language='english'):
    '''Create a table with one row for each mention of "translation". Includes
    some info about the review and the categories of words in the context window.'''
    # import word categories
    word_categories, categories = read_categories(WORDS_PATH)

    # import reviews
    with open(reviews_path) as reviews_file:
        reader = csv.DictReader(reviews_file)

        all_data = []

        for row in reader:
            text = row['tokenised_text']
            language = row['language'].lower()
            if language in LANGUAGES_PATTERNS:
                pattern = LANGUAGES_PATTERNS[language]
                words = text.split()
                input_data = {factor: row[factor] for factor in INPUT_FACTORS}

                for i, word in enumerate(words):
                    if re.search(pattern, word):
                        preceding = [words[j]
                                     for j in range(i - WINDOW_SIZE, i) if j >= 0]
                        following = [words[j] for j in range(
                            i + 1, i + 1 + WINDOW_SIZE) if j < len(words)]
                        window = preceding + following
                        output_data = output_count(
                            window, word_categories[language], categories)

                        data = {**input_data, **output_data}
                        all_data.append(data)

    df = pd.DataFrame(all_data, columns=INPUT_FACTORS + categories)
    return df


def mentions_translation(text, language):
    if language in LANGUAGES_PATTERNS:
        pattern = LANGUAGES_PATTERNS[language]
        words = text.split()
        return str(int(any(re.search(pattern, word) for word in words)))
    else:
        return None


def count_translation_mentions(text, language):
    if language in LANGUAGES_PATTERNS:
        pattern = LANGUAGES_PATTERNS[language]
        words = text.split()
        return str(sum(1 for word in words if re.search(pattern, word)))
    else:
        return None


def count_data_per_review(reviews_path):
    '''Create table with one row for each review. Similar to the
    I-Analyzer output, but with some extra info. We also ignore some
    columns like the full text.'''

    with open(reviews_path) as reviews_file:
        reader = csv.DictReader(reviews_file)

        def review_data(row):
            input_data = {factor: row[factor] for factor in INPUT_FACTORS}
            is_translated = int(row['original_language']
                                != row['edition_language'])
            mentions = mentions_translation(
                row['tokenised_text'], row['language'].lower())
            mention_count = count_translation_mentions(
                row['tokenised_text'], row['language'].lower())
            words = len(row['text'].split())
            cleaned_words = len(row['tokenised_text'].split())
            data = {
                **input_data,
                'is_translated': is_translated,
                'mentions_translation': mentions,
                'mention_count': mention_count,
                'words': words,
                'cleaned_words': cleaned_words
            }
            return data

        all_data = [review_data(row) for row in reader]

    df = pd.DataFrame(all_data, columns=INPUT_FACTORS +
                      ['is_translated', 'mentions_translation', 'mention_count', 'words', 'cleaned_words'])
    return df
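To make the windowing in `count_data` concrete, here is a small self-contained sketch with a toy review and invented word categories (not project data):

```python
# Toy illustration of the 4-word context window and category counting in count_data.
import re
from collections import defaultdict

pattern = r'^translat'   # the English pattern from collocations/patterns.py
window_size = 4          # same as WINDOW_SIZE above

words = 'the translation reads smoothly but loses some humour'.split()
word_categories = defaultdict(lambda: 'other',
                              {'smoothly': 'positive', 'loses': 'negative'})  # invented

for i, word in enumerate(words):
    if re.search(pattern, word):
        preceding = [words[j] for j in range(i - window_size, i) if j >= 0]
        following = [words[j] for j in range(i + 1, i + 1 + window_size) if j < len(words)]
        counts = defaultdict(int)
        for neighbour in preceding + following:
            counts[word_categories[neighbour]] += 1
        print(preceding + following, dict(counts))
        # ['the', 'reads', 'smoothly', 'but', 'loses'] {'other': 3, 'positive': 1, 'negative': 1}
```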
10 changes: 10 additions & 0 deletions analysis/write_review_table.py
@@ -0,0 +1,10 @@
# write table with 1 row per review

import pandas as pd
from analysis.count_data import count_data_per_review

reviews_path = './data/goodreads_tokenised.csv'
export_path = './data/goodreads_review_data.csv'

data = count_data_per_review(reviews_path)
data.to_csv(export_path, na_rep='NA')
10 changes: 10 additions & 0 deletions analysis/write_table.py
@@ -0,0 +1,10 @@
# write table with 1 row per mention of translation

import pandas as pd
from analysis.count_data import count_data

reviews_path = './data/goodreads_tokenised.csv'
export_path = './data/goodreads_formatted.csv'

data = count_data(reviews_path)
data.to_csv(export_path)
Empty file added collocations/__init__.py
Empty file.
6 changes: 6 additions & 0 deletions collocations/__main__.py
@@ -0,0 +1,6 @@
from collocations.collocations import collocate
from collocations.patterns import LANGUAGES_PATTERNS

for language, pattern in LANGUAGES_PATTERNS.items():
    if pattern:
        collocate(language, pattern)
70 changes: 70 additions & 0 deletions collocations/collocations.py
@@ -0,0 +1,70 @@
import csv
import re
from math import log


def collocate(language, target_pattern, reviews_path='./data/goodreads_tokenised.csv', out_name='collocations', out_dir='./data'):

    out_path = '{}/{}_{}.txt'.format(out_dir, out_name, language)

    # import reviews
    with open(reviews_path) as csvfile:
        reader = csv.DictReader(csvfile)
        reviews_text = (row["tokenised_text"] for row in reader if row["language"].lower() == language)
        reviews = [review.split() for review in reviews_text]

    # frequencies
    vocab = set(word for review in reviews for word in review)

    def count_words():
        counts_general = {word: 0 for word in vocab}
        counts_translat = {word: 0 for word in vocab}
        window = 4

        for review in reviews:
            for i, word in enumerate(review):
                counts_general[word] += 1

                if re.search(target_pattern, word):
                    preceding = [review[j] for j in range(i - window, i) if j >= 0]
                    following = [review[j] for j in range(i + 1, i + 1 + window) if j < len(review)]

                    for neighbour in preceding + following:
                        counts_translat[neighbour] += 1

        return counts_translat, counts_general

    def filter_counts(target, general):
        # keep only words that occur more than once overall
        filtered_vocab = set(word for word, count in general.items() if count > 1)
        filtered_target = {word: count for word, count in target.items() if word in filtered_vocab}
        filtered_general = {word: count for word, count in general.items() if word in filtered_vocab}

        return filtered_target, filtered_general

    def relative_counts(counts):
        total = sum(counts.values())
        return {word: count / total for word, count in counts.items()}

    def counts_log(counts):
        return {word: log(count + 1) for word, count in counts.items()}

    def relative_frequencies(target, general):
        target_log = counts_log(relative_counts(target))
        general_log = counts_log(relative_counts(general))

        return {word: target_log[word] - general_log[word] for word in target}

    counts_translat, counts_general = filter_counts(*count_words())
    rel_freq = relative_frequencies(counts_translat, counts_general)

    def sort_by_frequency(counts):
        return sorted(counts, key=lambda w: counts[w], reverse=True)

    ranking = sort_by_frequency(rel_freq)

    # export
    with open(out_path, 'w') as outfile:
        for i in range(100):
            outfile.write(ranking[i])
            outfile.write('\n')
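Put differently, `collocate` scores every word (occurring more than once) by a smoothed log-ratio of its relative frequency inside the 4-word windows around translation lemmas versus its relative frequency in all reviews of that language, and writes out the 100 highest-scoring words. Reconstructed from the code above (not taken from the paper), the score is:

$$\mathrm{score}(w) = \log\!\left(\frac{c_T(w)}{\sum_v c_T(v)} + 1\right) - \log\!\left(\frac{c_G(w)}{\sum_v c_G(v)} + 1\right)$$

where $c_T(w)$ counts occurrences of $w$ within the context windows, $c_G(w)$ counts all its occurrences, and both sums run over the filtered vocabulary.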
9 changes: 9 additions & 0 deletions collocations/patterns.py
@@ -0,0 +1,9 @@
LANGUAGES_PATTERNS = {
    'dutch': r'^vertaa?l',
    'english': r'^translat',
    'french': r'^tradu',
    'german': r'[uü]bersetz',
    'italian': r'^tradu',
    'portuguese': r'^tradu',
    'spanish': r'^tradu',
}
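As a quick illustration (invented tokens, not project data), these prefix patterns are meant to catch the whole morphological family around "translation" in each language:

```python
# Illustrative checks of the patterns on a few invented tokens.
import re
from collocations.patterns import LANGUAGES_PATTERNS

assert re.search(LANGUAGES_PATTERNS['english'], 'translated')
assert re.search(LANGUAGES_PATTERNS['dutch'], 'vertaling')        # '^vertaa?l' also matches 'vertaald'
assert re.search(LANGUAGES_PATTERNS['german'], 'übersetzung')     # unanchored, so compounds match as well
assert not re.search(LANGUAGES_PATTERNS['english'], 'mistranslated')  # anchored patterns require the prefix
```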
26 changes: 26 additions & 0 deletions collocations/test_patterns.py
@@ -0,0 +1,26 @@
# print the set of words that match the pattern for each language

import csv
import re
from collocations.patterns import LANGUAGES_PATTERNS

reviews_path = './data/goodreads_tokenised.csv'

matching_words = {language: set() for language in LANGUAGES_PATTERNS}

with open(reviews_path) as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        language = row['language'].lower()
        if language not in LANGUAGES_PATTERNS:
            continue  # skip reviews in languages without a pattern
        words = row['tokenised_text'].split()

        is_match = lambda word: re.search(LANGUAGES_PATTERNS[language], word)
        matches = filter(is_match, words)
        for match in matches:
            matching_words[language].add(match)

for language in matching_words:
    print(language.upper())
    print()
    print(matching_words[language])
    print()
Empty file added embeddings/eval/__init__.py
Empty file.
25 changes: 25 additions & 0 deletions embeddings/eval/__main__.py
@@ -0,0 +1,25 @@
from gensim import models

EMBEDDINGS_PATH = "embeddings/with_pretraining"

model = models.Word2Vec.load(EMBEDDINGS_PATH)

# automatic evaluation
print('QUESTION ANSWER DATASET')
accuracy = model.accuracy("eval/questions-words.txt")

for section in accuracy:
    name = section['section']
    correct = len(section['correct'])
    incorrect = len(section['incorrect'])
    print('{}: {} out of {}'.format(name, correct, correct + incorrect))


# manual inspection
print()
words = ['book', 'author', 'genre', 'boring', 'recommend', 'translation']

print('WORD SIMILARITIES')
for word in words:
    neighbours = [neighbour for neighbour, score in model.most_similar(word)]
    print("'{}' is most similar to '{}', '{}', or '{}'".format(word, neighbours[0], neighbours[1], neighbours[2]))
32 changes: 32 additions & 0 deletions embeddings/readme.md
@@ -0,0 +1,32 @@
English word embeddings for Goodreads reviews.

Setup for a Python 3.6 environment:
```bash
pip install gensim==3.8
pip install nltk
python -m train.download
```
The `train.download` module downloads the Brown corpus for pretraining and a question-answer dataset for evaluation. The reviews have to be downloaded manually from I-analyzer.

Embeddings can be trained from the command line:
```bash
python -m train
```
There are no command line options, so edit `train/__main__.py` to configure hyperparameters.

To evaluate:
```bash
python -m eval
```
Check that the code in `eval/__main__.py` uses the right embeddings.


For the visualisation of results, set up with:
```bash
pip install jupyter
pip install matplotlib
pip install sklearn
```

Open `visualise/plot.ipynb` as a jupyter notebook.
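The notebook `visualise/plot.ipynb` itself is not part of this diff; as an entirely hypothetical sketch, a 2-D projection of a few review-related word vectors could look like this (matplotlib and scikit-learn as installed above):

```python
# Hypothetical example of plotting word vectors in two dimensions via PCA;
# the model path and word list are taken from eval/__main__.py.
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from gensim import models

model = models.Word2Vec.load("embeddings/with_pretraining")
words = ['book', 'author', 'genre', 'boring', 'recommend', 'translation']
vectors = [model.wv[word] for word in words]

coords = PCA(n_components=2).fit_transform(vectors)
plt.scatter(coords[:, 0], coords[:, 1])
for (x, y), word in zip(coords, words):
    plt.annotate(word, (x, y))
plt.savefig('embedding_plot.png')
```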
Empty file added embeddings/train/__init__.py
Empty file.