CentreForDigitalHumanities · Meesch · Oct 8, 2024 · Oct 29, 2024 · Oct 29, 2024 · Oct 31, 2024
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -6,7 +6,7 @@
     "configurations": [
         {
             "name": "django: runserver",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "program": "${workspaceFolder}/backend/manage.py",
             "args": ["runserver"],
@@ -15,7 +15,7 @@
         },
         {
             "name": "django: shell",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "program": "${workspaceFolder}/backend/manage.py",
             "args": ["shell"],
@@ -24,10 +24,19 @@
         },
         {
             "name": "django: index",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "program": "${workspaceFolder}/backend/manage.py",
-            "args": ["index", "${input:corpusName}"],
+            "args": ["index", "${input:corpusName}", "-d"],
+            "django": true,
+            "justMyCode": true
+        },
+        {
+            "name": "django: loadcorpora",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/backend/manage.py",
+            "args": ["loadcorpora"],
             "django": true,
             "justMyCode": true
         },

diff --git a/backend/addcorpus/es_settings.py b/backend/addcorpus/es_settings.py
@@ -23,32 +23,42 @@ def get_language_key(language_code):
 
     return Language.make(standardize_tag(language_code)).display_name().lower()
 
-def _stopwords_directory() -> str:
-    stopwords_dir = os.path.join(settings.NLTK_DATA_PATH, 'corpora', 'stopwords')
-    if not os.path.exists(stopwords_dir):
+def _nltk_stopwords_directory() -> str:
+    nltk_stopwords_dir = os.path.join(settings.NLTK_DATA_PATH, 'corpora', 'stopwords')
+    if not os.path.exists(nltk_stopwords_dir):
         nltk.download('stopwords', settings.NLTK_DATA_PATH)
-    return stopwords_dir
+    return nltk_stopwords_dir
 
-def _stopwords_path(language_code: str):
-    dir = _stopwords_directory()
+def _nltk_stopwords_path(language_code: str):
+    dir = _nltk_stopwords_directory()
+    language = get_language_key(language_code)
+    return os.path.join(dir, language)
+
+def _supplementary_path(language_code: str):
+    dir = os.path.join(settings.BASE_DIR, 'addcorpus', 'stopword_data', 'supplementary_data')
     language = get_language_key(language_code)
     return os.path.join(dir, language)
 
 def stopwords_available(language_code: str) -> bool:
     if not language_code:
         return False
-    path = _stopwords_path(language_code)
-    return os.path.exists(path)
-
-def get_nltk_stopwords(language_code):
-    path = _stopwords_path(language_code)
-
-    if os.path.exists(path):
-        with open(path) as infile:
+    nltk_path = _nltk_stopwords_path(language_code)
+    supplementary_path = _supplementary_path(language_code)
+    return os.path.exists(nltk_path) or os.path.exists(supplementary_path)
+
+def get_stopwords(language_code):
+    nltk_path = _nltk_stopwords_path(language_code)
+    supplementary_path = _supplementary_path(language_code)
+    if os.path.exists(nltk_path):
+        with open(nltk_path, encoding='utf-8') as infile:  # explicit encoding: do not depend on the locale default
+            words = [line.strip() for line in infile.readlines()]
+            return words
+    elif os.path.exists(supplementary_path):
+        with open(supplementary_path, encoding='utf-8') as infile:  # explicit encoding: do not depend on the locale default
             words = [line.strip() for line in infile.readlines()]
             return words
     else:
-        raise NotImplementedError('language {} has no nltk stopwords list'.format(language_code))
+        raise NotImplementedError('language {} has no stopwords list'.format(language_code))
 
 def add_language_string(name, language):
     return '{}_{}'.format(name, language) if language else name
@@ -87,6 +97,8 @@ def es_settings(languages=[], stopword_analysis=False, stemming_analysis=False):
 
         if stopword_analysis or stemming_analysis:
             if not set_stopword_filter(settings, add_language_string(stopword_filter_name, language), language):
+                warnings.warn('You specified `stopword_analysis=True`, but '
+                              'there are no stopwords available for this language')
                 continue # skip languages for which we do not have a stopword list
 
             if stopword_analysis:
@@ -119,7 +131,7 @@ def number_filter():
 
 def make_stopword_filter(language):
     try:
-        stopwords = get_nltk_stopwords(language)
+        stopwords = get_stopwords(language)
         return {
             "type": "stop",
             'stopwords': stopwords

diff --git a/backend/addcorpus/stopword_data/supplementary_data/README.md b/backend/addcorpus/stopword_data/supplementary_data/README.md
@@ -0,0 +1,12 @@
+## Supplementary Data Sources
+Source 1: For Bulgarian, Czech, Croatian, Galician, Latvian, and Ukrainian, stopword lists were downloaded from this [GitHub repository](https://github.com/negapedia/nltk/tree/master/corpora/stopwords), by [Marco Chilese](https://github.com/MarcoChilese). The stopword lists are a combination of NLTK stopwords (where available) and stopwords from [ranks.nl](https://www.ranks.nl/stopwords/). They were downloaded on 2025-12-18.
+
+Source 2: For Bosnian stopwords, the following publication was used: Sead Jahić, & Jernej Vičič. (2023). Lists of stopwords, polarity shifters and AnAwords of Bosnian language [Data set]. Zenodo. https://doi.org/10.5281/zenodo.10373141
+
+Source 3: For Estonian stopwords, the following GitHub repository was used: https://github.com/stopwords-iso/stopwords-et
+
+Source 4: For Icelandic stopwords, the following GitHub repository was used: https://github.com/ViktorMS/stoppord/blob/master/stoppord.csv
+
+Source 5: For Serbian stopwords, the following GitHub repository was used: https://github.com/Xangis/extra-stopwords/blob/master/serbian
+
+Source 6: For Slovenian stopwords, the following GitHub repository was used: https://github.com/stopwords-iso/stopwords-sl/blob/master/raw/gh-stopwords-json-sl.txt