From 132377dc537924150f1129c355ed44f4cf110aae Mon Sep 17 00:00:00 2001 From: Lukas Weymann <62622663+lukasweymann@users.noreply.github.com> Date: Thu, 4 Dec 2025 12:23:58 +0100 Subject: [PATCH 1/4] Update db_operations.py --- opustools_pkg/opustools/db_operations.py | 385 +++++++++++++++++------ 1 file changed, 292 insertions(+), 93 deletions(-) diff --git a/opustools_pkg/opustools/db_operations.py b/opustools_pkg/opustools/db_operations.py index 5060d5c..be8fbe9 100644 --- a/opustools_pkg/opustools/db_operations.py +++ b/opustools_pkg/opustools/db_operations.py @@ -1,136 +1,335 @@ import os import sqlite3 +from typing import Any, Dict, List, Tuple, Optional + class DbOperations: + def __init__(self, db_file: Optional[str] = None) -> None: + # Resolve DB path + self.db_file = db_file or os.environ.get("OPUSAPI_DB") + if not self.db_file: + raise ValueError("OPUSAPI_DB environment variable or db_file must be set") + + # Single, reusable connection + # check_same_thread=False allows usage from different threads if your app does that + self.conn = sqlite3.connect(self.db_file, check_same_thread=False) + self.conn.row_factory = sqlite3.Row + + # Performance/safety PRAGMAs – run once per connection + self._configure_connection() + + def _configure_connection(self) -> None: + # WAL improves concurrency and often speed + self.conn.execute("PRAGMA journal_mode = WAL;") + # Good balance of durability and performance + self.conn.execute("PRAGMA synchronous = NORMAL;") + # Keep temp structures in memory + self.conn.execute("PRAGMA temp_store = MEMORY;") + # Enforce FK constraints + self.conn.execute("PRAGMA foreign_keys = ON;") + + def close(self) -> None: + if getattr(self, "conn", None) is not None: + self.conn.close() + self.conn = None + + def __enter__(self) -> "DbOperations": + return self - def __init__(self, db_file=None): - if db_file: - self.db_file = db_file - else: - self.db_file = os.environ.get('OPUSAPI_DB') + def __exit__(self, exc_type, exc, tb) -> None: + self.close() - def clean_up_parameters(self, parameters): - remove = [] - valid_keys = ['corpus', 'id', 'latest', 'preprocessing', 'source', 'target', 'version', 'corpora', 'languages'] + def __del__(self) -> None: + # Best-effort cleanup + try: + self.close() + except Exception: + pass + + # ------------------------- + # Utility helpers + # ------------------------- + + def clean_up_parameters(self, parameters: Dict[str, str]) -> Dict[str, str]: + """ + Keep only allowed keys, strip quote characters from values. + """ + remove: List[str] = [] + valid_keys = [ + "corpus", + "id", + "latest", + "preprocessing", + "source", + "target", + "version", + "corpora", + "languages", + ] for key, value in parameters.items(): if key not in valid_keys: remove.append(key) continue - value = value.replace('"', '') - value = value.replace('\'', '') + # These replacements are mostly unnecessary with parameterized queries, + # but kept for backward-compatibility. + value = value.replace('"', "") + value = value.replace("'", "") parameters[key] = value for key in remove: del parameters[key] return parameters - def run_query(self, sql_command): - conn = sqlite3.connect(self.db_file) - query = conn.execute(sql_command) - value_list = query.fetchall() - keys = [i[0] for i in query.description] - conn.close() - return keys, value_list + def _build_where( + self, parameters: Dict[str, Any] + ) -> Tuple[str, List[Any]]: + """ + Build a WHERE clause and parameter list from a dict like {"source": "en", "target": "de"}. 
+ Returns (clause_string, values_list), e.g. ("source = ? AND target = ?", ["en", "de"]) + """ + clauses: List[str] = [] + values: List[Any] = [] + for k, v in parameters.items(): + clauses.append(f"{k} = ?") + values.append(v) + return " AND ".join(clauses), values + + def run_query( + self, sql_command: str, params: Optional[List[Any]] = None + ) -> Tuple[List[str], List[sqlite3.Row]]: + """ + Execute a query and return (column_names, list_of_rows). + """ + cur = self.conn.execute(sql_command, params or []) + rows = cur.fetchall() + keys = [d[0] for d in cur.description] + return keys, rows + + # ------------------------- + # Public query helpers + # ------------------------- + + def sort_source_target(self, parameters: Dict[str, str]) -> Dict[str, str]: + source = parameters.get("source") + target = parameters.get("target") + if source and target: + sou_tar = sorted([source, target]) + parameters["source"] = sou_tar[0] + parameters["target"] = sou_tar[1] + return parameters + + def convert_latest(self, parameters: Dict[str, str]) -> Dict[str, str]: + version = parameters.get("version") + if version and version == "latest": + parameters["latest"] = "True" + del parameters["version"] + return parameters + + def run_default_query( + self, parameters: Dict[str, str], suffix: str = "" + ) -> List[Dict[str, Any]]: + columns = [ + "alignment_pairs", + "corpus", + "documents", + "id", + "latest", + "preprocessing", + "size", + "source", + "source_tokens", + "target", + "target_tokens", + "url", + "version", + ] - def run_default_query(self, parameters, suffix=''): - columns = ['alignment_pairs', 'corpus', 'documents', 'id', 'latest', 'preprocessing', 'size', 'source', 'source_tokens', 'target', 'target_tokens', 'url', 'version'] + # We mutate parameters in this method, so work on a local copy + parameters = dict(parameters) parameters = self.sort_source_target(parameters) - sql_command = f'SELECT {", ".join(columns)} FROM opusfile WHERE '+' AND '.join([f"{k} = '{v}'" for k, v in parameters.items()]) + suffix - keys, value_list = self.run_query(sql_command) - ret = [{k: v for k, v in zip(keys,values)} for values in value_list] - if 'preprocessing' not in parameters.keys() and parameters.get('target'): - param_mono_src = parameters.copy() - param_mono_src['target'] = '' - sql_command = f'SELECT {", ".join(columns)} FROM opusfile WHERE '+' AND '.join([f"{k} = '{v}'" for k, v in param_mono_src.items()]) + suffix - keys, value_list = self.run_query(sql_command) - ret = ret + [{k: v for k, v in zip(keys,values)} for values in value_list] - - param_mono_trg = parameters.copy() - param_mono_trg['source'] = parameters['target'] - param_mono_trg['target'] = '' - sql_command = f'SELECT {", ".join(columns)} FROM opusfile WHERE '+' AND '.join([f"{k} = '{v}'" for k, v in param_mono_trg.items()]) + suffix - keys, value_list = self.run_query(sql_command) - ret = ret + [{k: v for k, v in zip(keys,values)} for values in value_list] + + where_clause, values = self._build_where(parameters) + + sql_command = f"SELECT {', '.join(columns)} FROM opusfile" + if where_clause: + sql_command += " WHERE " + where_clause + if suffix: + # suffix usually like " AND target != ''" or " AND source != ''" + sql_command += suffix + + keys, rows = self.run_query(sql_command, values) + ret = [{k: row[idx] for idx, k in enumerate(keys)} for row in rows] + + # Monolingual fetches when target is present but preprocessing not specified + if "preprocessing" not in parameters.keys() and parameters.get("target"): + # 1) Mono source: target 
= '' + param_mono_src = dict(parameters) + param_mono_src["target"] = "" + where_clause, values = self._build_where(param_mono_src) + sql_command = ( + f"SELECT {', '.join(columns)} FROM opusfile WHERE {where_clause}" + ) + suffix + keys, rows = self.run_query(sql_command, values) + ret.extend({k: row[idx] for idx, k in enumerate(keys)} for row in rows) + + # 2) Mono target: swap source/target, then set target = '' + param_mono_trg = dict(parameters) + param_mono_trg["source"] = parameters["target"] + param_mono_trg["target"] = "" + where_clause, values = self._build_where(param_mono_trg) + sql_command = ( + f"SELECT {', '.join(columns)} FROM opusfile WHERE {where_clause}" + ) + suffix + keys, rows = self.run_query(sql_command, values) + ret.extend({k: row[idx] for idx, k in enumerate(keys)} for row in rows) return ret - def run_corpora_query(self, parameters): + def run_corpora_query(self, parameters: Dict[str, str]) -> List[str]: + # Work on a copy to avoid side effects + parameters = dict(parameters) parameters = self.convert_latest(parameters) parameters = self.sort_source_target(parameters) - if 'corpora' in parameters.keys(): - del parameters['corpora'] + parameters.pop("corpora", None) + + sql_command = "SELECT DISTINCT corpus FROM opusfile" + where_clause, values = self._build_where(parameters) + + if where_clause: + sql_command += " WHERE " + where_clause - sql_command = 'SELECT DISTINCT corpus FROM opusfile' - if len(parameters) > 0: - sql_command = sql_command+' WHERE '+' AND '.join([f"{k} = '{v}'" for k, v in parameters.items()]) - _, value_list = self.run_query(sql_command) - values = [v[0] for v in value_list] - return values + _, rows = self.run_query(sql_command, values) + return [row[0] for row in rows] - def run_languages_query(self, parameters): + def run_languages_query(self, parameters: Dict[str, str]) -> List[str]: + # Work on a copy to avoid mutating caller's dict + parameters = dict(parameters) parameters = self.convert_latest(parameters) parameters = self.sort_source_target(parameters) - if 'languages' in parameters.keys(): - del parameters['languages'] - - sql_command = 'SELECT DISTINCT source FROM opusfile ' - if len(parameters) > 0: - source = parameters.get('source') - if source: - sql_command = 'SELECT DISTINCT target FROM opusfile where '+' AND '.join([f"{k} = '{v}'" for k, v in parameters.items()]) + f" AND target != '{source}' AND target != '' UNION SELECT DISTINCT source FROM opusfile " - parameters['target'] = parameters['source'] - del parameters['source'] - sql_command = sql_command + 'WHERE ' - sql_command = sql_command + ' AND '.join([f"{k} = '{v}'" for k, v in parameters.items()]) - _, value_list = self.run_query(sql_command) - values = [v[0] for v in value_list] - return values - - def sort_source_target(self, parameters): - source = parameters.get('source') - target = parameters.get('target') - if source and target: - sou_tar = sorted([source, target]) - parameters['source'] = sou_tar[0] - parameters['target'] = sou_tar[1] - return parameters + parameters.pop("languages", None) - def convert_latest(self, parameters): - version = parameters.get('version') - if version and version == 'latest': - parameters['latest'] = 'True' - del parameters['version'] - return parameters + # If there are no filters, just list sources + if not parameters: + sql_command = "SELECT DISTINCT source FROM opusfile" + _, rows = self.run_query(sql_command) + return [row[0] for row in rows] + + base_params = dict(parameters) + source = base_params.get("source") + + # Case 1: 
source specified -> UNION of target and source + if source: + # First part: DISTINCT target with filters + target != source/'' + + where1_clause_parts: List[str] = [] + where1_values: List[Any] = [] + for k, v in base_params.items(): + where1_clause_parts.append(f"{k} = ?") + where1_values.append(v) + where1_clause = " AND ".join(where1_clause_parts) + + sql1 = ( + "SELECT DISTINCT target FROM opusfile WHERE " + + where1_clause + + " AND target != ? AND target != ''" + ) + where1_values.extend([source]) + + # Second part: DISTINCT source with source/target swapped + swapped = dict(base_params) + swapped["target"] = swapped["source"] + del swapped["source"] + where2_clause, where2_values = self._build_where(swapped) + + sql2 = "SELECT DISTINCT source FROM opusfile" + if where2_clause: + sql2 += " WHERE " + where2_clause + + # Combine UNION query and parameters + sql_command = sql1 + " UNION " + sql2 + params_list = where1_values + where2_values + + _, rows = self.run_query(sql_command, params_list) + return [row[0] for row in rows] - def get_corpora(self, parameters): + # Case 2: no 'source' filter → basic DISTINCT source with other filters + where_clause, values = self._build_where(base_params) + sql_command = "SELECT DISTINCT source FROM opusfile" + if where_clause: + sql_command += " WHERE " + where_clause + + _, rows = self.run_query(sql_command, values) + return [row[0] for row in rows] + + def get_corpora(self, parameters: Dict[str, str]) -> List[Dict[str, Any]]: + # Work on a copy + parameters = dict(parameters) parameters = self.convert_latest(parameters) - a_parameters = parameters.copy() - preprocessing = parameters.get('preprocessing') + a_parameters = dict(parameters) + preprocessing = parameters.get("preprocessing") suffix = "" - if preprocessing in ['xml', 'raw', 'parsed']: + + if preprocessing in ["xml", "raw", "parsed"]: # Get xml alignment files - a_parameters['preprocessing'] = 'xml' + a_parameters["preprocessing"] = "xml" # Don't get the sentence file - suffix=' AND target != ""' + suffix = ' AND target != ""' + ret = self.run_default_query(a_parameters, suffix=suffix) - source = parameters.get('source') - target = parameters.get('target') + + source = parameters.get("source") + target = parameters.get("target") + if source and not target: # Get items where the queried language is on the target side - a_parameters['target'] = parameters['source'] - del a_parameters['source'] - ret = self.run_default_query(a_parameters, suffix=" AND source != ''") + ret + a_parameters = dict(parameters) + a_parameters["target"] = parameters["source"] + a_parameters.pop("source", None) + ret = ( + self.run_default_query(a_parameters, suffix=" AND source != ''") + + ret + ) - if preprocessing in ['xml', 'raw', 'parsed']: + if preprocessing in ["xml", "raw", "parsed"]: # Get sentence files languages = set() for item in ret: - languages.add(item['source']) - languages.add(item['target']) + languages.add(item.get("source", "")) + languages.add(item.get("target", "")) - parameters['target'] = '' - for language in sorted(list(languages)): - parameters['source'] = language - ret = ret + self.run_default_query(parameters) + parameters["target"] = "" + for language in sorted(languages): + if not language: + continue + parameters["source"] = language + ret.extend(self.run_default_query(parameters)) return ret + + # ------------------------- + # Optional: index helper + # ------------------------- + + def ensure_indexes(self) -> None: + """ + Create indexes that match the query patterns used in this 
class. + Call this once (e.g. from a migration or admin script). + """ + self.conn.executescript( + """ + CREATE INDEX IF NOT EXISTS idx_opusfile_src_trg_pre_latest + ON opusfile(source, target, preprocessing, latest); + + CREATE INDEX IF NOT EXISTS idx_opusfile_corpus_latest_pre + ON opusfile(corpus, latest, preprocessing); + + CREATE INDEX IF NOT EXISTS idx_opusfile_source_pre_latest + ON opusfile(source, preprocessing, latest); + + CREATE INDEX IF NOT EXISTS idx_opusfile_target_pre_latest + ON opusfile(target, preprocessing, latest); + """ + ) + # Update statistics so the query planner can use the new indexes optimally + self.conn.execute("ANALYZE;") + self.conn.commit() From 2bfa178e9fa1df8adbe229542c25ae16fccd0acd Mon Sep 17 00:00:00 2001 From: Lukas Weymann <62622663+lukasweymann@users.noreply.github.com> Date: Tue, 9 Dec 2025 16:16:36 +0100 Subject: [PATCH 2/4] Refactor string quotes and enhance SQL statements Updated string quotes from single to double for consistency and improved SQL statements for better readability. --- opustools_pkg/opustools/readopusdata.py | 415 +++++++++++++++++------- 1 file changed, 290 insertions(+), 125 deletions(-) diff --git a/opustools_pkg/opustools/readopusdata.py b/opustools_pkg/opustools/readopusdata.py index 84a831b..cdda913 100644 --- a/opustools_pkg/opustools/readopusdata.py +++ b/opustools_pkg/opustools/readopusdata.py @@ -11,227 +11,392 @@ def read_url(url): - return urllib.request.urlopen(url).read().decode('utf-8').split('\n') + return urllib.request.urlopen(url).read().decode("utf-8").split("\n") def read_url_yaml(url, yaml): try: - raw = urllib.request.urlopen(url).read().decode('utf-8') + raw = urllib.request.urlopen(url).read().decode("utf-8") except urllib.error.HTTPError: - gzbytes = urllib.request.urlopen(url+'.gz').read() - raw = gzip.decompress(gzbytes).decode('utf-8') + gzbytes = urllib.request.urlopen(url + ".gz").read() + raw = gzip.decompress(gzbytes).decode("utf-8") data = yaml.load(raw) return data def create_table(cur): - create_opusfile_table = '''CREATE TABLE IF NOT EXISTS opusfile ( - id integer PRIMARY KEY, - source text, - target text, - corpus text, - preprocessing text, - version text, - url text, - size integer, - documents integer, - alignment_pairs integer, - source_tokens integer, - target_tokens integer, - latest text, - updated integer - );''' + create_opusfile_table = """ + CREATE TABLE IF NOT EXISTS opusfile ( + id INTEGER PRIMARY KEY, + source TEXT, + target TEXT, + corpus TEXT, + preprocessing TEXT, + version TEXT, + url TEXT NOT NULL UNIQUE, + size INTEGER, + documents INTEGER, + alignment_pairs INTEGER, + source_tokens INTEGER, + target_tokens INTEGER, + latest TEXT, + updated INTEGER + ); + """ cur.execute(create_opusfile_table) - create_url_index = 'CREATE INDEX IF NOT EXISTS idx_url ON opusfile(url)' - cur.execute(create_url_index) + + # Index to speed up DELETE ... 
WHERE updated=0 + cur.execute( + "CREATE INDEX IF NOT EXISTS idx_opusfile_updated ON opusfile(updated)" + ) + + cur.execute( + \"\"\" + CREATE INDEX IF NOT EXISTS idx_opusfile_6cols + ON opusfile (source, target, corpus, preprocessing, version, url) + \"\"\" + ) def execute_sql(cur, opusfile): - columns = ['source', 'target', 'corpus', 'preprocessing', 'version', 'url', 'size', 'documents', 'alignment_pairs', 'source_tokens', 'target_tokens', 'latest'] - #wheres = [f'{columns[i]}="{opusfile[i]}"' for i in range(6)] - #sql = f'SELECT * FROM opusfile WHERE {" AND ".join(wheres)}' - sql = f'SELECT * FROM opusfile WHERE url="{opusfile[5]}"' - res = cur.execute(sql).fetchall() - if len(res) == 1: - sets = [f'{columns[i]}="{opusfile[i]}"' for i in range(6, len(columns))] - sql = f'UPDATE opusfile SET {", ".join(sets)}, updated=1 WHERE id={res[0][0]}' - cur.execute(sql) - elif len(res) == 0: - sql = f'INSERT INTO opusfile({", ".join(columns)}, updated) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,1)' - cur.execute(sql, opusfile) + """ + Insert or update a row based on the UNIQUE 'url' column. + + opusfile is a tuple: + (source, target, corpus, preprocessing, version, + url, size, documents, alignment_pairs, + source_tokens, target_tokens, latest) + """ + columns = [ + "source", + "target", + "corpus", + "preprocessing", + "version", + "url", + "size", + "documents", + "alignment_pairs", + "source_tokens", + "target_tokens", + "latest", + ] + + # UPSERT on url: if url already exists, update fields and set updated=1 + sql = f""" + INSERT INTO opusfile ({", ".join(columns)}, updated) + VALUES ({",".join(["?"] * (len(columns) + 1))}) + ON CONFLICT(url) DO UPDATE SET + source = excluded.source, + target = excluded.target, + corpus = excluded.corpus, + preprocessing = excluded.preprocessing, + version = excluded.version, + size = excluded.size, + documents = excluded.documents, + alignment_pairs = excluded.alignment_pairs, + source_tokens = excluded.source_tokens, + target_tokens = excluded.target_tokens, + latest = excluded.latest, + updated = 1 + """ + # Append updated=1 to the values + cur.execute(sql, (*opusfile, 1)) def get_lang_info(name, data, data_type, info): - source, target, documents, alignment_pairs, source_tokens, target_tokens = '', '', '', '', '', '' + source, target, documents, alignment_pairs, source_tokens, target_tokens = ( + "", + "", + "", + "", + "", + "", + ) source = name - if data_type in ['bitexts', 'moses', 'tmx']: - names = name.split('-') + if data_type in ["bitexts", "moses", "tmx"]: + names = name.split("-") if len(names) != 2: - logger.warning(f'{info} {data_type} {name}: cannot split name "{name}" into two language codes') + logger.warning( + f'{info} {data_type} {name}: cannot split name "{name}" into two language codes' + ) else: source, target = names - documents = '' - if data_type in ['bitexts', 'monolingual']: - documents = data.get('files', '') - if documents == '': + documents = "" + if data_type in ["bitexts", "monolingual"]: + documents = data.get("files", "") + if documents == "": logger.warning(f'{info} {data_type} {name} is missing "files"') - if data_type in ['bitexts', 'moses']: - alignment_pairs = data.get('alignments', '') - if alignment_pairs == '': + if data_type in ["bitexts", "moses"]: + alignment_pairs = data.get("alignments", "") + if alignment_pairs == "": logger.warning(f'{info} {data_type} {name} is missing "alignments"') - elif data_type == 'tmx': - alignment_pairs = data.get('translation units', '') - if alignment_pairs == '': - logger.warning(f'{info} 
{data_type} {name} is missing "translation units"') - elif data_type == 'monolingual': - alignment_pairs = data.get('sentences', '') - if alignment_pairs == '': + elif data_type == "tmx": + alignment_pairs = data.get("translation units", "") + if alignment_pairs == "": + logger.warning( + f'{info} {data_type} {name} is missing "translation units"' + ) + elif data_type == "monolingual": + alignment_pairs = data.get("sentences", "") + if alignment_pairs == "": logger.warning(f'{info} {data_type} {name} is missing "sentences"') - if data_type == 'monolingual': - source_tokens = data.get('tokens', '') - if source_tokens == '': + if data_type == "monolingual": + source_tokens = data.get("tokens", "") + if source_tokens == "": logger.warning(f'{info} {data_type} {name} is missing "tokens"') - target_tokens = '' + target_tokens = "" else: - source_tokens = data.get('source language tokens', '') - if source_tokens == '': - logger.warning(f'{info} {data_type} {name} is missing "source language tokens"') - target_tokens = data.get('target language tokens', '') - if target_tokens == '': - logger.warning(f'{info} {data_type} {name} is missing "target language tokens"') + source_tokens = data.get("source language tokens", "") + if source_tokens == "": + logger.warning( + f'{info} {data_type} {name} is missing "source language tokens"' + ) + target_tokens = data.get("target language tokens", "") + if target_tokens == "": + logger.warning( + f'{info} {data_type} {name} is missing "target language tokens"' + ) return source, target, documents, alignment_pairs, source_tokens, target_tokens def get_size_url_prep(name, data, data_type, info): - size, url, preprocessing = '','','' - if data_type in ['tmx', 'moses']: - size = data.get('download size', '') - if size == '': - logger.warning(f'{info} {data_type} {name} is missing "download size"') + size, url, preprocessing = "", "", "" + if data_type in ["tmx", "moses"]: + size = data.get("download size", "") + if size == "": + logger.warning( + f'{info} {data_type} {name} is missing "download size"' + ) else: - size = int(int(size)/1024) - url = data.get('download url', '') - if url == '': - logger.warning(f'{info} {data_type} {name} is missing "download url"') - elif data_type in ['bitexts', 'monolingual']: - size = data.get('size', '') - if size == '': + size = int(int(size) / 1024) + url = data.get("download url", "") + if url == "": + logger.warning( + f'{info} {data_type} {name} is missing "download url"' + ) + elif data_type in ["bitexts", "monolingual"]: + size = data.get("size", "") + if size == "": logger.warning(f'{info} {data_type} {name} is missing "size"') else: - size = int(int(size)/1024) - url = data.get('url', '') - if url == '': + size = int(int(size) / 1024) + url = data.get("url", "") + if url == "": logger.warning(f'{info} {data_type} {name} is missing "url"') - pre_step = url.split('/') + pre_step = url.split("/") if len(pre_step) < 2: - logger.warning(f'{info} {data_type} {name}: cannot find preprocessing from url "{url}"') + logger.warning( + f'{info} {data_type} {name}: cannot find preprocessing from url "{url}"' + ) else: preprocessing = pre_step[-2] return size, url, preprocessing + def get_tmx_entries(corpus, version, latest, tmx, cur, info): for item in tmx: - source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, tmx[item], 'tmx', info) - size, url, preprocessing = get_size_url_prep(item, tmx[item], 'tmx', info) - opusfile = (source, target, corpus, preprocessing, version, url, size, 
documents, alignment_pairs, source_tokens, target_tokens, latest) + source, target, documents, alignment_pairs, source_tokens, target_tokens = ( + get_lang_info(item, tmx[item], "tmx", info) + ) + size, url, preprocessing = get_size_url_prep( + item, tmx[item], "tmx", info + ) + opusfile = ( + source, + target, + corpus, + preprocessing, + version, + url, + size, + documents, + alignment_pairs, + source_tokens, + target_tokens, + latest, + ) execute_sql(cur, opusfile) def get_moses_entries(corpus, version, latest, moses, cur, info): for item in moses: - source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, moses[item], 'moses', info) - size, url, preprocessing = get_size_url_prep(item, moses[item], 'moses', info) - opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) + source, target, documents, alignment_pairs, source_tokens, target_tokens = ( + get_lang_info(item, moses[item], "moses", info) + ) + size, url, preprocessing = get_size_url_prep( + item, moses[item], "moses", info + ) + opusfile = ( + source, + target, + corpus, + preprocessing, + version, + url, + size, + documents, + alignment_pairs, + source_tokens, + target_tokens, + latest, + ) execute_sql(cur, opusfile) -def get_monolingual_entries(corpus, version, latest, monolingual, cur, info): +def get_monolingual_entries( + corpus, version, latest, monolingual, cur, info +): for item in monolingual: - source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, monolingual[item], 'monolingual', info) - for entry in monolingual[item]['downloads'].items(): - size, url, preprocessing = get_size_url_prep(item, entry[1], 'monolingual', info) - opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) + source, target, documents, alignment_pairs, source_tokens, target_tokens = ( + get_lang_info(item, monolingual[item], "monolingual", info) + ) + for entry in monolingual[item]["downloads"].items(): + size, url, preprocessing = get_size_url_prep( + item, entry[1], "monolingual", info + ) + opusfile = ( + source, + target, + corpus, + preprocessing, + version, + url, + size, + documents, + alignment_pairs, + source_tokens, + target_tokens, + latest, + ) execute_sql(cur, opusfile) def get_bitext_entries(corpus, version, latest, bitexts, cur, info): for item in bitexts: - source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, bitexts[item], 'bitexts', info) - for entry in bitexts[item]['downloads'].items(): + source, target, documents, alignment_pairs, source_tokens, target_tokens = ( + get_lang_info(item, bitexts[item], "bitexts", info) + ) + for entry in bitexts[item]["downloads"].items(): # exclude monolingual files, they are added in the monolingual phase - if 'language' not in entry[0]: - size, url, preprocessing = get_size_url_prep(item, entry[1], 'bitexts', info) - opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) + if "language" not in entry[0]: + size, url, preprocessing = get_size_url_prep( + item, entry[1], "bitexts", info + ) + opusfile = ( + source, + target, + corpus, + preprocessing, + version, + url, + size, + documents, + alignment_pairs, + source_tokens, + target_tokens, + latest, + ) execute_sql(cur, opusfile) def remove_missing_items(cur): - sql = 'DELETE FROM 
opusfile WHERE updated=0' + # First remove rows not touched in this run + sql = "DELETE FROM opusfile WHERE updated=0" cur.execute(sql) - sql = 'UPDATE opusfile SET updated=0' + # Then mark all remaining rows as not updated, ready for the next run + sql = "UPDATE opusfile SET updated=0" cur.execute(sql) -def update_db(db_file=None, log_type='errors'): +def update_db(db_file=None, log_type="errors"): yaml = YAML() - if log_type == 'warnings': - logging.basicConfig(filename='opusdb_update_error.log', level=logging.WARNING, - format='%(asctime)s %(levelname)s:%(name)s: %(message)s', datefmt='%x %X') + if log_type == "warnings": + logging.basicConfig( + filename="opusdb_update_error.log", + level=logging.WARNING, + format="%(asctime)s %(levelname)s:%(name)s: %(message)s", + datefmt="%x %X", + ) else: - logging.basicConfig(filename='opusdb_update_error.log', level=logging.ERROR, - format='%(asctime)s %(levelname)s:%(name)s: %(message)s', datefmt='%x %X') + logging.basicConfig( + filename="opusdb_update_error.log", + level=logging.ERROR, + format="%(asctime)s %(levelname)s:%(name)s: %(message)s", + datefmt="%x %X", + ) if not db_file: - db_file = os.path.join(os.path.dirname(__file__), 'opusdata.db') + db_file = os.path.join(os.path.dirname(__file__), "opusdata.db") con = sqlite3.connect(db_file) cur = con.cursor() + # Optional pragmas for faster bulk updates; tweak as you see fit + cur.execute("PRAGMA journal_mode=WAL;") + cur.execute("PRAGMA synchronous=NORMAL;") + cur.execute("PRAGMA temp_store=MEMORY;") + # Negative value means size in KB; here ~200MB cache + cur.execute("PRAGMA cache_size=-200000;") + create_table(cur) - URL_BASE = 'https://raw.githubusercontent.com/Helsinki-NLP/OPUS/main/corpus/' - index_info = read_url(URL_BASE + 'index-info.txt') + URL_BASE = "https://raw.githubusercontent.com/Helsinki-NLP/OPUS/main/corpus/" + index_info = read_url(URL_BASE + "index-info.txt") + + corpus = None + latest_v = None for info in index_info: - info_s = info.split('/') + info_s = info.split("/") if len(info_s) == 2: try: gen_info = read_url_yaml(URL_BASE + info, yaml) - except (scanner.ScannerError, urllib.error.HTTPError, reader.ReaderError) as e: - logger.error(f'{info}, {type(e).__name__}: {e}') + except ( + scanner.ScannerError, + urllib.error.HTTPError, + reader.ReaderError, + ) as e: + logger.error(f"{info}, {type(e).__name__}: {e}") gen_info = {} - corpus = gen_info.get('name') + corpus = gen_info.get("name") if not corpus: - logger.warning(f'{info}, corpus name missing') - print(f'Processing corpus {corpus}') - latest_v = gen_info.get('latest_release') + logger.warning(f"{info}, corpus name missing") + print(f"Processing corpus {corpus}") + latest_v = gen_info.get("latest_release") if not latest_v: - logger.error(f'{info}, latest_release missing') + logger.error(f"{info}, latest_release missing") elif len(info_s) == 3: version = info_s[1] if not corpus: corpus = info_s[0] - latest = 'False' + latest = "False" if version == latest_v: - latest = 'True' - stats = info.replace('info.yaml', 'statistics.yaml') + latest = "True" + stats = info.replace("info.yaml", "statistics.yaml") try: corpus_data = read_url_yaml(URL_BASE + stats, yaml) - except (scanner.ScannerError, urllib.error.HTTPError, reader.ReaderError) as e: - logger.error(f'{stats}, {type(e).__name__}: {e}') + except ( + scanner.ScannerError, + urllib.error.HTTPError, + reader.ReaderError, + ) as e: + logger.error(f"{stats}, {type(e).__name__}: {e}") continue - get_entries = {'bitexts': get_bitext_entries, - 'monolingual': 
get_monolingual_entries, - 'moses': get_moses_entries, - 'tmx': get_tmx_entries} + get_entries = { + "bitexts": get_bitext_entries, + "monolingual": get_monolingual_entries, + "moses": get_moses_entries, + "tmx": get_tmx_entries, + } if not corpus_data: - logger.error(f'{info}, corpus_data is empty') + logger.error(f"{info}, corpus_data is empty") continue for item in get_entries.keys(): @@ -239,7 +404,7 @@ def update_db(db_file=None, log_type='errors'): if sub_data: get_entries[item](corpus, version, latest, sub_data, cur, info) else: - logger.warning(f'{info}, {item} data missing') + logger.warning(f"{info}, {item} data missing") remove_missing_items(cur) From 5ff082615d8017fc381c6818a70f6cf88674ff5e Mon Sep 17 00:00:00 2001 From: Lukas Weymann <62622663+lukasweymann@users.noreply.github.com> Date: Tue, 9 Dec 2025 16:53:31 +0100 Subject: [PATCH 3/4] Create composite index idx_opusfile_6cols Added a composite index for multi-column lookups in the opusfile table. --- opustools_pkg/opustools/readopusdata.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/opustools_pkg/opustools/readopusdata.py b/opustools_pkg/opustools/readopusdata.py index cdda913..9510a98 100644 --- a/opustools_pkg/opustools/readopusdata.py +++ b/opustools_pkg/opustools/readopusdata.py @@ -50,11 +50,13 @@ def create_table(cur): "CREATE INDEX IF NOT EXISTS idx_opusfile_updated ON opusfile(updated)" ) - cur.execute( - \"\"\" - CREATE INDEX IF NOT EXISTS idx_opusfile_6cols - ON opusfile (source, target, corpus, preprocessing, version, url) - \"\"\" + # Composite index for multi-column lookups: + # WHERE source=? AND target=? AND corpus=? AND preprocessing=? AND version=? AND url=? + cur.execute( + """ + CREATE INDEX IF NOT EXISTS idx_opusfile_6cols + ON opusfile (source, target, corpus, preprocessing, version, url) + """ ) From fe49257d038183af8bfb7ee57868694ad1259673 Mon Sep 17 00:00:00 2001 From: Lukas Weymann <62622663+lukasweymann@users.noreply.github.com> Date: Tue, 9 Dec 2025 16:56:57 +0100 Subject: [PATCH 4/4] Fix string encoding and update SQL syntax --- opustools_pkg/opustools/readopusdata.py | 418 +++++++----------------- 1 file changed, 126 insertions(+), 292 deletions(-) diff --git a/opustools_pkg/opustools/readopusdata.py b/opustools_pkg/opustools/readopusdata.py index 9510a98..9512e70 100644 --- a/opustools_pkg/opustools/readopusdata.py +++ b/opustools_pkg/opustools/readopusdata.py @@ -11,394 +11,228 @@ def read_url(url): - return urllib.request.urlopen(url).read().decode("utf-8").split("\n") + return urllib.request.urlopen(url).read().decode('utf-8').split('\n') def read_url_yaml(url, yaml): try: - raw = urllib.request.urlopen(url).read().decode("utf-8") + raw = urllib.request.urlopen(url).read().decode('utf-8') except urllib.error.HTTPError: - gzbytes = urllib.request.urlopen(url + ".gz").read() - raw = gzip.decompress(gzbytes).decode("utf-8") + gzbytes = urllib.request.urlopen(url+'.gz').read() + raw = gzip.decompress(gzbytes).decode('utf-8') data = yaml.load(raw) return data def create_table(cur): - create_opusfile_table = """ - CREATE TABLE IF NOT EXISTS opusfile ( - id INTEGER PRIMARY KEY, - source TEXT, - target TEXT, - corpus TEXT, - preprocessing TEXT, - version TEXT, - url TEXT NOT NULL UNIQUE, - size INTEGER, - documents INTEGER, - alignment_pairs INTEGER, - source_tokens INTEGER, - target_tokens INTEGER, - latest TEXT, - updated INTEGER - ); - """ + create_opusfile_table = '''CREATE TABLE IF NOT EXISTS opusfile ( + id integer PRIMARY KEY, + source text, + target text, 
+ corpus text, + preprocessing text, + version text, + url text, + size integer, + documents integer, + alignment_pairs integer, + source_tokens integer, + target_tokens integer, + latest text, + updated integer + );''' cur.execute(create_opusfile_table) - - # Index to speed up DELETE ... WHERE updated=0 - cur.execute( - "CREATE INDEX IF NOT EXISTS idx_opusfile_updated ON opusfile(updated)" - ) - - # Composite index for multi-column lookups: - # WHERE source=? AND target=? AND corpus=? AND preprocessing=? AND version=? AND url=? - cur.execute( - """ - CREATE INDEX IF NOT EXISTS idx_opusfile_6cols - ON opusfile (source, target, corpus, preprocessing, version, url) - """ - ) + create_url_index = 'CREATE INDEX IF NOT EXISTS idx_url ON opusfile(url)' + cur.execute(create_url_index) + cur.execute('CREATE INDEX IF NOT EXISTS idx_corpusdata ON opusfile(source,target,corpus,preprocessing,latest)') def execute_sql(cur, opusfile): - """ - Insert or update a row based on the UNIQUE 'url' column. - - opusfile is a tuple: - (source, target, corpus, preprocessing, version, - url, size, documents, alignment_pairs, - source_tokens, target_tokens, latest) - """ - columns = [ - "source", - "target", - "corpus", - "preprocessing", - "version", - "url", - "size", - "documents", - "alignment_pairs", - "source_tokens", - "target_tokens", - "latest", - ] - - # UPSERT on url: if url already exists, update fields and set updated=1 - sql = f""" - INSERT INTO opusfile ({", ".join(columns)}, updated) - VALUES ({",".join(["?"] * (len(columns) + 1))}) - ON CONFLICT(url) DO UPDATE SET - source = excluded.source, - target = excluded.target, - corpus = excluded.corpus, - preprocessing = excluded.preprocessing, - version = excluded.version, - size = excluded.size, - documents = excluded.documents, - alignment_pairs = excluded.alignment_pairs, - source_tokens = excluded.source_tokens, - target_tokens = excluded.target_tokens, - latest = excluded.latest, - updated = 1 - """ - # Append updated=1 to the values - cur.execute(sql, (*opusfile, 1)) + columns = ['source', 'target', 'corpus', 'preprocessing', 'version', 'url', 'size', 'documents', 'alignment_pairs', 'source_tokens', 'target_tokens', 'latest'] + #wheres = [f'{columns[i]}="{opusfile[i]}"' for i in range(6)] + #sql = f'SELECT * FROM opusfile WHERE {" AND ".join(wheres)}' + sql = f'SELECT * FROM opusfile WHERE url="{opusfile[5]}"' + res = cur.execute(sql).fetchall() + if len(res) == 1: + sets = [f'{columns[i]}="{opusfile[i]}"' for i in range(6, len(columns))] + sql = f'UPDATE opusfile SET {", ".join(sets)}, updated=1 WHERE id={res[0][0]}' + cur.execute(sql) + elif len(res) == 0: + sql = f'INSERT INTO opusfile({", ".join(columns)}, updated) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,1)' + cur.execute(sql, opusfile) def get_lang_info(name, data, data_type, info): - source, target, documents, alignment_pairs, source_tokens, target_tokens = ( - "", - "", - "", - "", - "", - "", - ) + source, target, documents, alignment_pairs, source_tokens, target_tokens = '', '', '', '', '', '' source = name - if data_type in ["bitexts", "moses", "tmx"]: - names = name.split("-") + if data_type in ['bitexts', 'moses', 'tmx']: + names = name.split('-') if len(names) != 2: - logger.warning( - f'{info} {data_type} {name}: cannot split name "{name}" into two language codes' - ) + logger.warning(f'{info} {data_type} {name}: cannot split name "{name}" into two language codes') else: source, target = names - documents = "" - if data_type in ["bitexts", "monolingual"]: - documents = data.get("files", "") - 
if documents == "": + documents = '' + if data_type in ['bitexts', 'monolingual']: + documents = data.get('files', '') + if documents == '': logger.warning(f'{info} {data_type} {name} is missing "files"') - if data_type in ["bitexts", "moses"]: - alignment_pairs = data.get("alignments", "") - if alignment_pairs == "": + if data_type in ['bitexts', 'moses']: + alignment_pairs = data.get('alignments', '') + if alignment_pairs == '': logger.warning(f'{info} {data_type} {name} is missing "alignments"') - elif data_type == "tmx": - alignment_pairs = data.get("translation units", "") - if alignment_pairs == "": - logger.warning( - f'{info} {data_type} {name} is missing "translation units"' - ) - elif data_type == "monolingual": - alignment_pairs = data.get("sentences", "") - if alignment_pairs == "": + elif data_type == 'tmx': + alignment_pairs = data.get('translation units', '') + if alignment_pairs == '': + logger.warning(f'{info} {data_type} {name} is missing "translation units"') + elif data_type == 'monolingual': + alignment_pairs = data.get('sentences', '') + if alignment_pairs == '': logger.warning(f'{info} {data_type} {name} is missing "sentences"') - if data_type == "monolingual": - source_tokens = data.get("tokens", "") - if source_tokens == "": + if data_type == 'monolingual': + source_tokens = data.get('tokens', '') + if source_tokens == '': logger.warning(f'{info} {data_type} {name} is missing "tokens"') - target_tokens = "" + target_tokens = '' else: - source_tokens = data.get("source language tokens", "") - if source_tokens == "": - logger.warning( - f'{info} {data_type} {name} is missing "source language tokens"' - ) - target_tokens = data.get("target language tokens", "") - if target_tokens == "": - logger.warning( - f'{info} {data_type} {name} is missing "target language tokens"' - ) + source_tokens = data.get('source language tokens', '') + if source_tokens == '': + logger.warning(f'{info} {data_type} {name} is missing "source language tokens"') + target_tokens = data.get('target language tokens', '') + if target_tokens == '': + logger.warning(f'{info} {data_type} {name} is missing "target language tokens"') return source, target, documents, alignment_pairs, source_tokens, target_tokens def get_size_url_prep(name, data, data_type, info): - size, url, preprocessing = "", "", "" - if data_type in ["tmx", "moses"]: - size = data.get("download size", "") - if size == "": - logger.warning( - f'{info} {data_type} {name} is missing "download size"' - ) + size, url, preprocessing = '','','' + if data_type in ['tmx', 'moses']: + size = data.get('download size', '') + if size == '': + logger.warning(f'{info} {data_type} {name} is missing "download size"') else: - size = int(int(size) / 1024) - url = data.get("download url", "") - if url == "": - logger.warning( - f'{info} {data_type} {name} is missing "download url"' - ) - elif data_type in ["bitexts", "monolingual"]: - size = data.get("size", "") - if size == "": + size = int(int(size)/1024) + url = data.get('download url', '') + if url == '': + logger.warning(f'{info} {data_type} {name} is missing "download url"') + elif data_type in ['bitexts', 'monolingual']: + size = data.get('size', '') + if size == '': logger.warning(f'{info} {data_type} {name} is missing "size"') else: - size = int(int(size) / 1024) - url = data.get("url", "") - if url == "": + size = int(int(size)/1024) + url = data.get('url', '') + if url == '': logger.warning(f'{info} {data_type} {name} is missing "url"') - pre_step = url.split("/") + pre_step = 
url.split('/') if len(pre_step) < 2: - logger.warning( - f'{info} {data_type} {name}: cannot find preprocessing from url "{url}"' - ) + logger.warning(f'{info} {data_type} {name}: cannot find preprocessing from url "{url}"') else: preprocessing = pre_step[-2] return size, url, preprocessing - def get_tmx_entries(corpus, version, latest, tmx, cur, info): for item in tmx: - source, target, documents, alignment_pairs, source_tokens, target_tokens = ( - get_lang_info(item, tmx[item], "tmx", info) - ) - size, url, preprocessing = get_size_url_prep( - item, tmx[item], "tmx", info - ) - opusfile = ( - source, - target, - corpus, - preprocessing, - version, - url, - size, - documents, - alignment_pairs, - source_tokens, - target_tokens, - latest, - ) + source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, tmx[item], 'tmx', info) + size, url, preprocessing = get_size_url_prep(item, tmx[item], 'tmx', info) + opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) execute_sql(cur, opusfile) def get_moses_entries(corpus, version, latest, moses, cur, info): for item in moses: - source, target, documents, alignment_pairs, source_tokens, target_tokens = ( - get_lang_info(item, moses[item], "moses", info) - ) - size, url, preprocessing = get_size_url_prep( - item, moses[item], "moses", info - ) - opusfile = ( - source, - target, - corpus, - preprocessing, - version, - url, - size, - documents, - alignment_pairs, - source_tokens, - target_tokens, - latest, - ) + source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, moses[item], 'moses', info) + size, url, preprocessing = get_size_url_prep(item, moses[item], 'moses', info) + opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) execute_sql(cur, opusfile) -def get_monolingual_entries( - corpus, version, latest, monolingual, cur, info -): +def get_monolingual_entries(corpus, version, latest, monolingual, cur, info): for item in monolingual: - source, target, documents, alignment_pairs, source_tokens, target_tokens = ( - get_lang_info(item, monolingual[item], "monolingual", info) - ) - for entry in monolingual[item]["downloads"].items(): - size, url, preprocessing = get_size_url_prep( - item, entry[1], "monolingual", info - ) - opusfile = ( - source, - target, - corpus, - preprocessing, - version, - url, - size, - documents, - alignment_pairs, - source_tokens, - target_tokens, - latest, - ) + source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, monolingual[item], 'monolingual', info) + for entry in monolingual[item]['downloads'].items(): + size, url, preprocessing = get_size_url_prep(item, entry[1], 'monolingual', info) + opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) execute_sql(cur, opusfile) def get_bitext_entries(corpus, version, latest, bitexts, cur, info): for item in bitexts: - source, target, documents, alignment_pairs, source_tokens, target_tokens = ( - get_lang_info(item, bitexts[item], "bitexts", info) - ) - for entry in bitexts[item]["downloads"].items(): + source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, bitexts[item], 'bitexts', info) + for entry in bitexts[item]['downloads'].items(): # exclude monolingual files, they are added in 
the monolingual phase - if "language" not in entry[0]: - size, url, preprocessing = get_size_url_prep( - item, entry[1], "bitexts", info - ) - opusfile = ( - source, - target, - corpus, - preprocessing, - version, - url, - size, - documents, - alignment_pairs, - source_tokens, - target_tokens, - latest, - ) + if 'language' not in entry[0]: + size, url, preprocessing = get_size_url_prep(item, entry[1], 'bitexts', info) + opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) execute_sql(cur, opusfile) def remove_missing_items(cur): - # First remove rows not touched in this run - sql = "DELETE FROM opusfile WHERE updated=0" + sql = 'DELETE FROM opusfile WHERE updated=0' cur.execute(sql) - # Then mark all remaining rows as not updated, ready for the next run - sql = "UPDATE opusfile SET updated=0" + sql = 'UPDATE opusfile SET updated=0' cur.execute(sql) -def update_db(db_file=None, log_type="errors"): +def update_db(db_file=None, log_type='errors'): yaml = YAML() - if log_type == "warnings": - logging.basicConfig( - filename="opusdb_update_error.log", - level=logging.WARNING, - format="%(asctime)s %(levelname)s:%(name)s: %(message)s", - datefmt="%x %X", - ) + if log_type == 'warnings': + logging.basicConfig(filename='opusdb_update_error.log', level=logging.WARNING, + format='%(asctime)s %(levelname)s:%(name)s: %(message)s', datefmt='%x %X') else: - logging.basicConfig( - filename="opusdb_update_error.log", - level=logging.ERROR, - format="%(asctime)s %(levelname)s:%(name)s: %(message)s", - datefmt="%x %X", - ) + logging.basicConfig(filename='opusdb_update_error.log', level=logging.ERROR, + format='%(asctime)s %(levelname)s:%(name)s: %(message)s', datefmt='%x %X') if not db_file: - db_file = os.path.join(os.path.dirname(__file__), "opusdata.db") + db_file = os.path.join(os.path.dirname(__file__), 'opusdata.db') con = sqlite3.connect(db_file) cur = con.cursor() - # Optional pragmas for faster bulk updates; tweak as you see fit - cur.execute("PRAGMA journal_mode=WAL;") - cur.execute("PRAGMA synchronous=NORMAL;") - cur.execute("PRAGMA temp_store=MEMORY;") - # Negative value means size in KB; here ~200MB cache - cur.execute("PRAGMA cache_size=-200000;") - create_table(cur) - URL_BASE = "https://raw.githubusercontent.com/Helsinki-NLP/OPUS/main/corpus/" - index_info = read_url(URL_BASE + "index-info.txt") - - corpus = None - latest_v = None + URL_BASE = 'https://raw.githubusercontent.com/Helsinki-NLP/OPUS/main/corpus/' + index_info = read_url(URL_BASE + 'index-info.txt') for info in index_info: - info_s = info.split("/") + info_s = info.split('/') if len(info_s) == 2: try: gen_info = read_url_yaml(URL_BASE + info, yaml) - except ( - scanner.ScannerError, - urllib.error.HTTPError, - reader.ReaderError, - ) as e: - logger.error(f"{info}, {type(e).__name__}: {e}") + except (scanner.ScannerError, urllib.error.HTTPError, reader.ReaderError) as e: + logger.error(f'{info}, {type(e).__name__}: {e}') gen_info = {} - corpus = gen_info.get("name") + corpus = gen_info.get('name') if not corpus: - logger.warning(f"{info}, corpus name missing") - print(f"Processing corpus {corpus}") - latest_v = gen_info.get("latest_release") + logger.warning(f'{info}, corpus name missing') + print(f'Processing corpus {corpus}') + latest_v = gen_info.get('latest_release') if not latest_v: - logger.error(f"{info}, latest_release missing") + logger.error(f'{info}, latest_release missing') elif len(info_s) == 3: version = info_s[1] if not corpus: 
corpus = info_s[0] - latest = "False" + latest = 'False' if version == latest_v: - latest = "True" - stats = info.replace("info.yaml", "statistics.yaml") + latest = 'True' + stats = info.replace('info.yaml', 'statistics.yaml') try: corpus_data = read_url_yaml(URL_BASE + stats, yaml) - except ( - scanner.ScannerError, - urllib.error.HTTPError, - reader.ReaderError, - ) as e: - logger.error(f"{stats}, {type(e).__name__}: {e}") + except (scanner.ScannerError, urllib.error.HTTPError, reader.ReaderError) as e: + logger.error(f'{stats}, {type(e).__name__}: {e}') continue - get_entries = { - "bitexts": get_bitext_entries, - "monolingual": get_monolingual_entries, - "moses": get_moses_entries, - "tmx": get_tmx_entries, - } + get_entries = {'bitexts': get_bitext_entries, + 'monolingual': get_monolingual_entries, + 'moses': get_moses_entries, + 'tmx': get_tmx_entries} if not corpus_data: - logger.error(f"{info}, corpus_data is empty") + logger.error(f'{info}, corpus_data is empty') continue for item in get_entries.keys(): @@ -406,7 +240,7 @@ def update_db(db_file=None, log_type="errors"): if sub_data: get_entries[item](corpus, version, latest, sub_data, cur, info) else: - logger.warning(f"{info}, {item} data missing") + logger.warning(f'{info}, {item} data missing') remove_missing_items(cur)
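
For reference, a minimal usage sketch of the DbOperations class as refactored in PATCH 1/4 (context manager, shared connection, parameterized query helpers). This is illustrative only and not part of the patches: the import path opustools.db_operations and ensure_indexes() come from PATCH 1/4, while the database path "opusdata.db" and the example parameter values ("en", "de", "Books") are placeholders.

    # Illustrative sketch, assuming the refactored DbOperations from PATCH 1/4.
    # The db_file path and the parameter values below are placeholders.
    from opustools.db_operations import DbOperations

    with DbOperations(db_file="opusdata.db") as db:
        # Optional one-time step: create the indexes matching the query patterns.
        db.ensure_indexes()

        # All helpers bind values through "?" placeholders via _build_where().
        corpora = db.run_corpora_query(
            {"source": "en", "target": "de", "version": "latest"}
        )
        languages = db.run_languages_query({"source": "en"})
        files = db.get_corpora(
            {"corpus": "Books", "source": "en", "target": "de",
             "preprocessing": "xml", "version": "latest"}
        )

    print(corpora[:5], languages[:5], len(files))

The connection is opened once in __init__ and closed by __exit__, so callers no longer pay the per-query connect/close cost of the old run_query().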