From 132377dc537924150f1129c355ed44f4cf110aae Mon Sep 17 00:00:00 2001 From: Lukas Weymann <62622663+lukasweymann@users.noreply.github.com> Date: Thu, 4 Dec 2025 12:23:58 +0100 Subject: [PATCH 1/4] Update db_operations.py --- opustools_pkg/opustools/db_operations.py | 385 +++++++++++++++++------ 1 file changed, 292 insertions(+), 93 deletions(-) diff --git a/opustools_pkg/opustools/db_operations.py b/opustools_pkg/opustools/db_operations.py index 5060d5c..be8fbe9 100644 --- a/opustools_pkg/opustools/db_operations.py +++ b/opustools_pkg/opustools/db_operations.py @@ -1,136 +1,335 @@ import os import sqlite3 +from typing import Any, Dict, List, Tuple, Optional + class DbOperations: + def __init__(self, db_file: Optional[str] = None) -> None: + # Resolve DB path + self.db_file = db_file or os.environ.get("OPUSAPI_DB") + if not self.db_file: + raise ValueError("OPUSAPI_DB environment variable or db_file must be set") + + # Single, reusable connection + # check_same_thread=False allows usage from different threads if your app does that + self.conn = sqlite3.connect(self.db_file, check_same_thread=False) + self.conn.row_factory = sqlite3.Row + + # Performance/safety PRAGMAs – run once per connection + self._configure_connection() + + def _configure_connection(self) -> None: + # WAL improves concurrency and often speed + self.conn.execute("PRAGMA journal_mode = WAL;") + # Good balance of durability and performance + self.conn.execute("PRAGMA synchronous = NORMAL;") + # Keep temp structures in memory + self.conn.execute("PRAGMA temp_store = MEMORY;") + # Enforce FK constraints + self.conn.execute("PRAGMA foreign_keys = ON;") + + def close(self) -> None: + if getattr(self, "conn", None) is not None: + self.conn.close() + self.conn = None + + def __enter__(self) -> "DbOperations": + return self - def __init__(self, db_file=None): - if db_file: - self.db_file = db_file - else: - self.db_file = os.environ.get('OPUSAPI_DB') + def __exit__(self, exc_type, exc, tb) -> None: + self.close() - def clean_up_parameters(self, parameters): - remove = [] - valid_keys = ['corpus', 'id', 'latest', 'preprocessing', 'source', 'target', 'version', 'corpora', 'languages'] + def __del__(self) -> None: + # Best-effort cleanup + try: + self.close() + except Exception: + pass + + # ------------------------- + # Utility helpers + # ------------------------- + + def clean_up_parameters(self, parameters: Dict[str, str]) -> Dict[str, str]: + """ + Keep only allowed keys, strip quote characters from values. + """ + remove: List[str] = [] + valid_keys = [ + "corpus", + "id", + "latest", + "preprocessing", + "source", + "target", + "version", + "corpora", + "languages", + ] for key, value in parameters.items(): if key not in valid_keys: remove.append(key) continue - value = value.replace('"', '') - value = value.replace('\'', '') + # These replacements are mostly unnecessary with parameterized queries, + # but kept for backward-compatibility. + value = value.replace('"', "") + value = value.replace("'", "") parameters[key] = value for key in remove: del parameters[key] return parameters - def run_query(self, sql_command): - conn = sqlite3.connect(self.db_file) - query = conn.execute(sql_command) - value_list = query.fetchall() - keys = [i[0] for i in query.description] - conn.close() - return keys, value_list + def _build_where( + self, parameters: Dict[str, Any] + ) -> Tuple[str, List[Any]]: + """ + Build a WHERE clause and parameter list from a dict like {"source": "en", "target": "de"}. 
+ Returns (clause_string, values_list), e.g. ("source = ? AND target = ?", ["en", "de"]) + """ + clauses: List[str] = [] + values: List[Any] = [] + for k, v in parameters.items(): + clauses.append(f"{k} = ?") + values.append(v) + return " AND ".join(clauses), values + + def run_query( + self, sql_command: str, params: Optional[List[Any]] = None + ) -> Tuple[List[str], List[sqlite3.Row]]: + """ + Execute a query and return (column_names, list_of_rows). + """ + cur = self.conn.execute(sql_command, params or []) + rows = cur.fetchall() + keys = [d[0] for d in cur.description] + return keys, rows + + # ------------------------- + # Public query helpers + # ------------------------- + + def sort_source_target(self, parameters: Dict[str, str]) -> Dict[str, str]: + source = parameters.get("source") + target = parameters.get("target") + if source and target: + sou_tar = sorted([source, target]) + parameters["source"] = sou_tar[0] + parameters["target"] = sou_tar[1] + return parameters + + def convert_latest(self, parameters: Dict[str, str]) -> Dict[str, str]: + version = parameters.get("version") + if version and version == "latest": + parameters["latest"] = "True" + del parameters["version"] + return parameters + + def run_default_query( + self, parameters: Dict[str, str], suffix: str = "" + ) -> List[Dict[str, Any]]: + columns = [ + "alignment_pairs", + "corpus", + "documents", + "id", + "latest", + "preprocessing", + "size", + "source", + "source_tokens", + "target", + "target_tokens", + "url", + "version", + ] - def run_default_query(self, parameters, suffix=''): - columns = ['alignment_pairs', 'corpus', 'documents', 'id', 'latest', 'preprocessing', 'size', 'source', 'source_tokens', 'target', 'target_tokens', 'url', 'version'] + # We mutate parameters in this method, so work on a local copy + parameters = dict(parameters) parameters = self.sort_source_target(parameters) - sql_command = f'SELECT {", ".join(columns)} FROM opusfile WHERE '+' AND '.join([f"{k} = '{v}'" for k, v in parameters.items()]) + suffix - keys, value_list = self.run_query(sql_command) - ret = [{k: v for k, v in zip(keys,values)} for values in value_list] - if 'preprocessing' not in parameters.keys() and parameters.get('target'): - param_mono_src = parameters.copy() - param_mono_src['target'] = '' - sql_command = f'SELECT {", ".join(columns)} FROM opusfile WHERE '+' AND '.join([f"{k} = '{v}'" for k, v in param_mono_src.items()]) + suffix - keys, value_list = self.run_query(sql_command) - ret = ret + [{k: v for k, v in zip(keys,values)} for values in value_list] - - param_mono_trg = parameters.copy() - param_mono_trg['source'] = parameters['target'] - param_mono_trg['target'] = '' - sql_command = f'SELECT {", ".join(columns)} FROM opusfile WHERE '+' AND '.join([f"{k} = '{v}'" for k, v in param_mono_trg.items()]) + suffix - keys, value_list = self.run_query(sql_command) - ret = ret + [{k: v for k, v in zip(keys,values)} for values in value_list] + + where_clause, values = self._build_where(parameters) + + sql_command = f"SELECT {', '.join(columns)} FROM opusfile" + if where_clause: + sql_command += " WHERE " + where_clause + if suffix: + # suffix usually like " AND target != ''" or " AND source != ''" + sql_command += suffix + + keys, rows = self.run_query(sql_command, values) + ret = [{k: row[idx] for idx, k in enumerate(keys)} for row in rows] + + # Monolingual fetches when target is present but preprocessing not specified + if "preprocessing" not in parameters.keys() and parameters.get("target"): + # 1) Mono source: target 
= '' + param_mono_src = dict(parameters) + param_mono_src["target"] = "" + where_clause, values = self._build_where(param_mono_src) + sql_command = ( + f"SELECT {', '.join(columns)} FROM opusfile WHERE {where_clause}" + ) + suffix + keys, rows = self.run_query(sql_command, values) + ret.extend({k: row[idx] for idx, k in enumerate(keys)} for row in rows) + + # 2) Mono target: swap source/target, then set target = '' + param_mono_trg = dict(parameters) + param_mono_trg["source"] = parameters["target"] + param_mono_trg["target"] = "" + where_clause, values = self._build_where(param_mono_trg) + sql_command = ( + f"SELECT {', '.join(columns)} FROM opusfile WHERE {where_clause}" + ) + suffix + keys, rows = self.run_query(sql_command, values) + ret.extend({k: row[idx] for idx, k in enumerate(keys)} for row in rows) return ret - def run_corpora_query(self, parameters): + def run_corpora_query(self, parameters: Dict[str, str]) -> List[str]: + # Work on a copy to avoid side effects + parameters = dict(parameters) parameters = self.convert_latest(parameters) parameters = self.sort_source_target(parameters) - if 'corpora' in parameters.keys(): - del parameters['corpora'] + parameters.pop("corpora", None) + + sql_command = "SELECT DISTINCT corpus FROM opusfile" + where_clause, values = self._build_where(parameters) + + if where_clause: + sql_command += " WHERE " + where_clause - sql_command = 'SELECT DISTINCT corpus FROM opusfile' - if len(parameters) > 0: - sql_command = sql_command+' WHERE '+' AND '.join([f"{k} = '{v}'" for k, v in parameters.items()]) - _, value_list = self.run_query(sql_command) - values = [v[0] for v in value_list] - return values + _, rows = self.run_query(sql_command, values) + return [row[0] for row in rows] - def run_languages_query(self, parameters): + def run_languages_query(self, parameters: Dict[str, str]) -> List[str]: + # Work on a copy to avoid mutating caller's dict + parameters = dict(parameters) parameters = self.convert_latest(parameters) parameters = self.sort_source_target(parameters) - if 'languages' in parameters.keys(): - del parameters['languages'] - - sql_command = 'SELECT DISTINCT source FROM opusfile ' - if len(parameters) > 0: - source = parameters.get('source') - if source: - sql_command = 'SELECT DISTINCT target FROM opusfile where '+' AND '.join([f"{k} = '{v}'" for k, v in parameters.items()]) + f" AND target != '{source}' AND target != '' UNION SELECT DISTINCT source FROM opusfile " - parameters['target'] = parameters['source'] - del parameters['source'] - sql_command = sql_command + 'WHERE ' - sql_command = sql_command + ' AND '.join([f"{k} = '{v}'" for k, v in parameters.items()]) - _, value_list = self.run_query(sql_command) - values = [v[0] for v in value_list] - return values - - def sort_source_target(self, parameters): - source = parameters.get('source') - target = parameters.get('target') - if source and target: - sou_tar = sorted([source, target]) - parameters['source'] = sou_tar[0] - parameters['target'] = sou_tar[1] - return parameters + parameters.pop("languages", None) - def convert_latest(self, parameters): - version = parameters.get('version') - if version and version == 'latest': - parameters['latest'] = 'True' - del parameters['version'] - return parameters + # If there are no filters, just list sources + if not parameters: + sql_command = "SELECT DISTINCT source FROM opusfile" + _, rows = self.run_query(sql_command) + return [row[0] for row in rows] + + base_params = dict(parameters) + source = base_params.get("source") + + # Case 1: 
source specified -> UNION of target and source + if source: + # First part: DISTINCT target with filters + target != source/'' + + where1_clause_parts: List[str] = [] + where1_values: List[Any] = [] + for k, v in base_params.items(): + where1_clause_parts.append(f"{k} = ?") + where1_values.append(v) + where1_clause = " AND ".join(where1_clause_parts) + + sql1 = ( + "SELECT DISTINCT target FROM opusfile WHERE " + + where1_clause + + " AND target != ? AND target != ''" + ) + where1_values.extend([source]) + + # Second part: DISTINCT source with source/target swapped + swapped = dict(base_params) + swapped["target"] = swapped["source"] + del swapped["source"] + where2_clause, where2_values = self._build_where(swapped) + + sql2 = "SELECT DISTINCT source FROM opusfile" + if where2_clause: + sql2 += " WHERE " + where2_clause + + # Combine UNION query and parameters + sql_command = sql1 + " UNION " + sql2 + params_list = where1_values + where2_values + + _, rows = self.run_query(sql_command, params_list) + return [row[0] for row in rows] - def get_corpora(self, parameters): + # Case 2: no 'source' filter → basic DISTINCT source with other filters + where_clause, values = self._build_where(base_params) + sql_command = "SELECT DISTINCT source FROM opusfile" + if where_clause: + sql_command += " WHERE " + where_clause + + _, rows = self.run_query(sql_command, values) + return [row[0] for row in rows] + + def get_corpora(self, parameters: Dict[str, str]) -> List[Dict[str, Any]]: + # Work on a copy + parameters = dict(parameters) parameters = self.convert_latest(parameters) - a_parameters = parameters.copy() - preprocessing = parameters.get('preprocessing') + a_parameters = dict(parameters) + preprocessing = parameters.get("preprocessing") suffix = "" - if preprocessing in ['xml', 'raw', 'parsed']: + + if preprocessing in ["xml", "raw", "parsed"]: # Get xml alignment files - a_parameters['preprocessing'] = 'xml' + a_parameters["preprocessing"] = "xml" # Don't get the sentence file - suffix=' AND target != ""' + suffix = ' AND target != ""' + ret = self.run_default_query(a_parameters, suffix=suffix) - source = parameters.get('source') - target = parameters.get('target') + + source = parameters.get("source") + target = parameters.get("target") + if source and not target: # Get items where the queried language is on the target side - a_parameters['target'] = parameters['source'] - del a_parameters['source'] - ret = self.run_default_query(a_parameters, suffix=" AND source != ''") + ret + a_parameters = dict(parameters) + a_parameters["target"] = parameters["source"] + a_parameters.pop("source", None) + ret = ( + self.run_default_query(a_parameters, suffix=" AND source != ''") + + ret + ) - if preprocessing in ['xml', 'raw', 'parsed']: + if preprocessing in ["xml", "raw", "parsed"]: # Get sentence files languages = set() for item in ret: - languages.add(item['source']) - languages.add(item['target']) + languages.add(item.get("source", "")) + languages.add(item.get("target", "")) - parameters['target'] = '' - for language in sorted(list(languages)): - parameters['source'] = language - ret = ret + self.run_default_query(parameters) + parameters["target"] = "" + for language in sorted(languages): + if not language: + continue + parameters["source"] = language + ret.extend(self.run_default_query(parameters)) return ret + + # ------------------------- + # Optional: index helper + # ------------------------- + + def ensure_indexes(self) -> None: + """ + Create indexes that match the query patterns used in this 
class. + Call this once (e.g. from a migration or admin script). + """ + self.conn.executescript( + """ + CREATE INDEX IF NOT EXISTS idx_opusfile_src_trg_pre_latest + ON opusfile(source, target, preprocessing, latest); + + CREATE INDEX IF NOT EXISTS idx_opusfile_corpus_latest_pre + ON opusfile(corpus, latest, preprocessing); + + CREATE INDEX IF NOT EXISTS idx_opusfile_source_pre_latest + ON opusfile(source, preprocessing, latest); + + CREATE INDEX IF NOT EXISTS idx_opusfile_target_pre_latest + ON opusfile(target, preprocessing, latest); + """ + ) + # Update statistics so the query planner can use the new indexes optimally + self.conn.execute("ANALYZE;") + self.conn.commit() From 2bfa178e9fa1df8adbe229542c25ae16fccd0acd Mon Sep 17 00:00:00 2001 From: Lukas Weymann <62622663+lukasweymann@users.noreply.github.com> Date: Tue, 9 Dec 2025 16:16:36 +0100 Subject: [PATCH 2/4] Refactor string quotes and enhance SQL statements Updated string quotes from single to double for consistency and improved SQL statements for better readability. --- opustools_pkg/opustools/readopusdata.py | 415 +++++++++++++++++------- 1 file changed, 290 insertions(+), 125 deletions(-) diff --git a/opustools_pkg/opustools/readopusdata.py b/opustools_pkg/opustools/readopusdata.py index 84a831b..cdda913 100644 --- a/opustools_pkg/opustools/readopusdata.py +++ b/opustools_pkg/opustools/readopusdata.py @@ -11,227 +11,392 @@ def read_url(url): - return urllib.request.urlopen(url).read().decode('utf-8').split('\n') + return urllib.request.urlopen(url).read().decode("utf-8").split("\n") def read_url_yaml(url, yaml): try: - raw = urllib.request.urlopen(url).read().decode('utf-8') + raw = urllib.request.urlopen(url).read().decode("utf-8") except urllib.error.HTTPError: - gzbytes = urllib.request.urlopen(url+'.gz').read() - raw = gzip.decompress(gzbytes).decode('utf-8') + gzbytes = urllib.request.urlopen(url + ".gz").read() + raw = gzip.decompress(gzbytes).decode("utf-8") data = yaml.load(raw) return data def create_table(cur): - create_opusfile_table = '''CREATE TABLE IF NOT EXISTS opusfile ( - id integer PRIMARY KEY, - source text, - target text, - corpus text, - preprocessing text, - version text, - url text, - size integer, - documents integer, - alignment_pairs integer, - source_tokens integer, - target_tokens integer, - latest text, - updated integer - );''' + create_opusfile_table = """ + CREATE TABLE IF NOT EXISTS opusfile ( + id INTEGER PRIMARY KEY, + source TEXT, + target TEXT, + corpus TEXT, + preprocessing TEXT, + version TEXT, + url TEXT NOT NULL UNIQUE, + size INTEGER, + documents INTEGER, + alignment_pairs INTEGER, + source_tokens INTEGER, + target_tokens INTEGER, + latest TEXT, + updated INTEGER + ); + """ cur.execute(create_opusfile_table) - create_url_index = 'CREATE INDEX IF NOT EXISTS idx_url ON opusfile(url)' - cur.execute(create_url_index) + + # Index to speed up DELETE ... 
WHERE updated=0 + cur.execute( + "CREATE INDEX IF NOT EXISTS idx_opusfile_updated ON opusfile(updated)" + ) + + cur.execute( + \"\"\" + CREATE INDEX IF NOT EXISTS idx_opusfile_6cols + ON opusfile (source, target, corpus, preprocessing, version, url) + \"\"\" + ) def execute_sql(cur, opusfile): - columns = ['source', 'target', 'corpus', 'preprocessing', 'version', 'url', 'size', 'documents', 'alignment_pairs', 'source_tokens', 'target_tokens', 'latest'] - #wheres = [f'{columns[i]}="{opusfile[i]}"' for i in range(6)] - #sql = f'SELECT * FROM opusfile WHERE {" AND ".join(wheres)}' - sql = f'SELECT * FROM opusfile WHERE url="{opusfile[5]}"' - res = cur.execute(sql).fetchall() - if len(res) == 1: - sets = [f'{columns[i]}="{opusfile[i]}"' for i in range(6, len(columns))] - sql = f'UPDATE opusfile SET {", ".join(sets)}, updated=1 WHERE id={res[0][0]}' - cur.execute(sql) - elif len(res) == 0: - sql = f'INSERT INTO opusfile({", ".join(columns)}, updated) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,1)' - cur.execute(sql, opusfile) + """ + Insert or update a row based on the UNIQUE 'url' column. + + opusfile is a tuple: + (source, target, corpus, preprocessing, version, + url, size, documents, alignment_pairs, + source_tokens, target_tokens, latest) + """ + columns = [ + "source", + "target", + "corpus", + "preprocessing", + "version", + "url", + "size", + "documents", + "alignment_pairs", + "source_tokens", + "target_tokens", + "latest", + ] + + # UPSERT on url: if url already exists, update fields and set updated=1 + sql = f""" + INSERT INTO opusfile ({", ".join(columns)}, updated) + VALUES ({",".join(["?"] * (len(columns) + 1))}) + ON CONFLICT(url) DO UPDATE SET + source = excluded.source, + target = excluded.target, + corpus = excluded.corpus, + preprocessing = excluded.preprocessing, + version = excluded.version, + size = excluded.size, + documents = excluded.documents, + alignment_pairs = excluded.alignment_pairs, + source_tokens = excluded.source_tokens, + target_tokens = excluded.target_tokens, + latest = excluded.latest, + updated = 1 + """ + # Append updated=1 to the values + cur.execute(sql, (*opusfile, 1)) def get_lang_info(name, data, data_type, info): - source, target, documents, alignment_pairs, source_tokens, target_tokens = '', '', '', '', '', '' + source, target, documents, alignment_pairs, source_tokens, target_tokens = ( + "", + "", + "", + "", + "", + "", + ) source = name - if data_type in ['bitexts', 'moses', 'tmx']: - names = name.split('-') + if data_type in ["bitexts", "moses", "tmx"]: + names = name.split("-") if len(names) != 2: - logger.warning(f'{info} {data_type} {name}: cannot split name "{name}" into two language codes') + logger.warning( + f'{info} {data_type} {name}: cannot split name "{name}" into two language codes' + ) else: source, target = names - documents = '' - if data_type in ['bitexts', 'monolingual']: - documents = data.get('files', '') - if documents == '': + documents = "" + if data_type in ["bitexts", "monolingual"]: + documents = data.get("files", "") + if documents == "": logger.warning(f'{info} {data_type} {name} is missing "files"') - if data_type in ['bitexts', 'moses']: - alignment_pairs = data.get('alignments', '') - if alignment_pairs == '': + if data_type in ["bitexts", "moses"]: + alignment_pairs = data.get("alignments", "") + if alignment_pairs == "": logger.warning(f'{info} {data_type} {name} is missing "alignments"') - elif data_type == 'tmx': - alignment_pairs = data.get('translation units', '') - if alignment_pairs == '': - logger.warning(f'{info} 
{data_type} {name} is missing "translation units"') - elif data_type == 'monolingual': - alignment_pairs = data.get('sentences', '') - if alignment_pairs == '': + elif data_type == "tmx": + alignment_pairs = data.get("translation units", "") + if alignment_pairs == "": + logger.warning( + f'{info} {data_type} {name} is missing "translation units"' + ) + elif data_type == "monolingual": + alignment_pairs = data.get("sentences", "") + if alignment_pairs == "": logger.warning(f'{info} {data_type} {name} is missing "sentences"') - if data_type == 'monolingual': - source_tokens = data.get('tokens', '') - if source_tokens == '': + if data_type == "monolingual": + source_tokens = data.get("tokens", "") + if source_tokens == "": logger.warning(f'{info} {data_type} {name} is missing "tokens"') - target_tokens = '' + target_tokens = "" else: - source_tokens = data.get('source language tokens', '') - if source_tokens == '': - logger.warning(f'{info} {data_type} {name} is missing "source language tokens"') - target_tokens = data.get('target language tokens', '') - if target_tokens == '': - logger.warning(f'{info} {data_type} {name} is missing "target language tokens"') + source_tokens = data.get("source language tokens", "") + if source_tokens == "": + logger.warning( + f'{info} {data_type} {name} is missing "source language tokens"' + ) + target_tokens = data.get("target language tokens", "") + if target_tokens == "": + logger.warning( + f'{info} {data_type} {name} is missing "target language tokens"' + ) return source, target, documents, alignment_pairs, source_tokens, target_tokens def get_size_url_prep(name, data, data_type, info): - size, url, preprocessing = '','','' - if data_type in ['tmx', 'moses']: - size = data.get('download size', '') - if size == '': - logger.warning(f'{info} {data_type} {name} is missing "download size"') + size, url, preprocessing = "", "", "" + if data_type in ["tmx", "moses"]: + size = data.get("download size", "") + if size == "": + logger.warning( + f'{info} {data_type} {name} is missing "download size"' + ) else: - size = int(int(size)/1024) - url = data.get('download url', '') - if url == '': - logger.warning(f'{info} {data_type} {name} is missing "download url"') - elif data_type in ['bitexts', 'monolingual']: - size = data.get('size', '') - if size == '': + size = int(int(size) / 1024) + url = data.get("download url", "") + if url == "": + logger.warning( + f'{info} {data_type} {name} is missing "download url"' + ) + elif data_type in ["bitexts", "monolingual"]: + size = data.get("size", "") + if size == "": logger.warning(f'{info} {data_type} {name} is missing "size"') else: - size = int(int(size)/1024) - url = data.get('url', '') - if url == '': + size = int(int(size) / 1024) + url = data.get("url", "") + if url == "": logger.warning(f'{info} {data_type} {name} is missing "url"') - pre_step = url.split('/') + pre_step = url.split("/") if len(pre_step) < 2: - logger.warning(f'{info} {data_type} {name}: cannot find preprocessing from url "{url}"') + logger.warning( + f'{info} {data_type} {name}: cannot find preprocessing from url "{url}"' + ) else: preprocessing = pre_step[-2] return size, url, preprocessing + def get_tmx_entries(corpus, version, latest, tmx, cur, info): for item in tmx: - source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, tmx[item], 'tmx', info) - size, url, preprocessing = get_size_url_prep(item, tmx[item], 'tmx', info) - opusfile = (source, target, corpus, preprocessing, version, url, size, 
documents, alignment_pairs, source_tokens, target_tokens, latest) + source, target, documents, alignment_pairs, source_tokens, target_tokens = ( + get_lang_info(item, tmx[item], "tmx", info) + ) + size, url, preprocessing = get_size_url_prep( + item, tmx[item], "tmx", info + ) + opusfile = ( + source, + target, + corpus, + preprocessing, + version, + url, + size, + documents, + alignment_pairs, + source_tokens, + target_tokens, + latest, + ) execute_sql(cur, opusfile) def get_moses_entries(corpus, version, latest, moses, cur, info): for item in moses: - source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, moses[item], 'moses', info) - size, url, preprocessing = get_size_url_prep(item, moses[item], 'moses', info) - opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) + source, target, documents, alignment_pairs, source_tokens, target_tokens = ( + get_lang_info(item, moses[item], "moses", info) + ) + size, url, preprocessing = get_size_url_prep( + item, moses[item], "moses", info + ) + opusfile = ( + source, + target, + corpus, + preprocessing, + version, + url, + size, + documents, + alignment_pairs, + source_tokens, + target_tokens, + latest, + ) execute_sql(cur, opusfile) -def get_monolingual_entries(corpus, version, latest, monolingual, cur, info): +def get_monolingual_entries( + corpus, version, latest, monolingual, cur, info +): for item in monolingual: - source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, monolingual[item], 'monolingual', info) - for entry in monolingual[item]['downloads'].items(): - size, url, preprocessing = get_size_url_prep(item, entry[1], 'monolingual', info) - opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) + source, target, documents, alignment_pairs, source_tokens, target_tokens = ( + get_lang_info(item, monolingual[item], "monolingual", info) + ) + for entry in monolingual[item]["downloads"].items(): + size, url, preprocessing = get_size_url_prep( + item, entry[1], "monolingual", info + ) + opusfile = ( + source, + target, + corpus, + preprocessing, + version, + url, + size, + documents, + alignment_pairs, + source_tokens, + target_tokens, + latest, + ) execute_sql(cur, opusfile) def get_bitext_entries(corpus, version, latest, bitexts, cur, info): for item in bitexts: - source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, bitexts[item], 'bitexts', info) - for entry in bitexts[item]['downloads'].items(): + source, target, documents, alignment_pairs, source_tokens, target_tokens = ( + get_lang_info(item, bitexts[item], "bitexts", info) + ) + for entry in bitexts[item]["downloads"].items(): # exclude monolingual files, they are added in the monolingual phase - if 'language' not in entry[0]: - size, url, preprocessing = get_size_url_prep(item, entry[1], 'bitexts', info) - opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) + if "language" not in entry[0]: + size, url, preprocessing = get_size_url_prep( + item, entry[1], "bitexts", info + ) + opusfile = ( + source, + target, + corpus, + preprocessing, + version, + url, + size, + documents, + alignment_pairs, + source_tokens, + target_tokens, + latest, + ) execute_sql(cur, opusfile) def remove_missing_items(cur): - sql = 'DELETE FROM 
opusfile WHERE updated=0' + # First remove rows not touched in this run + sql = "DELETE FROM opusfile WHERE updated=0" cur.execute(sql) - sql = 'UPDATE opusfile SET updated=0' + # Then mark all remaining rows as not updated, ready for the next run + sql = "UPDATE opusfile SET updated=0" cur.execute(sql) -def update_db(db_file=None, log_type='errors'): +def update_db(db_file=None, log_type="errors"): yaml = YAML() - if log_type == 'warnings': - logging.basicConfig(filename='opusdb_update_error.log', level=logging.WARNING, - format='%(asctime)s %(levelname)s:%(name)s: %(message)s', datefmt='%x %X') + if log_type == "warnings": + logging.basicConfig( + filename="opusdb_update_error.log", + level=logging.WARNING, + format="%(asctime)s %(levelname)s:%(name)s: %(message)s", + datefmt="%x %X", + ) else: - logging.basicConfig(filename='opusdb_update_error.log', level=logging.ERROR, - format='%(asctime)s %(levelname)s:%(name)s: %(message)s', datefmt='%x %X') + logging.basicConfig( + filename="opusdb_update_error.log", + level=logging.ERROR, + format="%(asctime)s %(levelname)s:%(name)s: %(message)s", + datefmt="%x %X", + ) if not db_file: - db_file = os.path.join(os.path.dirname(__file__), 'opusdata.db') + db_file = os.path.join(os.path.dirname(__file__), "opusdata.db") con = sqlite3.connect(db_file) cur = con.cursor() + # Optional pragmas for faster bulk updates; tweak as you see fit + cur.execute("PRAGMA journal_mode=WAL;") + cur.execute("PRAGMA synchronous=NORMAL;") + cur.execute("PRAGMA temp_store=MEMORY;") + # Negative value means size in KB; here ~200MB cache + cur.execute("PRAGMA cache_size=-200000;") + create_table(cur) - URL_BASE = 'https://raw.githubusercontent.com/Helsinki-NLP/OPUS/main/corpus/' - index_info = read_url(URL_BASE + 'index-info.txt') + URL_BASE = "https://raw.githubusercontent.com/Helsinki-NLP/OPUS/main/corpus/" + index_info = read_url(URL_BASE + "index-info.txt") + + corpus = None + latest_v = None for info in index_info: - info_s = info.split('/') + info_s = info.split("/") if len(info_s) == 2: try: gen_info = read_url_yaml(URL_BASE + info, yaml) - except (scanner.ScannerError, urllib.error.HTTPError, reader.ReaderError) as e: - logger.error(f'{info}, {type(e).__name__}: {e}') + except ( + scanner.ScannerError, + urllib.error.HTTPError, + reader.ReaderError, + ) as e: + logger.error(f"{info}, {type(e).__name__}: {e}") gen_info = {} - corpus = gen_info.get('name') + corpus = gen_info.get("name") if not corpus: - logger.warning(f'{info}, corpus name missing') - print(f'Processing corpus {corpus}') - latest_v = gen_info.get('latest_release') + logger.warning(f"{info}, corpus name missing") + print(f"Processing corpus {corpus}") + latest_v = gen_info.get("latest_release") if not latest_v: - logger.error(f'{info}, latest_release missing') + logger.error(f"{info}, latest_release missing") elif len(info_s) == 3: version = info_s[1] if not corpus: corpus = info_s[0] - latest = 'False' + latest = "False" if version == latest_v: - latest = 'True' - stats = info.replace('info.yaml', 'statistics.yaml') + latest = "True" + stats = info.replace("info.yaml", "statistics.yaml") try: corpus_data = read_url_yaml(URL_BASE + stats, yaml) - except (scanner.ScannerError, urllib.error.HTTPError, reader.ReaderError) as e: - logger.error(f'{stats}, {type(e).__name__}: {e}') + except ( + scanner.ScannerError, + urllib.error.HTTPError, + reader.ReaderError, + ) as e: + logger.error(f"{stats}, {type(e).__name__}: {e}") continue - get_entries = {'bitexts': get_bitext_entries, - 'monolingual': 
get_monolingual_entries, - 'moses': get_moses_entries, - 'tmx': get_tmx_entries} + get_entries = { + "bitexts": get_bitext_entries, + "monolingual": get_monolingual_entries, + "moses": get_moses_entries, + "tmx": get_tmx_entries, + } if not corpus_data: - logger.error(f'{info}, corpus_data is empty') + logger.error(f"{info}, corpus_data is empty") continue for item in get_entries.keys(): @@ -239,7 +404,7 @@ def update_db(db_file=None, log_type='errors'): if sub_data: get_entries[item](corpus, version, latest, sub_data, cur, info) else: - logger.warning(f'{info}, {item} data missing') + logger.warning(f"{info}, {item} data missing") remove_missing_items(cur) From 5ff082615d8017fc381c6818a70f6cf88674ff5e Mon Sep 17 00:00:00 2001 From: Lukas Weymann <62622663+lukasweymann@users.noreply.github.com> Date: Tue, 9 Dec 2025 16:53:31 +0100 Subject: [PATCH 3/4] Create composite index idx_opusfile_6cols Added a composite index for multi-column lookups in the opusfile table. --- opustools_pkg/opustools/readopusdata.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/opustools_pkg/opustools/readopusdata.py b/opustools_pkg/opustools/readopusdata.py index cdda913..9510a98 100644 --- a/opustools_pkg/opustools/readopusdata.py +++ b/opustools_pkg/opustools/readopusdata.py @@ -50,11 +50,13 @@ def create_table(cur): "CREATE INDEX IF NOT EXISTS idx_opusfile_updated ON opusfile(updated)" ) - cur.execute( - \"\"\" - CREATE INDEX IF NOT EXISTS idx_opusfile_6cols - ON opusfile (source, target, corpus, preprocessing, version, url) - \"\"\" + # Composite index for multi-column lookups: + # WHERE source=? AND target=? AND corpus=? AND preprocessing=? AND version=? AND url=? + cur.execute( + """ + CREATE INDEX IF NOT EXISTS idx_opusfile_6cols + ON opusfile (source, target, corpus, preprocessing, version, url) + """ ) From fe49257d038183af8bfb7ee57868694ad1259673 Mon Sep 17 00:00:00 2001 From: Lukas Weymann <62622663+lukasweymann@users.noreply.github.com> Date: Tue, 9 Dec 2025 16:56:57 +0100 Subject: [PATCH 4/4] Fix string encoding and update SQL syntax --- opustools_pkg/opustools/readopusdata.py | 418 +++++++----------------- 1 file changed, 126 insertions(+), 292 deletions(-) diff --git a/opustools_pkg/opustools/readopusdata.py b/opustools_pkg/opustools/readopusdata.py index 9510a98..9512e70 100644 --- a/opustools_pkg/opustools/readopusdata.py +++ b/opustools_pkg/opustools/readopusdata.py @@ -11,394 +11,228 @@ def read_url(url): - return urllib.request.urlopen(url).read().decode("utf-8").split("\n") + return urllib.request.urlopen(url).read().decode('utf-8').split('\n') def read_url_yaml(url, yaml): try: - raw = urllib.request.urlopen(url).read().decode("utf-8") + raw = urllib.request.urlopen(url).read().decode('utf-8') except urllib.error.HTTPError: - gzbytes = urllib.request.urlopen(url + ".gz").read() - raw = gzip.decompress(gzbytes).decode("utf-8") + gzbytes = urllib.request.urlopen(url+'.gz').read() + raw = gzip.decompress(gzbytes).decode('utf-8') data = yaml.load(raw) return data def create_table(cur): - create_opusfile_table = """ - CREATE TABLE IF NOT EXISTS opusfile ( - id INTEGER PRIMARY KEY, - source TEXT, - target TEXT, - corpus TEXT, - preprocessing TEXT, - version TEXT, - url TEXT NOT NULL UNIQUE, - size INTEGER, - documents INTEGER, - alignment_pairs INTEGER, - source_tokens INTEGER, - target_tokens INTEGER, - latest TEXT, - updated INTEGER - ); - """ + create_opusfile_table = '''CREATE TABLE IF NOT EXISTS opusfile ( + id integer PRIMARY KEY, + source text, + target text, 
+ corpus text, + preprocessing text, + version text, + url text, + size integer, + documents integer, + alignment_pairs integer, + source_tokens integer, + target_tokens integer, + latest text, + updated integer + );''' cur.execute(create_opusfile_table) - - # Index to speed up DELETE ... WHERE updated=0 - cur.execute( - "CREATE INDEX IF NOT EXISTS idx_opusfile_updated ON opusfile(updated)" - ) - - # Composite index for multi-column lookups: - # WHERE source=? AND target=? AND corpus=? AND preprocessing=? AND version=? AND url=? - cur.execute( - """ - CREATE INDEX IF NOT EXISTS idx_opusfile_6cols - ON opusfile (source, target, corpus, preprocessing, version, url) - """ - ) + create_url_index = 'CREATE INDEX IF NOT EXISTS idx_url ON opusfile(url)' + cur.execute(create_url_index) + cur.execute('CREATE INDEX IF NOT EXISTS idx_corpusdata ON opusfile(source,target,corpus,preprocessing,latest)') def execute_sql(cur, opusfile): - """ - Insert or update a row based on the UNIQUE 'url' column. - - opusfile is a tuple: - (source, target, corpus, preprocessing, version, - url, size, documents, alignment_pairs, - source_tokens, target_tokens, latest) - """ - columns = [ - "source", - "target", - "corpus", - "preprocessing", - "version", - "url", - "size", - "documents", - "alignment_pairs", - "source_tokens", - "target_tokens", - "latest", - ] - - # UPSERT on url: if url already exists, update fields and set updated=1 - sql = f""" - INSERT INTO opusfile ({", ".join(columns)}, updated) - VALUES ({",".join(["?"] * (len(columns) + 1))}) - ON CONFLICT(url) DO UPDATE SET - source = excluded.source, - target = excluded.target, - corpus = excluded.corpus, - preprocessing = excluded.preprocessing, - version = excluded.version, - size = excluded.size, - documents = excluded.documents, - alignment_pairs = excluded.alignment_pairs, - source_tokens = excluded.source_tokens, - target_tokens = excluded.target_tokens, - latest = excluded.latest, - updated = 1 - """ - # Append updated=1 to the values - cur.execute(sql, (*opusfile, 1)) + columns = ['source', 'target', 'corpus', 'preprocessing', 'version', 'url', 'size', 'documents', 'alignment_pairs', 'source_tokens', 'target_tokens', 'latest'] + #wheres = [f'{columns[i]}="{opusfile[i]}"' for i in range(6)] + #sql = f'SELECT * FROM opusfile WHERE {" AND ".join(wheres)}' + sql = f'SELECT * FROM opusfile WHERE url="{opusfile[5]}"' + res = cur.execute(sql).fetchall() + if len(res) == 1: + sets = [f'{columns[i]}="{opusfile[i]}"' for i in range(6, len(columns))] + sql = f'UPDATE opusfile SET {", ".join(sets)}, updated=1 WHERE id={res[0][0]}' + cur.execute(sql) + elif len(res) == 0: + sql = f'INSERT INTO opusfile({", ".join(columns)}, updated) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,1)' + cur.execute(sql, opusfile) def get_lang_info(name, data, data_type, info): - source, target, documents, alignment_pairs, source_tokens, target_tokens = ( - "", - "", - "", - "", - "", - "", - ) + source, target, documents, alignment_pairs, source_tokens, target_tokens = '', '', '', '', '', '' source = name - if data_type in ["bitexts", "moses", "tmx"]: - names = name.split("-") + if data_type in ['bitexts', 'moses', 'tmx']: + names = name.split('-') if len(names) != 2: - logger.warning( - f'{info} {data_type} {name}: cannot split name "{name}" into two language codes' - ) + logger.warning(f'{info} {data_type} {name}: cannot split name "{name}" into two language codes') else: source, target = names - documents = "" - if data_type in ["bitexts", "monolingual"]: - documents = data.get("files", "") - 
if documents == "": + documents = '' + if data_type in ['bitexts', 'monolingual']: + documents = data.get('files', '') + if documents == '': logger.warning(f'{info} {data_type} {name} is missing "files"') - if data_type in ["bitexts", "moses"]: - alignment_pairs = data.get("alignments", "") - if alignment_pairs == "": + if data_type in ['bitexts', 'moses']: + alignment_pairs = data.get('alignments', '') + if alignment_pairs == '': logger.warning(f'{info} {data_type} {name} is missing "alignments"') - elif data_type == "tmx": - alignment_pairs = data.get("translation units", "") - if alignment_pairs == "": - logger.warning( - f'{info} {data_type} {name} is missing "translation units"' - ) - elif data_type == "monolingual": - alignment_pairs = data.get("sentences", "") - if alignment_pairs == "": + elif data_type == 'tmx': + alignment_pairs = data.get('translation units', '') + if alignment_pairs == '': + logger.warning(f'{info} {data_type} {name} is missing "translation units"') + elif data_type == 'monolingual': + alignment_pairs = data.get('sentences', '') + if alignment_pairs == '': logger.warning(f'{info} {data_type} {name} is missing "sentences"') - if data_type == "monolingual": - source_tokens = data.get("tokens", "") - if source_tokens == "": + if data_type == 'monolingual': + source_tokens = data.get('tokens', '') + if source_tokens == '': logger.warning(f'{info} {data_type} {name} is missing "tokens"') - target_tokens = "" + target_tokens = '' else: - source_tokens = data.get("source language tokens", "") - if source_tokens == "": - logger.warning( - f'{info} {data_type} {name} is missing "source language tokens"' - ) - target_tokens = data.get("target language tokens", "") - if target_tokens == "": - logger.warning( - f'{info} {data_type} {name} is missing "target language tokens"' - ) + source_tokens = data.get('source language tokens', '') + if source_tokens == '': + logger.warning(f'{info} {data_type} {name} is missing "source language tokens"') + target_tokens = data.get('target language tokens', '') + if target_tokens == '': + logger.warning(f'{info} {data_type} {name} is missing "target language tokens"') return source, target, documents, alignment_pairs, source_tokens, target_tokens def get_size_url_prep(name, data, data_type, info): - size, url, preprocessing = "", "", "" - if data_type in ["tmx", "moses"]: - size = data.get("download size", "") - if size == "": - logger.warning( - f'{info} {data_type} {name} is missing "download size"' - ) + size, url, preprocessing = '','','' + if data_type in ['tmx', 'moses']: + size = data.get('download size', '') + if size == '': + logger.warning(f'{info} {data_type} {name} is missing "download size"') else: - size = int(int(size) / 1024) - url = data.get("download url", "") - if url == "": - logger.warning( - f'{info} {data_type} {name} is missing "download url"' - ) - elif data_type in ["bitexts", "monolingual"]: - size = data.get("size", "") - if size == "": + size = int(int(size)/1024) + url = data.get('download url', '') + if url == '': + logger.warning(f'{info} {data_type} {name} is missing "download url"') + elif data_type in ['bitexts', 'monolingual']: + size = data.get('size', '') + if size == '': logger.warning(f'{info} {data_type} {name} is missing "size"') else: - size = int(int(size) / 1024) - url = data.get("url", "") - if url == "": + size = int(int(size)/1024) + url = data.get('url', '') + if url == '': logger.warning(f'{info} {data_type} {name} is missing "url"') - pre_step = url.split("/") + pre_step = 
url.split('/') if len(pre_step) < 2: - logger.warning( - f'{info} {data_type} {name}: cannot find preprocessing from url "{url}"' - ) + logger.warning(f'{info} {data_type} {name}: cannot find preprocessing from url "{url}"') else: preprocessing = pre_step[-2] return size, url, preprocessing - def get_tmx_entries(corpus, version, latest, tmx, cur, info): for item in tmx: - source, target, documents, alignment_pairs, source_tokens, target_tokens = ( - get_lang_info(item, tmx[item], "tmx", info) - ) - size, url, preprocessing = get_size_url_prep( - item, tmx[item], "tmx", info - ) - opusfile = ( - source, - target, - corpus, - preprocessing, - version, - url, - size, - documents, - alignment_pairs, - source_tokens, - target_tokens, - latest, - ) + source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, tmx[item], 'tmx', info) + size, url, preprocessing = get_size_url_prep(item, tmx[item], 'tmx', info) + opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) execute_sql(cur, opusfile) def get_moses_entries(corpus, version, latest, moses, cur, info): for item in moses: - source, target, documents, alignment_pairs, source_tokens, target_tokens = ( - get_lang_info(item, moses[item], "moses", info) - ) - size, url, preprocessing = get_size_url_prep( - item, moses[item], "moses", info - ) - opusfile = ( - source, - target, - corpus, - preprocessing, - version, - url, - size, - documents, - alignment_pairs, - source_tokens, - target_tokens, - latest, - ) + source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, moses[item], 'moses', info) + size, url, preprocessing = get_size_url_prep(item, moses[item], 'moses', info) + opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) execute_sql(cur, opusfile) -def get_monolingual_entries( - corpus, version, latest, monolingual, cur, info -): +def get_monolingual_entries(corpus, version, latest, monolingual, cur, info): for item in monolingual: - source, target, documents, alignment_pairs, source_tokens, target_tokens = ( - get_lang_info(item, monolingual[item], "monolingual", info) - ) - for entry in monolingual[item]["downloads"].items(): - size, url, preprocessing = get_size_url_prep( - item, entry[1], "monolingual", info - ) - opusfile = ( - source, - target, - corpus, - preprocessing, - version, - url, - size, - documents, - alignment_pairs, - source_tokens, - target_tokens, - latest, - ) + source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, monolingual[item], 'monolingual', info) + for entry in monolingual[item]['downloads'].items(): + size, url, preprocessing = get_size_url_prep(item, entry[1], 'monolingual', info) + opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) execute_sql(cur, opusfile) def get_bitext_entries(corpus, version, latest, bitexts, cur, info): for item in bitexts: - source, target, documents, alignment_pairs, source_tokens, target_tokens = ( - get_lang_info(item, bitexts[item], "bitexts", info) - ) - for entry in bitexts[item]["downloads"].items(): + source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, bitexts[item], 'bitexts', info) + for entry in bitexts[item]['downloads'].items(): # exclude monolingual files, they are added in 
the monolingual phase - if "language" not in entry[0]: - size, url, preprocessing = get_size_url_prep( - item, entry[1], "bitexts", info - ) - opusfile = ( - source, - target, - corpus, - preprocessing, - version, - url, - size, - documents, - alignment_pairs, - source_tokens, - target_tokens, - latest, - ) + if 'language' not in entry[0]: + size, url, preprocessing = get_size_url_prep(item, entry[1], 'bitexts', info) + opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) execute_sql(cur, opusfile) def remove_missing_items(cur): - # First remove rows not touched in this run - sql = "DELETE FROM opusfile WHERE updated=0" + sql = 'DELETE FROM opusfile WHERE updated=0' cur.execute(sql) - # Then mark all remaining rows as not updated, ready for the next run - sql = "UPDATE opusfile SET updated=0" + sql = 'UPDATE opusfile SET updated=0' cur.execute(sql) -def update_db(db_file=None, log_type="errors"): +def update_db(db_file=None, log_type='errors'): yaml = YAML() - if log_type == "warnings": - logging.basicConfig( - filename="opusdb_update_error.log", - level=logging.WARNING, - format="%(asctime)s %(levelname)s:%(name)s: %(message)s", - datefmt="%x %X", - ) + if log_type == 'warnings': + logging.basicConfig(filename='opusdb_update_error.log', level=logging.WARNING, + format='%(asctime)s %(levelname)s:%(name)s: %(message)s', datefmt='%x %X') else: - logging.basicConfig( - filename="opusdb_update_error.log", - level=logging.ERROR, - format="%(asctime)s %(levelname)s:%(name)s: %(message)s", - datefmt="%x %X", - ) + logging.basicConfig(filename='opusdb_update_error.log', level=logging.ERROR, + format='%(asctime)s %(levelname)s:%(name)s: %(message)s', datefmt='%x %X') if not db_file: - db_file = os.path.join(os.path.dirname(__file__), "opusdata.db") + db_file = os.path.join(os.path.dirname(__file__), 'opusdata.db') con = sqlite3.connect(db_file) cur = con.cursor() - # Optional pragmas for faster bulk updates; tweak as you see fit - cur.execute("PRAGMA journal_mode=WAL;") - cur.execute("PRAGMA synchronous=NORMAL;") - cur.execute("PRAGMA temp_store=MEMORY;") - # Negative value means size in KB; here ~200MB cache - cur.execute("PRAGMA cache_size=-200000;") - create_table(cur) - URL_BASE = "https://raw.githubusercontent.com/Helsinki-NLP/OPUS/main/corpus/" - index_info = read_url(URL_BASE + "index-info.txt") - - corpus = None - latest_v = None + URL_BASE = 'https://raw.githubusercontent.com/Helsinki-NLP/OPUS/main/corpus/' + index_info = read_url(URL_BASE + 'index-info.txt') for info in index_info: - info_s = info.split("/") + info_s = info.split('/') if len(info_s) == 2: try: gen_info = read_url_yaml(URL_BASE + info, yaml) - except ( - scanner.ScannerError, - urllib.error.HTTPError, - reader.ReaderError, - ) as e: - logger.error(f"{info}, {type(e).__name__}: {e}") + except (scanner.ScannerError, urllib.error.HTTPError, reader.ReaderError) as e: + logger.error(f'{info}, {type(e).__name__}: {e}') gen_info = {} - corpus = gen_info.get("name") + corpus = gen_info.get('name') if not corpus: - logger.warning(f"{info}, corpus name missing") - print(f"Processing corpus {corpus}") - latest_v = gen_info.get("latest_release") + logger.warning(f'{info}, corpus name missing') + print(f'Processing corpus {corpus}') + latest_v = gen_info.get('latest_release') if not latest_v: - logger.error(f"{info}, latest_release missing") + logger.error(f'{info}, latest_release missing') elif len(info_s) == 3: version = info_s[1] if not corpus: 
corpus = info_s[0] - latest = "False" + latest = 'False' if version == latest_v: - latest = "True" - stats = info.replace("info.yaml", "statistics.yaml") + latest = 'True' + stats = info.replace('info.yaml', 'statistics.yaml') try: corpus_data = read_url_yaml(URL_BASE + stats, yaml) - except ( - scanner.ScannerError, - urllib.error.HTTPError, - reader.ReaderError, - ) as e: - logger.error(f"{stats}, {type(e).__name__}: {e}") + except (scanner.ScannerError, urllib.error.HTTPError, reader.ReaderError) as e: + logger.error(f'{stats}, {type(e).__name__}: {e}') continue - get_entries = { - "bitexts": get_bitext_entries, - "monolingual": get_monolingual_entries, - "moses": get_moses_entries, - "tmx": get_tmx_entries, - } + get_entries = {'bitexts': get_bitext_entries, + 'monolingual': get_monolingual_entries, + 'moses': get_moses_entries, + 'tmx': get_tmx_entries} if not corpus_data: - logger.error(f"{info}, corpus_data is empty") + logger.error(f'{info}, corpus_data is empty') continue for item in get_entries.keys(): @@ -406,7 +240,7 @@ def update_db(db_file=None, log_type="errors"): if sub_data: get_entries[item](corpus, version, latest, sub_data, cur, info) else: - logger.warning(f"{info}, {item} data missing") + logger.warning(f'{info}, {item} data missing') remove_missing_items(cur)
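
For reference, a minimal usage sketch of the DbOperations class as refactored in PATCH 1/4 (context manager, shared connection, parameterized query helpers). This is illustrative only and not part of the patches: the import path opustools.db_operations and ensure_indexes() come from PATCH 1/4, while the database path "opusdata.db" and the example parameter values ("en", "de", "Books") are placeholders.

    # Illustrative sketch, assuming the refactored DbOperations from PATCH 1/4.
    # The db_file path and the parameter values below are placeholders.
    from opustools.db_operations import DbOperations

    with DbOperations(db_file="opusdata.db") as db:
        # Optional one-time step: create the indexes matching the query patterns.
        db.ensure_indexes()

        # All helpers bind values through "?" placeholders via _build_where().
        corpora = db.run_corpora_query(
            {"source": "en", "target": "de", "version": "latest"}
        )
        languages = db.run_languages_query({"source": "en"})
        files = db.get_corpora(
            {"corpus": "Books", "source": "en", "target": "de",
             "preprocessing": "xml", "version": "latest"}
        )

    print(corpora[:5], languages[:5], len(files))

The connection is opened once in __init__ and closed by __exit__, so callers no longer pay the per-query connect/close cost of the old run_query().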