From 35bf42b83b5911e7778e3ed0d0837dc9d12537f9 Mon Sep 17 00:00:00 2001 From: David Roher Date: Tue, 5 Dec 2023 09:10:17 -0500 Subject: [PATCH] Dec 2023 Update --- .gitignore | 4 ++++ README.md | 4 ++-- main.py | 7 ++++--- requirements.txt | 6 +++--- templates.py | 2 +- 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 894a44c..30be760 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,7 @@ venv.bak/ # mypy .mypy_cache/ + +etymology.csv +etymology.csv.gz +etymology.parquet \ No newline at end of file diff --git a/README.md b/README.md index c40eec1..dbea2b8 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # etymology-db -**Downloads:** (Last generated 2021-11-14) +**Downloads:** (Last generated 2023-12-05) [**Gzipped CSV**](https://1drv.ms/u/s!AtpEocFNRNBWhAe7co0JFvac-OfA?e=wnJe4r) [**Parquet**](https://1drv.ms/u/s!AtpEocFNRNBWhhP6w5D9XfdtPH9I?e=jWRwnI) A structured, comprehensive, and multilingual etymology dataset created by parsing Wiktionary's etymology sections. Key features: -* 3.8+ million etymological relationships between 1.8+ million terms in 2900+ languages/dialects +* 4.2+ million etymological relationships between 2.0+ million terms in 3300+ languages/dialects * 31 different types of etymological relations, distinguishing between inheritance, borrowing, etc. * Hierarchical data that preserves relationship structures, such as the evolution of a term across languages diff --git a/main.py b/main.py index e906fb4..eedea4c 100644 --- a/main.py +++ b/main.py @@ -3,7 +3,7 @@ import csv import logging import re -from multiprocessing import Pool +from multiprocessing import Pool, freeze_support from datetime import datetime, timedelta from pathlib import Path from typing import Generator, List, Tuple @@ -64,8 +64,9 @@ def write_all(): elapsed = (datetime.now() - time) if elapsed.total_seconds() > 1: elapsed -= timedelta(microseconds=elapsed.microseconds) - print(f"Entries parsed: {entries_parsed} Time elapsed: {elapsed} " - f"Entries per second: {entries_parsed // elapsed.total_seconds()}{' ' * 10}", end="\r", flush=True) + if entries_parsed % 1000 == 0: + print(f"Entries parsed: {entries_parsed} Time elapsed: {elapsed} " + f"Entries per second: {entries_parsed // elapsed.total_seconds()}{' ' * 10}", end="\r", flush=True) def stream_terms() -> Generator[Tuple[str, str], None, None]: diff --git a/requirements.txt b/requirements.txt index edc07d1..44dda22 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -lxml==4.9.1 -requests==2.26.0 -mwparserfromhell==0.6.3 +lxml==4.9.3 +requests==2.31.0 +mwparserfromhell==0.6.5 diff --git a/templates.py b/templates.py index 8e1b32f..e2a4bb5 100644 --- a/templates.py +++ b/templates.py @@ -8,7 +8,7 @@ from elements import Etymology -unparsed_templates = Manager().dict() +unparsed_templates = dict() class RelType(Enum): Inherited = "inherited_from"