diff --git a/CC-CEDICT converted/CC-CEDICT_20171028-1.2.xdxf.zip b/CC-CEDICT converted/CC-CEDICT_20171028-1.2.xdxf.zip new file mode 100644 index 0000000..427ae00 Binary files /dev/null and b/CC-CEDICT converted/CC-CEDICT_20171028-1.2.xdxf.zip differ diff --git a/CHANGELOG.md b/CHANGELOG.md index bf10e9b..26f28ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,14 @@ # Change Log -## v. 1.0 -First Version +## v. 1.2 +* Convert three dots (...) to the actual ellipsis character (…) to avoid mistaking some words as abbreviations. +* Added abbreviations *sb* (somebody) and *sth* (something). +* Changed default output filename to `CC-CEDICT_publishingdate-converterversion`, hopefully making it clearer. ## v. 1.1 * Added arguments to define input and output files. * Added progress bars. * Added option of automatically downloading the most recent release of CC-CEDICT instead of using a pre-downloaded file. + +## v. 1.0 +First Version diff --git a/README.md b/README.md index dfcb809..995fa5f 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,9 @@ CedictXML is simple tool written in Python to convert an original [CC-CEDICT](https://www.mdbg.net/chindict/chindict.php?page=cc-cedict) file to a XML dictionary file in the logical [XDXF format](https://github.com/soshial/xdxf_makedict/blob/master/format_standard/xdxf_description.md), which can be used with dictionary software that support this format. 
## Screenshot -![Screenshot of XDXF CC-CEDICT running on GoldenDict 1.5](https://github.com/k-sl/CedictXML/blob/master/images/screenshot.png) +![Screenshot of XDXF CC-CEDICT opened on GoldenDict 1.5](https://github.com/k-sl/CedictXML/blob/master/images/screenshot.png) -Screenshot of XDXF CC-CEDICT running on [GoldenDict](https://github.com/goldendict/goldendict) 1.5 +Screenshot of XDXF CC-CEDICT opened on [GoldenDict](https://github.com/goldendict/goldendict) 1.5 ## Dependencies * _pinyin.py_ from the [pycedict](https://github.com/jdillworth/pycedict/) library diff --git a/TODO.md b/TODO.md index aeb5bed..613f21e 100644 --- a/TODO.md +++ b/TODO.md @@ -7,6 +7,6 @@ * Add support for Pleco's [CC-Canto](http://cantonese.org/) Cantonese dictionary (as an optional addition). * Add support for [Pleco's Cantonese readings](http://cantonese.org/download.html) (as an optional addition). (Possibly confusing in the current status of the XDXF standand.) * Make the abbreviations list of tuples an external file (possibly unnecessary). -* Add not yet recognized abbreviations: "sb", "hon". +* Add not yet recognized abbreviations such as "hon". * Recognize two internal references in the format "See also: " * Decide and implement a format for internal references. E.g. 漢字|汉子 hanzi (if alternative writing systems and transliterations aren't added to the XDXF format.) 
diff --git a/XDXF CC-CEDICT/CC-CEDICT_20171028-1.2.xdxf.zip b/XDXF CC-CEDICT/CC-CEDICT_20171028-1.2.xdxf.zip new file mode 100644 index 0000000..427ae00 Binary files /dev/null and b/XDXF CC-CEDICT/CC-CEDICT_20171028-1.2.xdxf.zip differ diff --git a/cedictxml.py b/cedictxml.py index 464c200..99c1d19 100644 --- a/cedictxml.py +++ b/cedictxml.py @@ -7,7 +7,6 @@ import argparse import zipfile import urllib2 -import zipfile import tempfile from tqdm import tqdm @@ -15,7 +14,7 @@ from pinyin import pinyinize -version = "1.1" +version = "1.2" dictionaryname = "CC-CEDICT" currenttime = time.strftime("%d-%m-%Y %H:%M:%S", time.localtime()) dtd_url = "https://raw.github.com/soshial/xdxf_makedict/master/format_standard/xdxf_strict.dtd" @@ -178,6 +177,8 @@ def dictconvert(dictionaryfile): ("Taiwan pr. [" + entry_taiwan + "]", "")) entry_taiwan = pyjoin(entry_taiwan) + # Correct three dots to ellipsis. + entry_translation = entry_translation.replace(u"...", u"…") # Correct the pinyin and separate the different translations # into a list. entry_translation = bracketpy(entry_translation) @@ -208,7 +209,7 @@ def dictconvert(dictionaryfile): publishing_date_xdxf = (publishing_date[8:] + "-" + publishing_date[5:7] + "-" + publishing_date[:5]) global dictionary_version - dictionary_version = "1." 
+ publishing_date.replace("-","") + dictionary_version = publishing_date.replace("-","") + "-" + version return cedict_dict def createxdxf(dictionary): @@ -255,7 +256,8 @@ def createxdxf(dictionary): ("telecom.", "telecommunications", "knl"), ("trad.", "traditional(ly)","stl"), ("translit.", "transliteration", "aux"), ("usu.", "usually", "aux"), ("zool.", "zoology", - "knl"), ("zoolog.", "zoology", "knl")] + "knl"), ("zoolog.", "zoology", "knl"), ("sth", "something", + "aux"), ("sb", "somebody", "aux")] abbrlist = [] for tupple in abbreviations: abbrlist.append(tupple[0]) @@ -329,7 +331,7 @@ def createxdxf(dictionary): lexicon_ar_def_def = ET.SubElement(lexicon_ar_def, "def") # Recognize the abbreviations. for abbreviation in abbrlist: - abbreviation_re = r"\b(" + re.escape(abbreviation) + r")\W" + abbreviation_re = r"\b(" + re.escape(abbreviation) + r")\W|\b(" + re.escape(abbreviation) + r")$" if len(re.findall(abbreviation_re,translation)) > 0: translation = (translation. replace(abbreviation, "_lt_abbr_mt_" +