Code cleanup, Glottolog 4.0, Travis, release 1.1.
chrzyki committed Jul 4, 2019
1 parent e13a6e5 commit fdcdbf8
Showing 8 changed files with 70 additions and 84 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -15,7 +15,7 @@ This dataset comprises 25 Hmong-Mien varieties, which were originally digitized
## Statistics


-[![Build Status](https://travis-ci.org/None.svg?branch=master)](https://travis-ci.org/None)
+[![Build Status](https://travis-ci.org/lexibank/chenhmongmien.svg?branch=master)](https://travis-ci.org/lexibank/chenhmongmien)
![Glottolog: 92%](https://img.shields.io/badge/Glottolog-92%25-green.svg "Glottolog: 92%")
![Concepticon: 89%](https://img.shields.io/badge/Concepticon-89%25-yellowgreen.svg "Concepticon: 89%")
![Source: 100%](https://img.shields.io/badge/Source-100%25-brightgreen.svg "Source: 100%")
@@ -27,7 +27,7 @@ This dataset comprises 25 Hmong-Mien varieties, which were originally digitized
- **Lexemes:** 21,617
- **Synonymy:** 1.01
- **Invalid lexemes:** 0
-- **Tokens:** 115,375
+- **Tokens:** 115,373
- **Segments:** 212 (0 BIPA errors, 0 CLTS sound class errors, 212 CLTS modified)
- **Inventory size (avg):** 67.16

4 changes: 2 additions & 2 deletions TRANSCRIPTION.md
@@ -5,7 +5,7 @@

| Segment | Occurrence | BIPA | CLTS SoundClass |
|:----------|-------------:|:-------|:------------------|
-| + | 9578 |||
+| + | 9577 |||
| a | 7464 |||
| ŋ | 5169 |||
| ³³ | 3929 |||
@@ -23,7 +23,7 @@
| k | 2512 |||
| ⁵³ | 2470 |||
| p | 2274 |||
-| ¹³ | 2214 |||
+| ¹³ | 2213 |||
| l | 1895 |||
| ʔ | 1825 |||
| ə | 1672 |||
8 changes: 6 additions & 2 deletions cldf/cldf-metadata.json
@@ -1,9 +1,13 @@
{
"@context": "http://www.w3.org/ns/csvw",
"aboutUrl": null,
"dc:bibliographicCitation": "Chen, Qiguang (2013) : Miao and Yao language. Beijing: Ethnic Publishing House",
"dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#Wordlist",
"dc:description": null,
"dc:identifier": "",
"dc:isVersionOf": null,
"dc:license": "https://creativecommons.org/licenses/by-nc/4.0/",
"dc:related": null,
"dc:source": "sources.bib",
"dc:title": "Hmong-Mien Varieties",
"rdf:ID": "chenhmongmien",
@@ -15,8 +19,8 @@
{
"dc:title": "environment",
"properties": {
-"glottolog_version": "v3.4-1-g07a9b54",
-"concepticon_version": "pyconcepticon-1.4.0-206-g1ad282b"
+"glottolog_version": "v4.0",
+"concepticon_version": "v2.0"
}
}
],
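Since the metadata's `dc:conformsTo` declares the CLDF Wordlist module, the released data can be loaded programmatically. A minimal sketch using pycldf, the reference CLDF reader (assumed to be installed; it is not among this package's own dependencies):

```python
# Load the wordlist via its metadata file and inspect a few properties.
from pycldf import Wordlist

ds = Wordlist.from_metadata("cldf/cldf-metadata.json")
print(ds.properties["dc:title"])        # Hmong-Mien Varieties
print(sum(1 for _ in ds["FormTable"]))  # 21,617 lexemes per the README
```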
1 change: 0 additions & 1 deletion etc/orthography.tsv
@@ -473,7 +473,6 @@ z z
⁵$
⁵² ⁵² +
⁵²$ ⁵²
-³⁵$ ³⁵
⁵³ ⁵³ +
⁵³$ ⁵³
⁵¹ ⁵¹ +
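For context on how this profile is used: lexibank tokenizes raw forms against etc/orthography.tsv with the segments package (pinned in setup.py below), and the `$`-suffixed graphemes match form-final position, which is why the commented-out tokenize hook in the conversion script wraps forms as `'^'+y+'$'`. A rough sketch, where the example form and the `column="IPA"` mapping are assumptions:

```python
# Tokenize a form with the orthography profile; the exact output depends
# on the profile rows, so treat this as illustrative only.
from segments import Profile, Tokenizer

tokenizer = Tokenizer(profile=Profile.from_file("etc/orthography.tsv"))
print(tokenizer("^pa⁵²$", column="IPA"))  # hypothetical form, e.g. "p a ⁵²"
```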
122 changes: 56 additions & 66 deletions lexibank_chenhmongmien.py
@@ -1,119 +1,109 @@
-# coding=utf-8
-from __future__ import unicode_literals, print_function
+import csv

-from clldutils.path import Path
-from pylexibank.dataset import NonSplittingDataset
+import requests
+from bs4 import BeautifulSoup
from clldutils.misc import slug
+from clldutils.path import Path
from clldutils.text import strip_brackets, split_text
+
+from pylexibank.dataset import NonSplittingDataset
from tqdm import tqdm
from collections import defaultdict
-import re
-import csv
-import lingpy


class Dataset(NonSplittingDataset):
    dir = Path(__file__).parent
    id = "chenhmongmien"

    def cmd_download(self, **kw):
-        import requests
-        import csv
-        from bs4 import BeautifulSoup
-
-        wp = requests.get('https://en.wiktionary.org/wiki/Appendix:Hmong-Mien_comparative_vocabulary_list')
+        wp = requests.get(
+            "https://en.wiktionary.org/wiki/Appendix:Hmong-Mien_comparative_vocabulary_list"
+        )
        soup = BeautifulSoup(wp.content, "html.parser")

-        language_table_header, language_table =[],[]
-        languages = soup.findAll("table", {'class': 'wikitable sortable'})[0]
-        for lh in languages.findAll('th'):
-            language_table_header.append(lh.get_text().rstrip('\n'))
+        language_table_header, language_table = [], []
+        languages = soup.findAll("table", {"class": "wikitable sortable"})[0]
+        for lh in languages.findAll("th"):
+            language_table_header.append(lh.get_text().rstrip("\n"))

        for r in languages.findAll("tr"):
            temp = []
-            for cell in r.findAll('td'):
-                temp.append(cell.get_text().rstrip('\n'))
+            for cell in r.findAll("td"):
+                temp.append(cell.get_text().rstrip("\n"))
            language_table.append(temp)

-        language_table =[x for x in language_table if x!=[]]
+        language_table = [x for x in language_table if x != []]

-        vob_table_header, vob_table =[], []
-        vob = soup.findAll("table", {'class' : 'wikitable sortable'})[1]
-        for vh in vob.findAll('th'):
-            vob_table_header.append(vh.get_text().rstrip('\n'))
+        vob_table_header, vob_table = [], []
+        vob = soup.findAll("table", {"class": "wikitable sortable"})[1]
+        for vh in vob.findAll("th"):
+            vob_table_header.append(vh.get_text().rstrip("\n"))

-        for v in vob.findAll('tr'):
+        for v in vob.findAll("tr"):
            vtemp = []
-            for vcell in v.findAll('td'):
-                vtemp.append(vcell.get_text().rstrip('\n'))
+            for vcell in v.findAll("td"):
+                vtemp.append(vcell.get_text().rstrip("\n"))
            vob_table.append(vtemp)

-        vob_table =[x for x in vob_table if x!=[]]
+        vob_table = [x for x in vob_table if x != []]

-        with open(self.dir.joinpath('raw', 'languages.csv').as_posix(),'w',newline='') as lw:
-            languagewriter = csv.writer(lw, delimiter=',', quotechar='"')
+        with open(self.dir.joinpath("raw", "languages.csv").as_posix(), "w", newline="") as lw:
+            languagewriter = csv.writer(lw, delimiter=",", quotechar='"')
            languagewriter.writerow(language_table_header)
            languagewriter.writerows(language_table)
            lw.close()

-        with open(self.dir.joinpath('raw', 'raw.csv').as_posix(),'w',newline='') as vw:
-            vocabwriter = csv.writer(vw, delimiter=',', quotechar='"')
+        with open(self.dir.joinpath("raw", "raw.csv").as_posix(), "w", newline="") as vw:
+            vocabwriter = csv.writer(vw, delimiter=",", quotechar='"')
            vocabwriter.writerow(vob_table_header)
            vocabwriter.writerows(vob_table)
            vw.close()

    def clean_form(self, item, form):
-        if form not in ['*', '---', '-']:
-            form = strip_brackets(split_text(form, separators=';,/')[0])
-            return form.replace(' ', '_')
+        if form not in ["*", "---", "-"]:
+            form = strip_brackets(split_text(form, separators=";,/")[0])
+            return form.replace(" ", "_")

    def cmd_install(self, **kw):
        """
        Convert the raw data to a CLDF dataset.
        """

-        with open(self.dir.joinpath('raw','raw.csv').as_posix(),'r') as csvfile:
+        with open(self.dir.joinpath("raw", "raw.csv").as_posix(), "r") as csvfile:
            reader = csv.DictReader(csvfile, delimiter=",", quotechar='"')
            data = [row for row in reader]
        languages, concepts = [], {}
        missing = defaultdict(int)
        with self.cldf as ds:
-            #self.cldf.tokenize = lambda x, y: self.tokenizer(x, '^'+y+'$',
+            # self.cldf.tokenize = lambda x, y: self.tokenizer(x, '^'+y+'$',
            #     column='IPA')

            for concept in self.concepts:
                ds.add_concept(
-                    ID=concept['NUMBER'],
-                    Name=concept['GLOSS'],
-                    Concepticon_ID=concept['CONCEPTICON_ID'],
-                    Concepticon_Gloss=concept['CONCEPTICON_GLOSS']
-                )
-                concepts[concept['GLOSS']]=concept['NUMBER']
+                    ID=concept["NUMBER"],
+                    Name=concept["GLOSS"],
+                    Concepticon_ID=concept["CONCEPTICON_ID"],
+                    Concepticon_Gloss=concept["CONCEPTICON_GLOSS"],
+                )
+                concepts[concept["GLOSS"]] = concept["NUMBER"]

            for language in self.languages:
                ds.add_language(
-                    ID=slug(language['Language_name']),
-                    Glottocode=language['Glottolog_code'],
-                    Name=language['Language_name']
-                )
-                languages.append(language['Language_name'])
+                    ID=slug(language["Language_name"]),
+                    Glottocode=language["Glottolog_code"],
+                    Name=language["Language_name"],
+                )
+                languages.append(language["Language_name"])

            ds.add_sources(*self.raw.read_bib())
-            missing={}
-            for cgloss, entry in tqdm(enumerate(data), desc='cldfify the data',
-                                      total=len(data)):
-                if entry['Chinese gloss'] in concepts.keys():
-                    for language in languages:
-                        value = self.lexemes.get(entry[language],
-                                                 entry[language])
-                        if value.strip():
-                            ds.add_lexemes(
-                                Language_ID = slug(language),
-                                Parameter_ID = concepts[
-                                    entry['Chinese gloss']],
-                                Value = value,
-                                Source=['Chen2013']
-                            )
+            missing = {}
+            for cgloss, entry in tqdm(enumerate(data), desc="cldfify the data", total=len(data)):
+                if entry["Chinese gloss"] in concepts.keys():
+                    for language in languages:
+                        value = self.lexemes.get(entry[language], entry[language])
+                        if value.strip():
+                            ds.add_lexemes(
+                                Language_ID=slug(language),
+                                Parameter_ID=concepts[entry["Chinese gloss"]],
+                                Value=value,
+                                Source=["Chen2013"],
+                            )
                else:
-                    missing[entry['Chinese gloss']] +=1
+                    missing[entry["Chinese gloss"]] += 1
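One behavioural detail survives the cleanup: `missing` is bound to `defaultdict(int)` near the top of `cmd_install` but rebound to a plain `{}` right after `ds.add_sources(...)`, so `missing[entry["Chinese gloss"]] += 1` would raise `KeyError` on the first unmatched gloss. A minimal sketch of the intended counting, with hypothetical data and the defaultdict kept:

```python
# defaultdict(int) lets the first `+= 1` on an unseen key start from 0
# instead of raising KeyError, as a plain dict would.
from collections import defaultdict

concepts = {"water": "1", "fire": "2"}           # hypothetical gloss -> ID map
missing = defaultdict(int)
for gloss in ["water", "mountain", "mountain"]:  # hypothetical input glosses
    if gloss not in concepts:
        missing[gloss] += 1

assert dict(missing) == {"mountain": 2}
```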
1 change: 0 additions & 1 deletion setup.cfg
@@ -3,4 +3,3 @@ testpaths = test.py
mock_use_standalone_module = true
addopts =
    --cldf-metadata=cldf/cldf-metadata.json
-
9 changes: 4 additions & 5 deletions setup.py
@@ -1,10 +1,8 @@
from setuptools import setup
-import sys
import json


-PY2 = sys.version_info.major == 2
-with open('metadata.json', **({} if PY2 else {'encoding': 'utf-8'})) as fp:
+with open('metadata.json') as fp:
    metadata = json.load(fp)


@@ -23,8 +21,8 @@
        ]
    },
    install_requires=[
-        'pylexibank>=1.1.1',
-        'beautifulsoup4>=4.6'
+        'pylexibank==1.1.1',
+        'beautifulsoup4==4.7.1',
+        'segments==2.0.2'
    ],
    extras_require={
        'test': [
5 changes: 0 additions & 5 deletions test.py
@@ -1,7 +1,2 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-
def test_valid(cldf_dataset, cldf_logger):
    assert cldf_dataset.validate(log=cldf_logger)
-
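The two remaining lines depend on the `cldf_dataset` and `cldf_logger` fixtures that the pytest plugin configured in setup.cfg injects, driven by the `--cldf-metadata` option. An equivalent standalone check, as a rough sketch using pycldf:

```python
# Validate the built CLDF dataset directly, without the pytest fixtures.
from pycldf import Dataset

ds = Dataset.from_metadata("cldf/cldf-metadata.json")
assert ds.validate()
```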