diff --git a/README.md b/README.md index 035133c..76fcc5a 100644 --- a/README.md +++ b/README.md @@ -100,11 +100,11 @@ This generates `summary.html` and `summary.json` in the same `output_folder`. If you are summarizing many reports and have used the `--differences` flag while generating them, it may be useful to limit the number of differences reported by using -the `--occurences-threshold` parameter. This will reduce the size of the generated HTML +the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML report, making it easier to open and navigate. Note that the JSON report will still contain all differences. Example: ~~~ -dinglehopper-summarize output_folder/ --occurences-threshold 10 +dinglehopper-summarize output_folder/ --occurrences-threshold 10 ~~~ ### dinglehopper-line-dirs diff --git a/src/dinglehopper/extracted_text.py b/src/dinglehopper/extracted_text.py index c7bcba7..6dcf0a7 100644 --- a/src/dinglehopper/extracted_text.py +++ b/src/dinglehopper/extracted_text.py @@ -329,7 +329,7 @@ def get_attr(te: Any, attr_name: str) -> float: """Extract the attribute for the given name. Note: currently only handles numeric values! - Other or non existend values are encoded as np.nan. + Other or non existent values are encoded as np.nan. """ attr_value = te.attrib.get(attr_name) try: diff --git a/src/dinglehopper/notebooks/Levenshtein.ipynb b/src/dinglehopper/notebooks/Levenshtein.ipynb index 876bee3..b9671d7 100644 --- a/src/dinglehopper/notebooks/Levenshtein.ipynb +++ b/src/dinglehopper/notebooks/Levenshtein.ipynb @@ -391,7 +391,7 @@ "\\text{CER} = \\frac{i + s + d}{n}\n", "$$\n", "\n", - "where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropiate as they *are* clear about this when computing the word error rate.)" + "where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropriate as they *are* clear about this when computing the word error rate.)" ] }, { @@ -680,7 +680,7 @@ " return cat in unwanted_categories or subcat in unwanted_subcategories\n", "\n", " # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using\n", - " # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctation \"or similar characters.\"\n", + " # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctuation \"or similar characters.\"\n", " for word in uniseg.wordbreak.words(s):\n", " if all(unwanted(c) for c in word):\n", " pass\n", diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py index b759a69..578850f 100644 --- a/src/dinglehopper/word_error_rate.py +++ b/src/dinglehopper/word_error_rate.py @@ -54,7 +54,7 @@ def unwanted(c): # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on # word boundaries using uniseg.wordbreak.words() and ignore all "words" that contain - # only whitespace, punctation "or similar characters." + # only whitespace, punctuation "or similar characters." for word in uniseg.wordbreak.words(s): if all(unwanted(c) for c in word): pass