Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.18.27-dev7

### Fixes
- Comment no-ops in `zoom_image` (codeflash)

## 0.18.27-dev4

### Fixes
Expand Down
16 changes: 16 additions & 0 deletions test_unstructured/cleaners/test_core.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import re
import sys
import unicodedata

import pytest

Expand Down Expand Up @@ -300,3 +302,17 @@ def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punc
def test_bytes_string_to_string():
    """A string of escaped UTF-8 byte values is converted to the expected Chinese text."""
    raw_text = "\xe6\xaf\x8f\xe6\x97\xa5\xe6\x96\xb0\xe9\x97\xbb"
    converted = core.bytes_string_to_string(raw_text, "utf-8")
    assert converted == "每日新闻"


def test_unicode_punctuations():
    """Test that punkt contains all Unicode punctuation characters for the current Python version.

    The punkt list is generated from Unicode 15.0.0 (Python 3.12) to be a superset that works
    across Python 3.10-3.12. Earlier Python versions may have fewer punctuation characters
    in their unicodedata, but punkt should always contain at least those characters.
    """
    # range() excludes its stop value, so add 1 to make the scan include sys.maxunicode itself.
    runtime_punct = {
        codepoint
        for codepoint in range(sys.maxunicode + 1)
        if unicodedata.category(chr(codepoint)).startswith("P")
    }
    # punkt should be a superset of the runtime Unicode punctuation; report exactly which
    # code points are absent so a failure is actionable.
    missing = runtime_punct - set(core.punkt)
    assert not missing, f"punkt is missing punctuation code points: {sorted(missing)}"
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
# Package version string. Two assignments appear here ("-dev4" followed by "-dev7"); when
# executed in order, the later assignment is the value that remains bound.
__version__ = "0.18.27-dev4" # pragma: no cover
__version__ = "0.18.27-dev7" # pragma: no cover
76 changes: 71 additions & 5 deletions unstructured/cleaners/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

import quopri
import re
import sys
import unicodedata
from typing import Optional, Tuple

import numpy as np
Expand Down Expand Up @@ -303,9 +301,77 @@ def replace_unicode_quotes(text: str) -> str:
return text


# Table keyed by every code point that the running interpreter's unicodedata classifies as
# punctuation (general category beginning with "P"); every key maps to None.
# Note: range(sys.maxunicode) stops one short of sys.maxunicode itself.
tbl = {
    codepoint: None
    for codepoint in range(sys.maxunicode)
    if unicodedata.category(chr(codepoint)).startswith("P")
}
# fmt: off
# Hard-coded Unicode punctuation code points (general category starting with "P"), generated
# for Unicode 15.0.0. Entries are integer code points (ord() values), not characters.
# This is the superset covering Python 3.10-3.12. Earlier Python versions may have fewer
# punctuation characters but will still work correctly (extra entries are harmless).
punkt = [33, 34, 35, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 58, 59, 63, 64, 91, 92, 93, 95, 123,
125, 161, 167, 171, 182, 183, 187, 191, 894, 903, 1370, 1371, 1372, 1373, 1374, 1375,
1417, 1418, 1470, 1472, 1475, 1478, 1523, 1524, 1545, 1546, 1548, 1549, 1563, 1565, 1566,
1567, 1642, 1643, 1644, 1645, 1748, 1792, 1793, 1794, 1795, 1796, 1797, 1798, 1799, 1800,
1801, 1802, 1803, 1804, 1805, 2039, 2040, 2041, 2096, 2097, 2098, 2099, 2100, 2101, 2102,
2103, 2104, 2105, 2106, 2107, 2108, 2109, 2110, 2142, 2404, 2405, 2416, 2557, 2678, 2800,
3191, 3204, 3572, 3663, 3674, 3675, 3844, 3845, 3846, 3847, 3848, 3849, 3850, 3851, 3852,
3853, 3854, 3855, 3856, 3857, 3858, 3860, 3898, 3899, 3900, 3901, 3973, 4048, 4049, 4050,
4051, 4052, 4057, 4058, 4170, 4171, 4172, 4173, 4174, 4175, 4347, 4960, 4961, 4962, 4963,
4964, 4965, 4966, 4967, 4968, 5120, 5742, 5787, 5788, 5867, 5868, 5869, 5941, 5942, 6100,
6101, 6102, 6104, 6105, 6106, 6144, 6145, 6146, 6147, 6148, 6149, 6150, 6151, 6152, 6153,
6154, 6468, 6469, 6686, 6687, 6816, 6817, 6818, 6819, 6820, 6821, 6822, 6824, 6825, 6826,
6827, 6828, 6829, 7002, 7003, 7004, 7005, 7006, 7007, 7008, 7037, 7038, 7164, 7165, 7166,
7167, 7227, 7228, 7229, 7230, 7231, 7294, 7295, 7360, 7361, 7362, 7363, 7364, 7365, 7366,
7367, 7379, 8208, 8209, 8210, 8211, 8212, 8213, 8214, 8215, 8216, 8217, 8218, 8219, 8220,
8221, 8222, 8223, 8224, 8225, 8226, 8227, 8228, 8229, 8230, 8231, 8240, 8241, 8242, 8243,
8244, 8245, 8246, 8247, 8248, 8249, 8250, 8251, 8252, 8253, 8254, 8255, 8256, 8257, 8258,
8259, 8261, 8262, 8263, 8264, 8265, 8266, 8267, 8268, 8269, 8270, 8271, 8272, 8273, 8275,
8276, 8277, 8278, 8279, 8280, 8281, 8282, 8283, 8284, 8285, 8286, 8317, 8318, 8333, 8334,
8968, 8969, 8970, 8971, 9001, 9002, 10088, 10089, 10090, 10091, 10092, 10093, 10094,
10095, 10096, 10097, 10098, 10099, 10100, 10101, 10181, 10182, 10214, 10215, 10216,
10217, 10218, 10219, 10220, 10221, 10222, 10223, 10627, 10628, 10629, 10630, 10631,
10632, 10633, 10634, 10635, 10636, 10637, 10638, 10639, 10640, 10641, 10642, 10643,
10644, 10645, 10646, 10647, 10648, 10712, 10713, 10714, 10715, 10748, 10749, 11513,
11514, 11515, 11516, 11518, 11519, 11632, 11776, 11777, 11778, 11779, 11780, 11781,
11782, 11783, 11784, 11785, 11786, 11787, 11788, 11789, 11790, 11791, 11792, 11793,
11794, 11795, 11796, 11797, 11798, 11799, 11800, 11801, 11802, 11803, 11804, 11805,
11806, 11807, 11808, 11809, 11810, 11811, 11812, 11813, 11814, 11815, 11816, 11817,
11818, 11819, 11820, 11821, 11822, 11824, 11825, 11826, 11827, 11828, 11829, 11830,
11831, 11832, 11833, 11834, 11835, 11836, 11837, 11838, 11839, 11840, 11841, 11842,
11843, 11844, 11845, 11846, 11847, 11848, 11849, 11850, 11851, 11852, 11853, 11854,
11855, 11858, 11859, 11860, 11861, 11862, 11863, 11864, 11865, 11866, 11867, 11868,
11869, 12289, 12290, 12291, 12296, 12297, 12298, 12299, 12300, 12301, 12302, 12303,
12304, 12305, 12308, 12309, 12310, 12311, 12312, 12313, 12314, 12315, 12316, 12317,
12318, 12319, 12336, 12349, 12448, 12539, 42238, 42239, 42509, 42510, 42511, 42611,
42622, 42738, 42739, 42740, 42741, 42742, 42743, 43124, 43125, 43126, 43127, 43214,
43215, 43256, 43257, 43258, 43260, 43310, 43311, 43359, 43457, 43458, 43459, 43460,
43461, 43462, 43463, 43464, 43465, 43466, 43467, 43468, 43469, 43486, 43487, 43612,
43613, 43614, 43615, 43742, 43743, 43760, 43761, 44011, 64830, 64831, 65040, 65041,
65042, 65043, 65044, 65045, 65046, 65047, 65048, 65049, 65072, 65073, 65074, 65075,
65076, 65077, 65078, 65079, 65080, 65081, 65082, 65083, 65084, 65085, 65086, 65087,
65088, 65089, 65090, 65091, 65092, 65093, 65094, 65095, 65096, 65097, 65098, 65099,
65100, 65101, 65102, 65103, 65104, 65105, 65106, 65108, 65109, 65110, 65111, 65112,
65113, 65114, 65115, 65116, 65117, 65118, 65119, 65120, 65121, 65123, 65128, 65130,
65131, 65281, 65282, 65283, 65285, 65286, 65287, 65288, 65289, 65290, 65292, 65293,
65294, 65295, 65306, 65307, 65311, 65312, 65339, 65340, 65341, 65343, 65371, 65373,
65375, 65376, 65377, 65378, 65379, 65380, 65381, 65792, 65793, 65794, 66463, 66512,
66927, 67671, 67871, 67903, 68176, 68177, 68178, 68179, 68180, 68181, 68182, 68183,
68184, 68223, 68336, 68337, 68338, 68339, 68340, 68341, 68342, 68409, 68410, 68411,
68412, 68413, 68414, 68415, 68505, 68506, 68507, 68508, 69293, 69461, 69462, 69463,
69464, 69465, 69510, 69511, 69512, 69513, 69703, 69704, 69705, 69706, 69707, 69708,
69709, 69819, 69820, 69822, 69823, 69824, 69825, 69952, 69953, 69954, 69955, 70004,
70005, 70085, 70086, 70087, 70088, 70093, 70107, 70109, 70110, 70111, 70200, 70201,
70202, 70203, 70204, 70205, 70313, 70731, 70732, 70733, 70734, 70735, 70746, 70747,
70749, 70854, 71105, 71106, 71107, 71108, 71109, 71110, 71111, 71112, 71113, 71114,
71115, 71116, 71117, 71118, 71119, 71120, 71121, 71122, 71123, 71124, 71125, 71126,
71127, 71233, 71234, 71235, 71264, 71265, 71266, 71267, 71268, 71269, 71270, 71271,
71272, 71273, 71274, 71275, 71276, 71353, 71484, 71485, 71486, 71739, 72004, 72005,
72006, 72162, 72255, 72256, 72257, 72258, 72259, 72260, 72261, 72262, 72346, 72347,
72348, 72350, 72351, 72352, 72353, 72354, 72448, 72449, 72450, 72451, 72452, 72453,
72454, 72455, 72456, 72457, 72769, 72770, 72771, 72772, 72773, 72816, 72817, 73463,
73464, 73539, 73540, 73541, 73542, 73543, 73544, 73545, 73546, 73547, 73548, 73549,
73550, 73551, 73727, 74864, 74865, 74866, 74867, 74868, 77809, 77810, 92782, 92783,
92917, 92983, 92984, 92985, 92986, 92987, 92996, 93847, 93848, 93849, 93850, 94178,
113823, 121479, 121480, 121481, 121482, 121483, 125278, 125279]
# fmt: on
# Map every listed code point to None (the default value dict.fromkeys assigns).
# NOTE(review): presumably passed to str.translate() by remove_punctuation so that these
# characters are deleted - confirm against that function's body.
tbl = dict.fromkeys(punkt)


def remove_punctuation(s: str) -> str:
Expand Down
Loading