diff --git a/CHANGELOG.md b/CHANGELOG.md index f51e422e02..2992d0f1d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.18.27-dev7 + +### Fixes +- Comment no-ops in `zoom_image` (codeflash) + ## 0.18.27-dev4 ### Fixes diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py index eec8edd2b9..bbf97a3b92 100644 --- a/test_unstructured/cleaners/test_core.py +++ b/test_unstructured/cleaners/test_core.py @@ -1,4 +1,6 @@ import re +import sys +import unicodedata import pytest @@ -300,3 +302,17 @@ def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punc def test_bytes_string_to_string(): text = "\xe6\xaf\x8f\xe6\x97\xa5\xe6\x96\xb0\xe9\x97\xbb" assert core.bytes_string_to_string(text, "utf-8") == "每日新闻" + + +def test_unicode_punctuations(): + """Test that punkt contains all Unicode punctuation characters for the current Python version. + + The punkt list is generated from Unicode 15.0.0 (Python 3.12) to be a superset that works + across Python 3.10-3.12. Earlier Python versions may have fewer punctuation characters + in their unicodedata, but punkt should always contain at least those characters. 
+ """ + runtime_punct = { + i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P") + } + # punkt should be a superset of the runtime Unicode punctuation + assert runtime_punct <= set(core.punkt) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index cd393308d2..7f17a89593 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.27-dev4" # pragma: no cover +__version__ = "0.18.27-dev7" # pragma: no cover diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 22c77b9044..bdade2b6c6 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -2,8 +2,6 @@ import quopri import re -import sys -import unicodedata from typing import Optional, Tuple import numpy as np @@ -303,9 +301,77 @@ def replace_unicode_quotes(text: str) -> str: return text -tbl = dict.fromkeys( - i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P") -) +# fmt: off +# Unicode punctuation codepoints (category starting with "P"), generated from Unicode 15.0.0. +# This list is a superset of the punctuation known to Python 3.10-3.12. Older Python versions +# know fewer punctuation characters but still work correctly (the extra entries are harmless). 
+punkt = [33, 34, 35, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 58, 59, 63, 64, 91, 92, 93, 95, 123, + 125, 161, 167, 171, 182, 183, 187, 191, 894, 903, 1370, 1371, 1372, 1373, 1374, 1375, + 1417, 1418, 1470, 1472, 1475, 1478, 1523, 1524, 1545, 1546, 1548, 1549, 1563, 1565, 1566, + 1567, 1642, 1643, 1644, 1645, 1748, 1792, 1793, 1794, 1795, 1796, 1797, 1798, 1799, 1800, + 1801, 1802, 1803, 1804, 1805, 2039, 2040, 2041, 2096, 2097, 2098, 2099, 2100, 2101, 2102, + 2103, 2104, 2105, 2106, 2107, 2108, 2109, 2110, 2142, 2404, 2405, 2416, 2557, 2678, 2800, + 3191, 3204, 3572, 3663, 3674, 3675, 3844, 3845, 3846, 3847, 3848, 3849, 3850, 3851, 3852, + 3853, 3854, 3855, 3856, 3857, 3858, 3860, 3898, 3899, 3900, 3901, 3973, 4048, 4049, 4050, + 4051, 4052, 4057, 4058, 4170, 4171, 4172, 4173, 4174, 4175, 4347, 4960, 4961, 4962, 4963, + 4964, 4965, 4966, 4967, 4968, 5120, 5742, 5787, 5788, 5867, 5868, 5869, 5941, 5942, 6100, + 6101, 6102, 6104, 6105, 6106, 6144, 6145, 6146, 6147, 6148, 6149, 6150, 6151, 6152, 6153, + 6154, 6468, 6469, 6686, 6687, 6816, 6817, 6818, 6819, 6820, 6821, 6822, 6824, 6825, 6826, + 6827, 6828, 6829, 7002, 7003, 7004, 7005, 7006, 7007, 7008, 7037, 7038, 7164, 7165, 7166, + 7167, 7227, 7228, 7229, 7230, 7231, 7294, 7295, 7360, 7361, 7362, 7363, 7364, 7365, 7366, + 7367, 7379, 8208, 8209, 8210, 8211, 8212, 8213, 8214, 8215, 8216, 8217, 8218, 8219, 8220, + 8221, 8222, 8223, 8224, 8225, 8226, 8227, 8228, 8229, 8230, 8231, 8240, 8241, 8242, 8243, + 8244, 8245, 8246, 8247, 8248, 8249, 8250, 8251, 8252, 8253, 8254, 8255, 8256, 8257, 8258, + 8259, 8261, 8262, 8263, 8264, 8265, 8266, 8267, 8268, 8269, 8270, 8271, 8272, 8273, 8275, + 8276, 8277, 8278, 8279, 8280, 8281, 8282, 8283, 8284, 8285, 8286, 8317, 8318, 8333, 8334, + 8968, 8969, 8970, 8971, 9001, 9002, 10088, 10089, 10090, 10091, 10092, 10093, 10094, + 10095, 10096, 10097, 10098, 10099, 10100, 10101, 10181, 10182, 10214, 10215, 10216, + 10217, 10218, 10219, 10220, 10221, 10222, 10223, 10627, 10628, 10629, 
10630, 10631, + 10632, 10633, 10634, 10635, 10636, 10637, 10638, 10639, 10640, 10641, 10642, 10643, + 10644, 10645, 10646, 10647, 10648, 10712, 10713, 10714, 10715, 10748, 10749, 11513, + 11514, 11515, 11516, 11518, 11519, 11632, 11776, 11777, 11778, 11779, 11780, 11781, + 11782, 11783, 11784, 11785, 11786, 11787, 11788, 11789, 11790, 11791, 11792, 11793, + 11794, 11795, 11796, 11797, 11798, 11799, 11800, 11801, 11802, 11803, 11804, 11805, + 11806, 11807, 11808, 11809, 11810, 11811, 11812, 11813, 11814, 11815, 11816, 11817, + 11818, 11819, 11820, 11821, 11822, 11824, 11825, 11826, 11827, 11828, 11829, 11830, + 11831, 11832, 11833, 11834, 11835, 11836, 11837, 11838, 11839, 11840, 11841, 11842, + 11843, 11844, 11845, 11846, 11847, 11848, 11849, 11850, 11851, 11852, 11853, 11854, + 11855, 11858, 11859, 11860, 11861, 11862, 11863, 11864, 11865, 11866, 11867, 11868, + 11869, 12289, 12290, 12291, 12296, 12297, 12298, 12299, 12300, 12301, 12302, 12303, + 12304, 12305, 12308, 12309, 12310, 12311, 12312, 12313, 12314, 12315, 12316, 12317, + 12318, 12319, 12336, 12349, 12448, 12539, 42238, 42239, 42509, 42510, 42511, 42611, + 42622, 42738, 42739, 42740, 42741, 42742, 42743, 43124, 43125, 43126, 43127, 43214, + 43215, 43256, 43257, 43258, 43260, 43310, 43311, 43359, 43457, 43458, 43459, 43460, + 43461, 43462, 43463, 43464, 43465, 43466, 43467, 43468, 43469, 43486, 43487, 43612, + 43613, 43614, 43615, 43742, 43743, 43760, 43761, 44011, 64830, 64831, 65040, 65041, + 65042, 65043, 65044, 65045, 65046, 65047, 65048, 65049, 65072, 65073, 65074, 65075, + 65076, 65077, 65078, 65079, 65080, 65081, 65082, 65083, 65084, 65085, 65086, 65087, + 65088, 65089, 65090, 65091, 65092, 65093, 65094, 65095, 65096, 65097, 65098, 65099, + 65100, 65101, 65102, 65103, 65104, 65105, 65106, 65108, 65109, 65110, 65111, 65112, + 65113, 65114, 65115, 65116, 65117, 65118, 65119, 65120, 65121, 65123, 65128, 65130, + 65131, 65281, 65282, 65283, 65285, 65286, 65287, 65288, 65289, 65290, 65292, 65293, + 
65294, 65295, 65306, 65307, 65311, 65312, 65339, 65340, 65341, 65343, 65371, 65373, + 65375, 65376, 65377, 65378, 65379, 65380, 65381, 65792, 65793, 65794, 66463, 66512, + 66927, 67671, 67871, 67903, 68176, 68177, 68178, 68179, 68180, 68181, 68182, 68183, + 68184, 68223, 68336, 68337, 68338, 68339, 68340, 68341, 68342, 68409, 68410, 68411, + 68412, 68413, 68414, 68415, 68505, 68506, 68507, 68508, 69293, 69461, 69462, 69463, + 69464, 69465, 69510, 69511, 69512, 69513, 69703, 69704, 69705, 69706, 69707, 69708, + 69709, 69819, 69820, 69822, 69823, 69824, 69825, 69952, 69953, 69954, 69955, 70004, + 70005, 70085, 70086, 70087, 70088, 70093, 70107, 70109, 70110, 70111, 70200, 70201, + 70202, 70203, 70204, 70205, 70313, 70731, 70732, 70733, 70734, 70735, 70746, 70747, + 70749, 70854, 71105, 71106, 71107, 71108, 71109, 71110, 71111, 71112, 71113, 71114, + 71115, 71116, 71117, 71118, 71119, 71120, 71121, 71122, 71123, 71124, 71125, 71126, + 71127, 71233, 71234, 71235, 71264, 71265, 71266, 71267, 71268, 71269, 71270, 71271, + 71272, 71273, 71274, 71275, 71276, 71353, 71484, 71485, 71486, 71739, 72004, 72005, + 72006, 72162, 72255, 72256, 72257, 72258, 72259, 72260, 72261, 72262, 72346, 72347, + 72348, 72350, 72351, 72352, 72353, 72354, 72448, 72449, 72450, 72451, 72452, 72453, + 72454, 72455, 72456, 72457, 72769, 72770, 72771, 72772, 72773, 72816, 72817, 73463, + 73464, 73539, 73540, 73541, 73542, 73543, 73544, 73545, 73546, 73547, 73548, 73549, + 73550, 73551, 73727, 74864, 74865, 74866, 74867, 74868, 77809, 77810, 92782, 92783, + 92917, 92983, 92984, 92985, 92986, 92987, 92996, 93847, 93848, 93849, 93850, 94178, + 113823, 121479, 121480, 121481, 121482, 121483, 125278, 125279] +# fmt: on +tbl = dict.fromkeys(punkt) def remove_punctuation(s: str) -> str: