From 2363c2e7ef9804da3c1cf7def30def1888fba225 Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Tue, 6 Jan 2026 19:25:33 -0800 Subject: [PATCH 1/5] cache unicode punctuation indices --- test_unstructured/cleaners/test_core.py | 9 +++- unstructured/cleaners/core.py | 57 ++++++++++++++++++++++--- 2 files changed, 59 insertions(+), 7 deletions(-) diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py index eec8edd2b9..13fd3f69fc 100644 --- a/test_unstructured/cleaners/test_core.py +++ b/test_unstructured/cleaners/test_core.py @@ -1,5 +1,6 @@ import re - +import sys +import unicodedata import pytest from unstructured.cleaners import core @@ -300,3 +301,9 @@ def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punc def test_bytes_string_to_string(): text = "\xe6\xaf\x8f\xe6\x97\xa5\xe6\x96\xb0\xe9\x97\xbb" assert core.bytes_string_to_string(text, "utf-8") == "每日新闻" + +def test_unicode_punctuations(): + tbl = set( + i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P") + ) + assert set(core.punkt) == tbl \ No newline at end of file diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 10fc83a180..066df5f6a1 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -2,8 +2,6 @@ import quopri import re -import sys -import unicodedata from typing import Optional, Tuple import numpy as np @@ -302,10 +300,57 @@ def replace_unicode_quotes(text: str) -> str: text = text.replace("â\x80s'", "") return text - -tbl = dict.fromkeys( - i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P") -) +punkt = [33, 34, 35, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 58, 59, 63, 64, 91, 92, 93, 95, 123, 125, 161, 167, 171, + 182, 183, 187, 191, 894, 903, 1370, 1371, 1372, 1373, 1374, 1375, 1417, 1418, 1470, 1472, 1475, 1478, 1523, + 1524, 1545, 1546, 1548, 1549, 1563, 1566, 1567, 1642, 1643, 1644, 1645, 1748, 1792, 1793, 1794, 1795, 
1796, + 1797, 1798, 1799, 1800, 1801, 1802, 1803, 1804, 1805, 2039, 2040, 2041, 2096, 2097, 2098, 2099, 2100, 2101, + 2102, 2103, 2104, 2105, 2106, 2107, 2108, 2109, 2110, 2142, 2404, 2405, 2416, 2557, 2678, 2800, 3191, 3204, + 3572, 3663, 3674, 3675, 3844, 3845, 3846, 3847, 3848, 3849, 3850, 3851, 3852, 3853, 3854, 3855, 3856, 3857, + 3858, 3860, 3898, 3899, 3900, 3901, 3973, 4048, 4049, 4050, 4051, 4052, 4057, 4058, 4170, 4171, 4172, 4173, + 4174, 4175, 4347, 4960, 4961, 4962, 4963, 4964, 4965, 4966, 4967, 4968, 5120, 5742, 5787, 5788, 5867, 5868, + 5869, 5941, 5942, 6100, 6101, 6102, 6104, 6105, 6106, 6144, 6145, 6146, 6147, 6148, 6149, 6150, 6151, 6152, + 6153, 6154, 6468, 6469, 6686, 6687, 6816, 6817, 6818, 6819, 6820, 6821, 6822, 6824, 6825, 6826, 6827, 6828, + 6829, 7002, 7003, 7004, 7005, 7006, 7007, 7008, 7164, 7165, 7166, 7167, 7227, 7228, 7229, 7230, 7231, 7294, + 7295, 7360, 7361, 7362, 7363, 7364, 7365, 7366, 7367, 7379, 8208, 8209, 8210, 8211, 8212, 8213, 8214, 8215, + 8216, 8217, 8218, 8219, 8220, 8221, 8222, 8223, 8224, 8225, 8226, 8227, 8228, 8229, 8230, 8231, 8240, 8241, + 8242, 8243, 8244, 8245, 8246, 8247, 8248, 8249, 8250, 8251, 8252, 8253, 8254, 8255, 8256, 8257, 8258, 8259, + 8261, 8262, 8263, 8264, 8265, 8266, 8267, 8268, 8269, 8270, 8271, 8272, 8273, 8275, 8276, 8277, 8278, 8279, + 8280, 8281, 8282, 8283, 8284, 8285, 8286, 8317, 8318, 8333, 8334, 8968, 8969, 8970, 8971, 9001, 9002, 10088, + 10089, 10090, 10091, 10092, 10093, 10094, 10095, 10096, 10097, 10098, 10099, 10100, 10101, 10181, 10182, + 10214, 10215, 10216, 10217, 10218, 10219, 10220, 10221, 10222, 10223, 10627, 10628, 10629, 10630, 10631, + 10632, 10633, 10634, 10635, 10636, 10637, 10638, 10639, 10640, 10641, 10642, 10643, 10644, 10645, 10646, + 10647, 10648, 10712, 10713, 10714, 10715, 10748, 10749, 11513, 11514, 11515, 11516, 11518, 11519, 11632, + 11776, 11777, 11778, 11779, 11780, 11781, 11782, 11783, 11784, 11785, 11786, 11787, 11788, 11789, 11790, + 11791, 11792, 11793, 
11794, 11795, 11796, 11797, 11798, 11799, 11800, 11801, 11802, 11803, 11804, 11805, + 11806, 11807, 11808, 11809, 11810, 11811, 11812, 11813, 11814, 11815, 11816, 11817, 11818, 11819, 11820, + 11821, 11822, 11824, 11825, 11826, 11827, 11828, 11829, 11830, 11831, 11832, 11833, 11834, 11835, 11836, + 11837, 11838, 11839, 11840, 11841, 11842, 11843, 11844, 11845, 11846, 11847, 11848, 11849, 11850, 11851, + 11852, 11853, 11854, 11855, 11858, 12289, 12290, 12291, 12296, 12297, 12298, 12299, 12300, 12301, 12302, + 12303, 12304, 12305, 12308, 12309, 12310, 12311, 12312, 12313, 12314, 12315, 12316, 12317, 12318, 12319, + 12336, 12349, 12448, 12539, 42238, 42239, 42509, 42510, 42511, 42611, 42622, 42738, 42739, 42740, 42741, + 42742, 42743, 43124, 43125, 43126, 43127, 43214, 43215, 43256, 43257, 43258, 43260, 43310, 43311, 43359, + 43457, 43458, 43459, 43460, 43461, 43462, 43463, 43464, 43465, 43466, 43467, 43468, 43469, 43486, 43487, + 43612, 43613, 43614, 43615, 43742, 43743, 43760, 43761, 44011, 64830, 64831, 65040, 65041, 65042, 65043, + 65044, 65045, 65046, 65047, 65048, 65049, 65072, 65073, 65074, 65075, 65076, 65077, 65078, 65079, 65080, + 65081, 65082, 65083, 65084, 65085, 65086, 65087, 65088, 65089, 65090, 65091, 65092, 65093, 65094, 65095, + 65096, 65097, 65098, 65099, 65100, 65101, 65102, 65103, 65104, 65105, 65106, 65108, 65109, 65110, 65111, + 65112, 65113, 65114, 65115, 65116, 65117, 65118, 65119, 65120, 65121, 65123, 65128, 65130, 65131, 65281, + 65282, 65283, 65285, 65286, 65287, 65288, 65289, 65290, 65292, 65293, 65294, 65295, 65306, 65307, 65311, + 65312, 65339, 65340, 65341, 65343, 65371, 65373, 65375, 65376, 65377, 65378, 65379, 65380, 65381, 65792, + 65793, 65794, 66463, 66512, 66927, 67671, 67871, 67903, 68176, 68177, 68178, 68179, 68180, 68181, 68182, + 68183, 68184, 68223, 68336, 68337, 68338, 68339, 68340, 68341, 68342, 68409, 68410, 68411, 68412, 68413, + 68414, 68415, 68505, 68506, 68507, 68508, 69293, 69461, 69462, 69463, 69464, 69465, 69703, 
69704, 69705, + 69706, 69707, 69708, 69709, 69819, 69820, 69822, 69823, 69824, 69825, 69952, 69953, 69954, 69955, 70004, + 70005, 70085, 70086, 70087, 70088, 70093, 70107, 70109, 70110, 70111, 70200, 70201, 70202, 70203, 70204, + 70205, 70313, 70731, 70732, 70733, 70734, 70735, 70746, 70747, 70749, 70854, 71105, 71106, 71107, 71108, + 71109, 71110, 71111, 71112, 71113, 71114, 71115, 71116, 71117, 71118, 71119, 71120, 71121, 71122, 71123, + 71124, 71125, 71126, 71127, 71233, 71234, 71235, 71264, 71265, 71266, 71267, 71268, 71269, 71270, 71271, + 71272, 71273, 71274, 71275, 71276, 71484, 71485, 71486, 71739, 72004, 72005, 72006, 72162, 72255, 72256, + 72257, 72258, 72259, 72260, 72261, 72262, 72346, 72347, 72348, 72350, 72351, 72352, 72353, 72354, 72769, + 72770, 72771, 72772, 72773, 72816, 72817, 73463, 73464, 73727, 74864, 74865, 74866, 74867, 74868, 92782, + 92783, 92917, 92983, 92984, 92985, 92986, 92987, 92996, 93847, 93848, 93849, 93850, 94178, 113823, 121479, + 121480, 121481, 121482, 121483, 125278, 125279] +tbl = dict.fromkeys(punkt) def remove_punctuation(s: str) -> str: From 3d3de3e63a9b132d9cc7dd960667ce0a133cdf1e Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Tue, 6 Jan 2026 19:36:22 -0800 Subject: [PATCH 2/5] linter fixes --- test_unstructured/cleaners/test_core.py | 7 +- unstructured/cleaners/core.py | 112 +++++++++++++----------- 2 files changed, 66 insertions(+), 53 deletions(-) diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py index 13fd3f69fc..2ae6a250b1 100644 --- a/test_unstructured/cleaners/test_core.py +++ b/test_unstructured/cleaners/test_core.py @@ -1,6 +1,7 @@ import re import sys import unicodedata + import pytest from unstructured.cleaners import core @@ -303,7 +304,7 @@ def test_bytes_string_to_string(): assert core.bytes_string_to_string(text, "utf-8") == "每日新闻" def test_unicode_punctuations(): - tbl = set( + tbl = { i for i in range(sys.maxunicode) if 
unicodedata.category(chr(i)).startswith("P") - ) - assert set(core.punkt) == tbl \ No newline at end of file + } + assert set(core.punkt) == tbl diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 066df5f6a1..f4e80e3dce 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -300,56 +300,68 @@ def replace_unicode_quotes(text: str) -> str: text = text.replace("â\x80s'", "") return text -punkt = [33, 34, 35, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 58, 59, 63, 64, 91, 92, 93, 95, 123, 125, 161, 167, 171, - 182, 183, 187, 191, 894, 903, 1370, 1371, 1372, 1373, 1374, 1375, 1417, 1418, 1470, 1472, 1475, 1478, 1523, - 1524, 1545, 1546, 1548, 1549, 1563, 1566, 1567, 1642, 1643, 1644, 1645, 1748, 1792, 1793, 1794, 1795, 1796, - 1797, 1798, 1799, 1800, 1801, 1802, 1803, 1804, 1805, 2039, 2040, 2041, 2096, 2097, 2098, 2099, 2100, 2101, - 2102, 2103, 2104, 2105, 2106, 2107, 2108, 2109, 2110, 2142, 2404, 2405, 2416, 2557, 2678, 2800, 3191, 3204, - 3572, 3663, 3674, 3675, 3844, 3845, 3846, 3847, 3848, 3849, 3850, 3851, 3852, 3853, 3854, 3855, 3856, 3857, - 3858, 3860, 3898, 3899, 3900, 3901, 3973, 4048, 4049, 4050, 4051, 4052, 4057, 4058, 4170, 4171, 4172, 4173, - 4174, 4175, 4347, 4960, 4961, 4962, 4963, 4964, 4965, 4966, 4967, 4968, 5120, 5742, 5787, 5788, 5867, 5868, - 5869, 5941, 5942, 6100, 6101, 6102, 6104, 6105, 6106, 6144, 6145, 6146, 6147, 6148, 6149, 6150, 6151, 6152, - 6153, 6154, 6468, 6469, 6686, 6687, 6816, 6817, 6818, 6819, 6820, 6821, 6822, 6824, 6825, 6826, 6827, 6828, - 6829, 7002, 7003, 7004, 7005, 7006, 7007, 7008, 7164, 7165, 7166, 7167, 7227, 7228, 7229, 7230, 7231, 7294, - 7295, 7360, 7361, 7362, 7363, 7364, 7365, 7366, 7367, 7379, 8208, 8209, 8210, 8211, 8212, 8213, 8214, 8215, - 8216, 8217, 8218, 8219, 8220, 8221, 8222, 8223, 8224, 8225, 8226, 8227, 8228, 8229, 8230, 8231, 8240, 8241, - 8242, 8243, 8244, 8245, 8246, 8247, 8248, 8249, 8250, 8251, 8252, 8253, 8254, 8255, 8256, 8257, 8258, 8259, - 8261, 
8262, 8263, 8264, 8265, 8266, 8267, 8268, 8269, 8270, 8271, 8272, 8273, 8275, 8276, 8277, 8278, 8279, - 8280, 8281, 8282, 8283, 8284, 8285, 8286, 8317, 8318, 8333, 8334, 8968, 8969, 8970, 8971, 9001, 9002, 10088, - 10089, 10090, 10091, 10092, 10093, 10094, 10095, 10096, 10097, 10098, 10099, 10100, 10101, 10181, 10182, - 10214, 10215, 10216, 10217, 10218, 10219, 10220, 10221, 10222, 10223, 10627, 10628, 10629, 10630, 10631, - 10632, 10633, 10634, 10635, 10636, 10637, 10638, 10639, 10640, 10641, 10642, 10643, 10644, 10645, 10646, - 10647, 10648, 10712, 10713, 10714, 10715, 10748, 10749, 11513, 11514, 11515, 11516, 11518, 11519, 11632, - 11776, 11777, 11778, 11779, 11780, 11781, 11782, 11783, 11784, 11785, 11786, 11787, 11788, 11789, 11790, - 11791, 11792, 11793, 11794, 11795, 11796, 11797, 11798, 11799, 11800, 11801, 11802, 11803, 11804, 11805, - 11806, 11807, 11808, 11809, 11810, 11811, 11812, 11813, 11814, 11815, 11816, 11817, 11818, 11819, 11820, - 11821, 11822, 11824, 11825, 11826, 11827, 11828, 11829, 11830, 11831, 11832, 11833, 11834, 11835, 11836, - 11837, 11838, 11839, 11840, 11841, 11842, 11843, 11844, 11845, 11846, 11847, 11848, 11849, 11850, 11851, - 11852, 11853, 11854, 11855, 11858, 12289, 12290, 12291, 12296, 12297, 12298, 12299, 12300, 12301, 12302, - 12303, 12304, 12305, 12308, 12309, 12310, 12311, 12312, 12313, 12314, 12315, 12316, 12317, 12318, 12319, - 12336, 12349, 12448, 12539, 42238, 42239, 42509, 42510, 42511, 42611, 42622, 42738, 42739, 42740, 42741, - 42742, 42743, 43124, 43125, 43126, 43127, 43214, 43215, 43256, 43257, 43258, 43260, 43310, 43311, 43359, - 43457, 43458, 43459, 43460, 43461, 43462, 43463, 43464, 43465, 43466, 43467, 43468, 43469, 43486, 43487, - 43612, 43613, 43614, 43615, 43742, 43743, 43760, 43761, 44011, 64830, 64831, 65040, 65041, 65042, 65043, - 65044, 65045, 65046, 65047, 65048, 65049, 65072, 65073, 65074, 65075, 65076, 65077, 65078, 65079, 65080, - 65081, 65082, 65083, 65084, 65085, 65086, 65087, 65088, 65089, 65090, 
65091, 65092, 65093, 65094, 65095, - 65096, 65097, 65098, 65099, 65100, 65101, 65102, 65103, 65104, 65105, 65106, 65108, 65109, 65110, 65111, - 65112, 65113, 65114, 65115, 65116, 65117, 65118, 65119, 65120, 65121, 65123, 65128, 65130, 65131, 65281, - 65282, 65283, 65285, 65286, 65287, 65288, 65289, 65290, 65292, 65293, 65294, 65295, 65306, 65307, 65311, - 65312, 65339, 65340, 65341, 65343, 65371, 65373, 65375, 65376, 65377, 65378, 65379, 65380, 65381, 65792, - 65793, 65794, 66463, 66512, 66927, 67671, 67871, 67903, 68176, 68177, 68178, 68179, 68180, 68181, 68182, - 68183, 68184, 68223, 68336, 68337, 68338, 68339, 68340, 68341, 68342, 68409, 68410, 68411, 68412, 68413, - 68414, 68415, 68505, 68506, 68507, 68508, 69293, 69461, 69462, 69463, 69464, 69465, 69703, 69704, 69705, - 69706, 69707, 69708, 69709, 69819, 69820, 69822, 69823, 69824, 69825, 69952, 69953, 69954, 69955, 70004, - 70005, 70085, 70086, 70087, 70088, 70093, 70107, 70109, 70110, 70111, 70200, 70201, 70202, 70203, 70204, - 70205, 70313, 70731, 70732, 70733, 70734, 70735, 70746, 70747, 70749, 70854, 71105, 71106, 71107, 71108, - 71109, 71110, 71111, 71112, 71113, 71114, 71115, 71116, 71117, 71118, 71119, 71120, 71121, 71122, 71123, - 71124, 71125, 71126, 71127, 71233, 71234, 71235, 71264, 71265, 71266, 71267, 71268, 71269, 71270, 71271, - 71272, 71273, 71274, 71275, 71276, 71484, 71485, 71486, 71739, 72004, 72005, 72006, 72162, 72255, 72256, - 72257, 72258, 72259, 72260, 72261, 72262, 72346, 72347, 72348, 72350, 72351, 72352, 72353, 72354, 72769, - 72770, 72771, 72772, 72773, 72816, 72817, 73463, 73464, 73727, 74864, 74865, 74866, 74867, 74868, 92782, - 92783, 92917, 92983, 92984, 92985, 92986, 92987, 92996, 93847, 93848, 93849, 93850, 94178, 113823, 121479, - 121480, 121481, 121482, 121483, 125278, 125279] +punkt = [33, 34, 35, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 58, 59, 63, 64, 91, 92, 93, 95, 123, + 125, 161, 167, 171, 182, 183, 187, 191, 894, 903, 1370, 1371, 1372, 1373, 1374, 1375, + 1417, 1418, 
1470, 1472, 1475, 1478, 1523, 1524, 1545, 1546, 1548, 1549, 1563, 1566, 1567, + 1642, 1643, 1644, 1645, 1748, 1792, 1793, 1794, 1795, 1796, 1797, 1798, 1799, 1800, 1801, + 1802, 1803, 1804, 1805, 2039, 2040, 2041, 2096, 2097, 2098, 2099, 2100, 2101, 2102, 2103, + 2104, 2105, 2106, 2107, 2108, 2109, 2110, 2142, 2404, 2405, 2416, 2557, 2678, 2800, 3191, + 3204, 3572, 3663, 3674, 3675, 3844, 3845, 3846, 3847, 3848, 3849, 3850, 3851, 3852, 3853, + 3854, 3855, 3856, 3857, 3858, 3860, 3898, 3899, 3900, 3901, 3973, 4048, 4049, 4050, 4051, + 4052, 4057, 4058, 4170, 4171, 4172, 4173, 4174, 4175, 4347, 4960, 4961, 4962, 4963, 4964, + 4965, 4966, 4967, 4968, 5120, 5742, 5787, 5788, 5867, 5868, 5869, 5941, 5942, 6100, 6101, + 6102, 6104, 6105, 6106, 6144, 6145, 6146, 6147, 6148, 6149, 6150, 6151, 6152, 6153, 6154, + 6468, 6469, 6686, 6687, 6816, 6817, 6818, 6819, 6820, 6821, 6822, 6824, 6825, 6826, 6827, + 6828, 6829, 7002, 7003, 7004, 7005, 7006, 7007, 7008, 7164, 7165, 7166, 7167, 7227, 7228, + 7229, 7230, 7231, 7294, 7295, 7360, 7361, 7362, 7363, 7364, 7365, 7366, 7367, 7379, 8208, + 8209, 8210, 8211, 8212, 8213, 8214, 8215, 8216, 8217, 8218, 8219, 8220, 8221, 8222, 8223, + 8224, 8225, 8226, 8227, 8228, 8229, 8230, 8231, 8240, 8241, 8242, 8243, 8244, 8245, 8246, + 8247, 8248, 8249, 8250, 8251, 8252, 8253, 8254, 8255, 8256, 8257, 8258, 8259, 8261, 8262, + 8263, 8264, 8265, 8266, 8267, 8268, 8269, 8270, 8271, 8272, 8273, 8275, 8276, 8277, 8278, + 8279, 8280, 8281, 8282, 8283, 8284, 8285, 8286, 8317, 8318, 8333, 8334, 8968, 8969, 8970, + 8971, 9001, 9002, 10088, 10089, 10090, 10091, 10092, 10093, 10094, 10095, 10096, 10097, + 10098, 10099, 10100, 10101, 10181, 10182, 10214, 10215, 10216, 10217, 10218, 10219, + 10220, 10221, 10222, 10223, 10627, 10628, 10629, 10630, 10631, 10632, 10633, 10634, + 10635, 10636, 10637, 10638, 10639, 10640, 10641, 10642, 10643, 10644, 10645, 10646, + 10647, 10648, 10712, 10713, 10714, 10715, 10748, 10749, 11513, 11514, 11515, 11516, + 11518, 11519, 
11632, 11776, 11777, 11778, 11779, 11780, 11781, 11782, 11783, 11784, + 11785, 11786, 11787, 11788, 11789, 11790, 11791, 11792, 11793, 11794, 11795, 11796, + 11797, 11798, 11799, 11800, 11801, 11802, 11803, 11804, 11805, 11806, 11807, 11808, + 11809, 11810, 11811, 11812, 11813, 11814, 11815, 11816, 11817, 11818, 11819, 11820, + 11821, 11822, 11824, 11825, 11826, 11827, 11828, 11829, 11830, 11831, 11832, 11833, + 11834, 11835, 11836, 11837, 11838, 11839, 11840, 11841, 11842, 11843, 11844, 11845, + 11846, 11847, 11848, 11849, 11850, 11851, 11852, 11853, 11854, 11855, 11858, 12289, + 12290, 12291, 12296, 12297, 12298, 12299, 12300, 12301, 12302, 12303, 12304, 12305, + 12308, 12309, 12310, 12311, 12312, 12313, 12314, 12315, 12316, 12317, 12318, 12319, + 12336, 12349, 12448, 12539, 42238, 42239, 42509, 42510, 42511, 42611, 42622, 42738, + 42739, 42740, 42741, 42742, 42743, 43124, 43125, 43126, 43127, 43214, 43215, 43256, + 43257, 43258, 43260, 43310, 43311, 43359, 43457, 43458, 43459, 43460, 43461, 43462, + 43463, 43464, 43465, 43466, 43467, 43468, 43469, 43486, 43487, 43612, 43613, 43614, + 43615, 43742, 43743, 43760, 43761, 44011, 64830, 64831, 65040, 65041, 65042, 65043, + 65044, 65045, 65046, 65047, 65048, 65049, 65072, 65073, 65074, 65075, 65076, 65077, + 65078, 65079, 65080, 65081, 65082, 65083, 65084, 65085, 65086, 65087, 65088, 65089, + 65090, 65091, 65092, 65093, 65094, 65095, 65096, 65097, 65098, 65099, 65100, 65101, + 65102, 65103, 65104, 65105, 65106, 65108, 65109, 65110, 65111, 65112, 65113, 65114, + 65115, 65116, 65117, 65118, 65119, 65120, 65121, 65123, 65128, 65130, 65131, 65281, + 65282, 65283, 65285, 65286, 65287, 65288, 65289, 65290, 65292, 65293, 65294, 65295, + 65306, 65307, 65311, 65312, 65339, 65340, 65341, 65343, 65371, 65373, 65375, 65376, + 65377, 65378, 65379, 65380, 65381, 65792, 65793, 65794, 66463, 66512, 66927, 67671, + 67871, 67903, 68176, 68177, 68178, 68179, 68180, 68181, 68182, 68183, 68184, 68223, + 68336, 68337, 68338, 68339, 68340, 
68341, 68342, 68409, 68410, 68411, 68412, 68413, + 68414, 68415, 68505, 68506, 68507, 68508, 69293, 69461, 69462, 69463, 69464, 69465, + 69703, 69704, 69705, 69706, 69707, 69708, 69709, 69819, 69820, 69822, 69823, 69824, + 69825, 69952, 69953, 69954, 69955, 70004, 70005, 70085, 70086, 70087, 70088, 70093, + 70107, 70109, 70110, 70111, 70200, 70201, 70202, 70203, 70204, 70205, 70313, 70731, + 70732, 70733, 70734, 70735, 70746, 70747, 70749, 70854, 71105, 71106, 71107, 71108, + 71109, 71110, 71111, 71112, 71113, 71114, 71115, 71116, 71117, 71118, 71119, 71120, + 71121, 71122, 71123, 71124, 71125, 71126, 71127, 71233, 71234, 71235, 71264, 71265, + 71266, 71267, 71268, 71269, 71270, 71271, 71272, 71273, 71274, 71275, 71276, 71484, + 71485, 71486, 71739, 72004, 72005, 72006, 72162, 72255, 72256, 72257, 72258, 72259, + 72260, 72261, 72262, 72346, 72347, 72348, 72350, 72351, 72352, 72353, 72354, 72769, + 72770, 72771, 72772, 72773, 72816, 72817, 73463, 73464, 73727, 74864, 74865, 74866, + 74867, 74868, 92782, 92783, 92917, 92983, 92984, 92985, 92986, 92987, 92996, 93847, + 93848, 93849, 93850, 94178, 113823, 121479, 121480, 121481, 121482, 121483, 125278, + 125279] tbl = dict.fromkeys(punkt) From d45c4f396717c88db0b727b6bdd417137013b230 Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Tue, 6 Jan 2026 19:52:50 -0800 Subject: [PATCH 3/5] changelog and version --- CHANGELOG.md | 5 +++++ unstructured/__version__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f51e422e02..2992d0f1d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.18.27-dev7 + +### Fixes +- Comment no-ops in `zoom_image` (codeflash) + ## 0.18.27-dev4 ### Fixes diff --git a/unstructured/__version__.py b/unstructured/__version__.py index cd393308d2..7f17a89593 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.27-dev4" # pragma: no cover +__version__ = "0.18.27-dev7" # pragma: no 
cover From 9059892f738b7baa2657acef9788d0c0b7a2abd5 Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Tue, 6 Jan 2026 19:58:29 -0800 Subject: [PATCH 4/5] formatting fix --- test_unstructured/cleaners/test_core.py | 5 ++--- unstructured/cleaners/core.py | 3 +++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py index 2ae6a250b1..b950e0ce4d 100644 --- a/test_unstructured/cleaners/test_core.py +++ b/test_unstructured/cleaners/test_core.py @@ -303,8 +303,7 @@ def test_bytes_string_to_string(): text = "\xe6\xaf\x8f\xe6\x97\xa5\xe6\x96\xb0\xe9\x97\xbb" assert core.bytes_string_to_string(text, "utf-8") == "每日新闻" + def test_unicode_punctuations(): - tbl = { - i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P") - } + tbl = {i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")} assert set(core.punkt) == tbl diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 6951f9fa00..89e3894a7d 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -300,6 +300,8 @@ def replace_unicode_quotes(text: str) -> str: text = text.replace("â\x80s'", "") return text + +# fmt: off punkt = [33, 34, 35, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 58, 59, 63, 64, 91, 92, 93, 95, 123, 125, 161, 167, 171, 182, 183, 187, 191, 894, 903, 1370, 1371, 1372, 1373, 1374, 1375, 1417, 1418, 1470, 1472, 1475, 1478, 1523, 1524, 1545, 1546, 1548, 1549, 1563, 1566, 1567, @@ -362,6 +364,7 @@ def replace_unicode_quotes(text: str) -> str: 74867, 74868, 92782, 92783, 92917, 92983, 92984, 92985, 92986, 92987, 92996, 93847, 93848, 93849, 93850, 94178, 113823, 121479, 121480, 121481, 121482, 121483, 125278, 125279] +# fmt: on tbl = dict.fromkeys(punkt) From b3a26365a4aec0e57c3bad00a400a6e1b48bd654 Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Tue, 6 Jan 2026 21:17:05 -0800 Subject: [PATCH 5/5] unicode superset to cover 
python 3.10-3.12 --- test_unstructured/cleaners/test_core.py | 13 ++- unstructured/cleaners/core.py | 126 +++++++++++++----------- 2 files changed, 77 insertions(+), 62 deletions(-) diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py index b950e0ce4d..bbf97a3b92 100644 --- a/test_unstructured/cleaners/test_core.py +++ b/test_unstructured/cleaners/test_core.py @@ -305,5 +305,14 @@ def test_bytes_string_to_string(): def test_unicode_punctuations(): - tbl = {i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")} - assert set(core.punkt) == tbl + """Test that punkt contains all Unicode punctuation characters for the current Python version. + + The punkt list is generated from Unicode 15.0.0 (Python 3.12). Python 3.10 and 3.11 bundle + older Unicode data with fewer punctuation characters, so punkt is a superset there; a newer + Unicode database may add punctuation missing from punkt, which then needs regenerating. + """ + runtime_punct = { + i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P") + } + # punkt should be a superset of the runtime Unicode punctuation + assert runtime_punct <= set(core.punkt) diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 89e3894a7d..bdade2b6c6 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -302,68 +302,74 @@ def replace_unicode_quotes(text: str) -> str: # fmt: off +# Unicode punctuation codepoints (category starting with "P"), generated from Unicode 15.0.0. +# This is a superset covering Python 3.10-3.12: older bundled Unicode data has fewer entries and +# the extras are harmless. A newer Unicode database may add punctuation not listed here.
punkt = [33, 34, 35, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 58, 59, 63, 64, 91, 92, 93, 95, 123, 125, 161, 167, 171, 182, 183, 187, 191, 894, 903, 1370, 1371, 1372, 1373, 1374, 1375, - 1417, 1418, 1470, 1472, 1475, 1478, 1523, 1524, 1545, 1546, 1548, 1549, 1563, 1566, 1567, - 1642, 1643, 1644, 1645, 1748, 1792, 1793, 1794, 1795, 1796, 1797, 1798, 1799, 1800, 1801, - 1802, 1803, 1804, 1805, 2039, 2040, 2041, 2096, 2097, 2098, 2099, 2100, 2101, 2102, 2103, - 2104, 2105, 2106, 2107, 2108, 2109, 2110, 2142, 2404, 2405, 2416, 2557, 2678, 2800, 3191, - 3204, 3572, 3663, 3674, 3675, 3844, 3845, 3846, 3847, 3848, 3849, 3850, 3851, 3852, 3853, - 3854, 3855, 3856, 3857, 3858, 3860, 3898, 3899, 3900, 3901, 3973, 4048, 4049, 4050, 4051, - 4052, 4057, 4058, 4170, 4171, 4172, 4173, 4174, 4175, 4347, 4960, 4961, 4962, 4963, 4964, - 4965, 4966, 4967, 4968, 5120, 5742, 5787, 5788, 5867, 5868, 5869, 5941, 5942, 6100, 6101, - 6102, 6104, 6105, 6106, 6144, 6145, 6146, 6147, 6148, 6149, 6150, 6151, 6152, 6153, 6154, - 6468, 6469, 6686, 6687, 6816, 6817, 6818, 6819, 6820, 6821, 6822, 6824, 6825, 6826, 6827, - 6828, 6829, 7002, 7003, 7004, 7005, 7006, 7007, 7008, 7164, 7165, 7166, 7167, 7227, 7228, - 7229, 7230, 7231, 7294, 7295, 7360, 7361, 7362, 7363, 7364, 7365, 7366, 7367, 7379, 8208, - 8209, 8210, 8211, 8212, 8213, 8214, 8215, 8216, 8217, 8218, 8219, 8220, 8221, 8222, 8223, - 8224, 8225, 8226, 8227, 8228, 8229, 8230, 8231, 8240, 8241, 8242, 8243, 8244, 8245, 8246, - 8247, 8248, 8249, 8250, 8251, 8252, 8253, 8254, 8255, 8256, 8257, 8258, 8259, 8261, 8262, - 8263, 8264, 8265, 8266, 8267, 8268, 8269, 8270, 8271, 8272, 8273, 8275, 8276, 8277, 8278, - 8279, 8280, 8281, 8282, 8283, 8284, 8285, 8286, 8317, 8318, 8333, 8334, 8968, 8969, 8970, - 8971, 9001, 9002, 10088, 10089, 10090, 10091, 10092, 10093, 10094, 10095, 10096, 10097, - 10098, 10099, 10100, 10101, 10181, 10182, 10214, 10215, 10216, 10217, 10218, 10219, - 10220, 10221, 10222, 10223, 10627, 10628, 10629, 10630, 10631, 10632, 
10633, 10634, - 10635, 10636, 10637, 10638, 10639, 10640, 10641, 10642, 10643, 10644, 10645, 10646, - 10647, 10648, 10712, 10713, 10714, 10715, 10748, 10749, 11513, 11514, 11515, 11516, - 11518, 11519, 11632, 11776, 11777, 11778, 11779, 11780, 11781, 11782, 11783, 11784, - 11785, 11786, 11787, 11788, 11789, 11790, 11791, 11792, 11793, 11794, 11795, 11796, - 11797, 11798, 11799, 11800, 11801, 11802, 11803, 11804, 11805, 11806, 11807, 11808, - 11809, 11810, 11811, 11812, 11813, 11814, 11815, 11816, 11817, 11818, 11819, 11820, - 11821, 11822, 11824, 11825, 11826, 11827, 11828, 11829, 11830, 11831, 11832, 11833, - 11834, 11835, 11836, 11837, 11838, 11839, 11840, 11841, 11842, 11843, 11844, 11845, - 11846, 11847, 11848, 11849, 11850, 11851, 11852, 11853, 11854, 11855, 11858, 12289, - 12290, 12291, 12296, 12297, 12298, 12299, 12300, 12301, 12302, 12303, 12304, 12305, - 12308, 12309, 12310, 12311, 12312, 12313, 12314, 12315, 12316, 12317, 12318, 12319, - 12336, 12349, 12448, 12539, 42238, 42239, 42509, 42510, 42511, 42611, 42622, 42738, - 42739, 42740, 42741, 42742, 42743, 43124, 43125, 43126, 43127, 43214, 43215, 43256, - 43257, 43258, 43260, 43310, 43311, 43359, 43457, 43458, 43459, 43460, 43461, 43462, - 43463, 43464, 43465, 43466, 43467, 43468, 43469, 43486, 43487, 43612, 43613, 43614, - 43615, 43742, 43743, 43760, 43761, 44011, 64830, 64831, 65040, 65041, 65042, 65043, - 65044, 65045, 65046, 65047, 65048, 65049, 65072, 65073, 65074, 65075, 65076, 65077, - 65078, 65079, 65080, 65081, 65082, 65083, 65084, 65085, 65086, 65087, 65088, 65089, - 65090, 65091, 65092, 65093, 65094, 65095, 65096, 65097, 65098, 65099, 65100, 65101, - 65102, 65103, 65104, 65105, 65106, 65108, 65109, 65110, 65111, 65112, 65113, 65114, - 65115, 65116, 65117, 65118, 65119, 65120, 65121, 65123, 65128, 65130, 65131, 65281, - 65282, 65283, 65285, 65286, 65287, 65288, 65289, 65290, 65292, 65293, 65294, 65295, - 65306, 65307, 65311, 65312, 65339, 65340, 65341, 65343, 65371, 65373, 65375, 65376, - 
65377, 65378, 65379, 65380, 65381, 65792, 65793, 65794, 66463, 66512, 66927, 67671, - 67871, 67903, 68176, 68177, 68178, 68179, 68180, 68181, 68182, 68183, 68184, 68223, - 68336, 68337, 68338, 68339, 68340, 68341, 68342, 68409, 68410, 68411, 68412, 68413, - 68414, 68415, 68505, 68506, 68507, 68508, 69293, 69461, 69462, 69463, 69464, 69465, - 69703, 69704, 69705, 69706, 69707, 69708, 69709, 69819, 69820, 69822, 69823, 69824, - 69825, 69952, 69953, 69954, 69955, 70004, 70005, 70085, 70086, 70087, 70088, 70093, - 70107, 70109, 70110, 70111, 70200, 70201, 70202, 70203, 70204, 70205, 70313, 70731, - 70732, 70733, 70734, 70735, 70746, 70747, 70749, 70854, 71105, 71106, 71107, 71108, - 71109, 71110, 71111, 71112, 71113, 71114, 71115, 71116, 71117, 71118, 71119, 71120, - 71121, 71122, 71123, 71124, 71125, 71126, 71127, 71233, 71234, 71235, 71264, 71265, - 71266, 71267, 71268, 71269, 71270, 71271, 71272, 71273, 71274, 71275, 71276, 71484, - 71485, 71486, 71739, 72004, 72005, 72006, 72162, 72255, 72256, 72257, 72258, 72259, - 72260, 72261, 72262, 72346, 72347, 72348, 72350, 72351, 72352, 72353, 72354, 72769, - 72770, 72771, 72772, 72773, 72816, 72817, 73463, 73464, 73727, 74864, 74865, 74866, - 74867, 74868, 92782, 92783, 92917, 92983, 92984, 92985, 92986, 92987, 92996, 93847, - 93848, 93849, 93850, 94178, 113823, 121479, 121480, 121481, 121482, 121483, 125278, - 125279] + 1417, 1418, 1470, 1472, 1475, 1478, 1523, 1524, 1545, 1546, 1548, 1549, 1563, 1565, 1566, + 1567, 1642, 1643, 1644, 1645, 1748, 1792, 1793, 1794, 1795, 1796, 1797, 1798, 1799, 1800, + 1801, 1802, 1803, 1804, 1805, 2039, 2040, 2041, 2096, 2097, 2098, 2099, 2100, 2101, 2102, + 2103, 2104, 2105, 2106, 2107, 2108, 2109, 2110, 2142, 2404, 2405, 2416, 2557, 2678, 2800, + 3191, 3204, 3572, 3663, 3674, 3675, 3844, 3845, 3846, 3847, 3848, 3849, 3850, 3851, 3852, + 3853, 3854, 3855, 3856, 3857, 3858, 3860, 3898, 3899, 3900, 3901, 3973, 4048, 4049, 4050, + 4051, 4052, 4057, 4058, 4170, 4171, 4172, 4173, 4174, 4175, 
4347, 4960, 4961, 4962, 4963, + 4964, 4965, 4966, 4967, 4968, 5120, 5742, 5787, 5788, 5867, 5868, 5869, 5941, 5942, 6100, + 6101, 6102, 6104, 6105, 6106, 6144, 6145, 6146, 6147, 6148, 6149, 6150, 6151, 6152, 6153, + 6154, 6468, 6469, 6686, 6687, 6816, 6817, 6818, 6819, 6820, 6821, 6822, 6824, 6825, 6826, + 6827, 6828, 6829, 7002, 7003, 7004, 7005, 7006, 7007, 7008, 7037, 7038, 7164, 7165, 7166, + 7167, 7227, 7228, 7229, 7230, 7231, 7294, 7295, 7360, 7361, 7362, 7363, 7364, 7365, 7366, + 7367, 7379, 8208, 8209, 8210, 8211, 8212, 8213, 8214, 8215, 8216, 8217, 8218, 8219, 8220, + 8221, 8222, 8223, 8224, 8225, 8226, 8227, 8228, 8229, 8230, 8231, 8240, 8241, 8242, 8243, + 8244, 8245, 8246, 8247, 8248, 8249, 8250, 8251, 8252, 8253, 8254, 8255, 8256, 8257, 8258, + 8259, 8261, 8262, 8263, 8264, 8265, 8266, 8267, 8268, 8269, 8270, 8271, 8272, 8273, 8275, + 8276, 8277, 8278, 8279, 8280, 8281, 8282, 8283, 8284, 8285, 8286, 8317, 8318, 8333, 8334, + 8968, 8969, 8970, 8971, 9001, 9002, 10088, 10089, 10090, 10091, 10092, 10093, 10094, + 10095, 10096, 10097, 10098, 10099, 10100, 10101, 10181, 10182, 10214, 10215, 10216, + 10217, 10218, 10219, 10220, 10221, 10222, 10223, 10627, 10628, 10629, 10630, 10631, + 10632, 10633, 10634, 10635, 10636, 10637, 10638, 10639, 10640, 10641, 10642, 10643, + 10644, 10645, 10646, 10647, 10648, 10712, 10713, 10714, 10715, 10748, 10749, 11513, + 11514, 11515, 11516, 11518, 11519, 11632, 11776, 11777, 11778, 11779, 11780, 11781, + 11782, 11783, 11784, 11785, 11786, 11787, 11788, 11789, 11790, 11791, 11792, 11793, + 11794, 11795, 11796, 11797, 11798, 11799, 11800, 11801, 11802, 11803, 11804, 11805, + 11806, 11807, 11808, 11809, 11810, 11811, 11812, 11813, 11814, 11815, 11816, 11817, + 11818, 11819, 11820, 11821, 11822, 11824, 11825, 11826, 11827, 11828, 11829, 11830, + 11831, 11832, 11833, 11834, 11835, 11836, 11837, 11838, 11839, 11840, 11841, 11842, + 11843, 11844, 11845, 11846, 11847, 11848, 11849, 11850, 11851, 11852, 11853, 11854, + 11855, 11858, 
11859, 11860, 11861, 11862, 11863, 11864, 11865, 11866, 11867, 11868, + 11869, 12289, 12290, 12291, 12296, 12297, 12298, 12299, 12300, 12301, 12302, 12303, + 12304, 12305, 12308, 12309, 12310, 12311, 12312, 12313, 12314, 12315, 12316, 12317, + 12318, 12319, 12336, 12349, 12448, 12539, 42238, 42239, 42509, 42510, 42511, 42611, + 42622, 42738, 42739, 42740, 42741, 42742, 42743, 43124, 43125, 43126, 43127, 43214, + 43215, 43256, 43257, 43258, 43260, 43310, 43311, 43359, 43457, 43458, 43459, 43460, + 43461, 43462, 43463, 43464, 43465, 43466, 43467, 43468, 43469, 43486, 43487, 43612, + 43613, 43614, 43615, 43742, 43743, 43760, 43761, 44011, 64830, 64831, 65040, 65041, + 65042, 65043, 65044, 65045, 65046, 65047, 65048, 65049, 65072, 65073, 65074, 65075, + 65076, 65077, 65078, 65079, 65080, 65081, 65082, 65083, 65084, 65085, 65086, 65087, + 65088, 65089, 65090, 65091, 65092, 65093, 65094, 65095, 65096, 65097, 65098, 65099, + 65100, 65101, 65102, 65103, 65104, 65105, 65106, 65108, 65109, 65110, 65111, 65112, + 65113, 65114, 65115, 65116, 65117, 65118, 65119, 65120, 65121, 65123, 65128, 65130, + 65131, 65281, 65282, 65283, 65285, 65286, 65287, 65288, 65289, 65290, 65292, 65293, + 65294, 65295, 65306, 65307, 65311, 65312, 65339, 65340, 65341, 65343, 65371, 65373, + 65375, 65376, 65377, 65378, 65379, 65380, 65381, 65792, 65793, 65794, 66463, 66512, + 66927, 67671, 67871, 67903, 68176, 68177, 68178, 68179, 68180, 68181, 68182, 68183, + 68184, 68223, 68336, 68337, 68338, 68339, 68340, 68341, 68342, 68409, 68410, 68411, + 68412, 68413, 68414, 68415, 68505, 68506, 68507, 68508, 69293, 69461, 69462, 69463, + 69464, 69465, 69510, 69511, 69512, 69513, 69703, 69704, 69705, 69706, 69707, 69708, + 69709, 69819, 69820, 69822, 69823, 69824, 69825, 69952, 69953, 69954, 69955, 70004, + 70005, 70085, 70086, 70087, 70088, 70093, 70107, 70109, 70110, 70111, 70200, 70201, + 70202, 70203, 70204, 70205, 70313, 70731, 70732, 70733, 70734, 70735, 70746, 70747, + 70749, 70854, 71105, 71106, 71107, 
71108, 71109, 71110, 71111, 71112, 71113, 71114, + 71115, 71116, 71117, 71118, 71119, 71120, 71121, 71122, 71123, 71124, 71125, 71126, + 71127, 71233, 71234, 71235, 71264, 71265, 71266, 71267, 71268, 71269, 71270, 71271, + 71272, 71273, 71274, 71275, 71276, 71353, 71484, 71485, 71486, 71739, 72004, 72005, + 72006, 72162, 72255, 72256, 72257, 72258, 72259, 72260, 72261, 72262, 72346, 72347, + 72348, 72350, 72351, 72352, 72353, 72354, 72448, 72449, 72450, 72451, 72452, 72453, + 72454, 72455, 72456, 72457, 72769, 72770, 72771, 72772, 72773, 72816, 72817, 73463, + 73464, 73539, 73540, 73541, 73542, 73543, 73544, 73545, 73546, 73547, 73548, 73549, + 73550, 73551, 73727, 74864, 74865, 74866, 74867, 74868, 77809, 77810, 92782, 92783, + 92917, 92983, 92984, 92985, 92986, 92987, 92996, 93847, 93848, 93849, 93850, 94178, + 113823, 121479, 121480, 121481, 121482, 121483, 125278, 125279] # fmt: on tbl = dict.fromkeys(punkt)