From ff75c944e7cf653fae0ccf2e35a6d16a3781b302 Mon Sep 17 00:00:00 2001 From: Pierre-Anthony Lemieux Date: Fri, 7 Feb 2025 12:41:42 -0800 Subject: [PATCH 1/3] VTT reader: avoid crashing on malformed VTT cue --- src/main/python/ttconv/vtt/reader.py | 13 ++++++++++--- src/test/python/test_vtt_reader.py | 22 ++++++++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/src/main/python/ttconv/vtt/reader.py b/src/main/python/ttconv/vtt/reader.py index 90f001f6..ad0728cd 100644 --- a/src/main/python/ttconv/vtt/reader.py +++ b/src/main/python/ttconv/vtt/reader.py @@ -451,6 +451,7 @@ class _State(Enum): state = _State.START current_p = None + subtitle_text = None for line_index, line in enumerate(_none_terminated(lines)): @@ -524,11 +525,17 @@ class _State(Enum): if state in (_State.TEXT, _State.TEXT_MORE): if line is None or _EMPTY_RE.fullmatch(line): - subtitle_text = subtitle_text.strip('\r\n').replace(r"\n\r", "\n") - - _parse_cue_text(subtitle_text, current_p, line_index) + if subtitle_text is not None: + _parse_cue_text( + subtitle_text.strip('\r\n').replace(r"\n\r", "\n"), + current_p, + line_index + ) + else: + LOGGER.warning("Ignoring cue due to a spurious blank line at line %s", line_index) state = _State.LOOKING + subtitle_text = None continue if state is _State.TEXT: diff --git a/src/test/python/test_vtt_reader.py b/src/test/python/test_vtt_reader.py index cf5e81b8..677a3a72 100644 --- a/src/test/python/test_vtt_reader.py +++ b/src/test/python/test_vtt_reader.py @@ -97,6 +97,28 @@ def test_blank_lines(self): f = io.StringIO(SAMPLE) self.assertIsNotNone(to_model(f)) + def test_malformed_blank_lines(self): + # from https://github.com/sandflow/ttconv/issues/439 + # the first cue should be ignored since it is malformed + SAMPLE = """WEBVTT +Kind: captions +Language: en + +00:00:00.799 --> 00:00:02.869 align:start position:0% + +hi<00:00:01.040> everyone<00:00:01.920> today<00:00:02.240> we're<00:00:02.399> going<00:00:02.639> to<00:00:02.720> be + +00:00:02.869 --> 00:00:02.879 align:start position:0% +hi everyone today we're going to be +""" + + doc = to_model(io.StringIO(SAMPLE)) + self.assertIsNotNone(doc) + body = list(doc.get_body()) + self.assertEqual(len(body), 1) + div = list(body[0]) + self.assertEqual(len(div), 1) + def test_italic(self): f = io.StringIO(r"""WEBVTT From 4652bf541b7cb50bc5d3a33c86b0f66681c81da7 Mon Sep 17 00:00:00 2001 From: Pierre-Anthony Lemieux Date: Mon, 10 Feb 2025 10:28:21 -0800 Subject: [PATCH 2/3] Fix empty line check --- src/main/python/ttconv/vtt/reader.py | 6 ++---- src/test/python/test_vtt_reader.py | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/main/python/ttconv/vtt/reader.py b/src/main/python/ttconv/vtt/reader.py index ad0728cd..86e1e54d 100644 --- a/src/main/python/ttconv/vtt/reader.py +++ b/src/main/python/ttconv/vtt/reader.py @@ -191,7 +191,7 @@ def _make_span(self, parent: model.ContentElement) -> model.Span: return span -_EMPTY_RE = re.compile(r"\s+") +_EMPTY_RE = re.compile(r"[\n\r]*") _DEFAULT_FONT_STACK = (styles.GenericFontFamilyType.sansSerif,) _DEFAULT_FONT_SIZE = styles.LengthType(15 * 5, styles.LengthType.Units.pct) # 5vh for ttp:cellResolution="32 15" _DEFAULT_TEXT_COLOR = styles.NamedColors.white.value @@ -451,7 +451,6 @@ class _State(Enum): state = _State.START current_p = None - subtitle_text = None for line_index, line in enumerate(_none_terminated(lines)): @@ -519,7 +518,7 @@ class _State(Enum): current_p.set_region(_get_or_make_region(doc, cue_params[3:])) state = _State.TEXT - + subtitle_text = None continue if state in (_State.TEXT, _State.TEXT_MORE): @@ -535,7 +534,6 @@ class _State(Enum): LOGGER.warning("Ignoring cue due to a spurious blank line at line %s", line_index) state = _State.LOOKING - subtitle_text = None continue if state is _State.TEXT: diff --git a/src/test/python/test_vtt_reader.py b/src/test/python/test_vtt_reader.py index 677a3a72..013932c8 100644 --- a/src/test/python/test_vtt_reader.py +++ b/src/test/python/test_vtt_reader.py @@ -119,6 +119,28 @@ def test_malformed_blank_lines(self): div = list(body[0]) self.assertEqual(len(div), 1) + def test_single_line_with_space(self): + # from https://github.com/sandflow/ttconv/issues/439 + # the first cue is not ignored since the first line contains a single space + SAMPLE = """WEBVTT +Kind: captions +Language: en + +00:00:00.799 --> 00:00:02.869 align:start position:0% + +hi<00:00:01.040> everyone<00:00:01.920> today<00:00:02.240> we're<00:00:02.399> going<00:00:02.639> to<00:00:02.720> be + +00:00:02.869 --> 00:00:02.879 align:start position:0% +hi everyone today we're going to be +""" + + doc = to_model(io.StringIO(SAMPLE)) + self.assertIsNotNone(doc) + body = list(doc.get_body()) + self.assertEqual(len(body), 1) + div = list(body[0]) + self.assertEqual(len(div), 2) + def test_italic(self): f = io.StringIO(r"""WEBVTT From 2389976842efc1b73cb990d3ddffc83eab4cee09 Mon Sep 17 00:00:00 2001 From: Pierre-Anthony Lemieux Date: Tue, 11 Feb 2025 08:17:21 -0800 Subject: [PATCH 3/3] Fix empty line check --- src/test/python/test_vtt_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/python/test_vtt_reader.py b/src/test/python/test_vtt_reader.py index 013932c8..36161578 100644 --- a/src/test/python/test_vtt_reader.py +++ b/src/test/python/test_vtt_reader.py @@ -127,7 +127,7 @@ def test_single_line_with_space(self): Language: en 00:00:00.799 --> 00:00:02.869 align:start position:0% - +\x20 hi<00:00:01.040> everyone<00:00:01.920> today<00:00:02.240> we're<00:00:02.399> going<00:00:02.639> to<00:00:02.720> be 00:00:02.869 --> 00:00:02.879 align:start position:0%