Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VTT reader: fix empty line detection #440

Merged
merged 3 commits into from
Feb 12, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions src/main/python/ttconv/vtt/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,7 @@ class _State(Enum):

state = _State.START
current_p = None
subtitle_text = None

for line_index, line in enumerate(_none_terminated(lines)):

Expand Down Expand Up @@ -524,11 +525,17 @@ class _State(Enum):
if state in (_State.TEXT, _State.TEXT_MORE):

if line is None or _EMPTY_RE.fullmatch(line):
subtitle_text = subtitle_text.strip('\r\n').replace(r"\n\r", "\n")

_parse_cue_text(subtitle_text, current_p, line_index)
if subtitle_text is not None:
_parse_cue_text(
subtitle_text.strip('\r\n').replace(r"\n\r", "\n"),
current_p,
line_index
)
else:
LOGGER.warning("Ignoring cue due to a spurious blank line at line %s", line_index)

state = _State.LOOKING
subtitle_text = None
continue

if state is _State.TEXT:
Expand Down
22 changes: 22 additions & 0 deletions src/test/python/test_vtt_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,28 @@ def test_blank_lines(self):
f = io.StringIO(SAMPLE)
self.assertIsNotNone(to_model(f))

def test_malformed_blank_lines(self):
    """Check that a cue split by a spurious blank line is dropped.

    Regression test for https://github.com/sandflow/ttconv/issues/439.
    The first cue in the sample has a blank line between its timing line
    and its text, so it is malformed and should be ignored; the conversion
    must still succeed and produce a model.
    """
    vtt_source = """WEBVTT
Kind: captions
Language: en

00:00:00.799 --> 00:00:02.869 align:start position:0%

hi<00:00:01.040><c> everyone</c><00:00:01.920><c> today</c><00:00:02.240><c> we're</c><00:00:02.399><c> going</c><00:00:02.639><c> to</c><00:00:02.720><c> be</c>

00:00:02.869 --> 00:00:02.879 align:start position:0%
hi everyone today we're going to be
"""

    model = to_model(io.StringIO(vtt_source))
    self.assertIsNotNone(model)

    # The body is expected to hold exactly one child ...
    body_children = list(model.get_body())
    self.assertEqual(len(body_children), 1)

    # ... which in turn holds exactly one child (the well-formed cue).
    div_children = list(body_children[0])
    self.assertEqual(len(div_children), 1)

def test_italic(self):
f = io.StringIO(r"""WEBVTT

Expand Down