Fix reading truncated data when the next segment offset is set #326

Merged (3 commits) on Mar 4, 2024
50 changes: 33 additions & 17 deletions nptdms/reader.py
@@ -70,6 +70,11 @@ def __init__(self, tdms_file):
self._index_file_path = filepath
self._index_file = open(self._index_file_path, "rb")

if self._file is not None:
self._data_file_size = _get_file_size(self._file)
else:
self._data_file_size = None

def close(self):
if self._file is None and self._index_file is None:
# Already closed
@@ -314,21 +319,31 @@ def _read_lead_in(self, file, segment_position, is_index_file=False):
segment_incomplete = next_segment_offset == 0xFFFFFFFFFFFFFFFF
if segment_incomplete:
# Segment size is unknown. This can happen if LabVIEW crashes.
next_segment_pos = self._get_data_file_size()
next_segment_pos = self._data_file_size
else:
next_segment_pos = (
segment_position + next_segment_offset + lead_size)
if self._data_file_size is not None and next_segment_pos > self._data_file_size:
# The raw data offset is incorrect, and there is less data than expected in this segment
next_segment_pos = self._data_file_size
segment_incomplete = True

if segment_incomplete:
if next_segment_pos < data_position:
# Metadata wasn't completely written and we don't have any data in this segment,
# so don't try to read any metadata
log.warning("Last segment metadata is incomplete")
raise EOFError
# Try to read until the end of the file if we have complete metadata
log.warning(
"Last segment of file has unknown size, "
"will attempt to read to the end of the file")
else:
log.debug("Next segment offset = %d, raw data offset = %d, data size = %d b",
next_segment_offset, raw_data_offset, next_segment_offset - raw_data_offset)
next_segment_pos = (
segment_position + next_segment_offset + lead_size)
else:
# Try to read until the end of the file if we have complete metadata
log.warning(
"Last segment of file has less data than expected, "
"will attempt to read to the end of the file")

log.debug("Next segment offset = %d, raw data offset = %d, expected data size = %d b, actual data size = %d b",
next_segment_offset, raw_data_offset,
next_segment_offset - raw_data_offset,
next_segment_pos - data_position)

return segment_position, toc_mask, data_position, next_segment_pos, segment_incomplete

@@ -346,13 +361,6 @@ def _verify_segment_start(self, segment):
position) +
"Check that the tdms_index file matches the tdms data file.")

def _get_data_file_size(self):
current_pos = self._file.tell()
self._file.seek(0, os.SEEK_END)
end_pos = self._file.tell()
self._file.seek(current_pos, os.SEEK_SET)
return end_pos

def _update_object_metadata(self, segment):
""" Update object metadata using the metadata read from a single segment
"""
@@ -509,3 +517,11 @@ def _array_equal(a, b, chunk_size=100):
if not (a[offset:offset+chunk_size] == b[offset:offset+chunk_size]).all():
return False
return True


def _get_file_size(file):
current_pos = file.tell()
file.seek(0, os.SEEK_END)
end_pos = file.tell()
file.seek(current_pos, os.SEEK_SET)
return end_pos
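
Taken together, the reader.py changes above cache the data file size when the file is opened and clamp the next segment position to it when the lead-in claims more data than the file contains. The snippet below is a minimal standalone sketch of that clamping logic under simplified assumptions (plain function, no logging, no actual TDMS parsing); it is not the library's API.

def resolve_next_segment_pos(segment_position, lead_size, next_segment_offset,
                             raw_data_offset, data_file_size):
    # Sketch only: mirrors the branch structure added to _read_lead_in above.
    data_position = segment_position + lead_size + raw_data_offset
    segment_incomplete = next_segment_offset == 0xFFFFFFFFFFFFFFFF
    if segment_incomplete:
        # Segment size is unknown, e.g. LabVIEW crashed while writing
        next_segment_pos = data_file_size
    else:
        next_segment_pos = segment_position + next_segment_offset + lead_size
        if data_file_size is not None and next_segment_pos > data_file_size:
            # The offset points past the end of the file, so the segment is truncated
            next_segment_pos = data_file_size
            segment_incomplete = True
    if segment_incomplete and next_segment_pos < data_position:
        # Not even the metadata was fully written
        raise EOFError("Last segment metadata is incomplete")
    return next_segment_pos, segment_incomplete
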
44 changes: 44 additions & 0 deletions nptdms/test/test_tdms_file.py
@@ -482,6 +482,20 @@ def test_read_with_index_file(test_file, expected_data):
compare_arrays(channel_obj.data, expected_channel_data)


def test_read_index_file_only():
""" Test reading the index file directly
"""
test_file, expected_data = scenarios.single_segment_with_two_channels().values
with test_file.get_tempfile_with_index() as tdms_file_path:
with TdmsFile.open(tdms_file_path + "_index") as tdms_file:
for ((group, channel), expected_channel_data) in expected_data.items():
channel_obj = tdms_file[group][channel]
assert len(channel_obj) == len(expected_channel_data)
with pytest.raises(RuntimeError) as exc_info:
channel_obj[:]
assert "Data cannot be read from index file only" in str(exc_info.value)


@pytest.mark.skipif(sys.version_info < (3, 4), reason="pathlib only available in stdlib since 3.4")
def test_read_file_passed_as_pathlib_path():
""" Test reading a file when using a pathlib Path object
@@ -764,6 +778,36 @@ def test_incomplete_segment_with_string_data():
assert len(channel) == 0


def test_truncated_interleaved_data():
"""
Test when a segment is truncated within a row of interleaved data,
and the next segment offset is set but is beyond the end of the file.
"""
test_file = GeneratedFile()
test_file.add_segment(
("kTocMetaData", "kTocRawData", "kTocNewObjList", "kTocInterleavedData"),
segment_objects_metadata(
channel_metadata("/'group'/'channel1'", 3, 4),
channel_metadata("/'group'/'channel2'", 3, 4),
),
"01 00 00 00" "02 00 00 00"
"03 00 00 00" "04 00 00 00"
"05 00 00 00" "06 00 00 00"
"07 00 00 00",
data_size_override=4 * 2 * 4
)
with test_file.get_tempfile() as temp_file:
with TdmsFile.open(temp_file.file) as tdms_file:
group = tdms_file['group']
chan1 = group['channel1']
chan2 = group['channel2']
for chan in [chan1, chan2]:
chan_data = chan[:]
assert chan[-1] == chan_data[-1]
assert len(chan) == 3
assert len(chan_data) == 3


def test_truncated_metadata_in_last_segment():
""" Test the scenario where writing the file was aborted with part of the metadata written
"""
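
For reference, the expected lengths in test_truncated_interleaved_data above follow from this arithmetic (my cross-check, not part of the test): the lead-in declares four interleaved rows of two int32 channels, but only seven values are written, so the last, incomplete row is dropped.

declared_bytes = 4 * 2 * 4                  # data_size_override: 4 rows x 2 channels x 4 bytes = 32
written_bytes = 7 * 4                       # seven int32 values are actually present = 28
complete_rows = written_bytes // (2 * 4)    # only whole interleaved rows can be read
assert complete_rows == 3                   # hence len(chan) == 3 for both channels
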
7 changes: 5 additions & 2 deletions nptdms/test/util.py
@@ -223,7 +223,9 @@ class GeneratedFile(object):
def __init__(self):
self._content = []

def add_segment(self, toc, metadata, data, incomplete=False, binary_data=False, version=4713):
def add_segment(
self, toc, metadata, data, incomplete=False, binary_data=False, version=4713,
data_size_override=None):
metadata_bytes = _hex_to_bytes(metadata)
data_bytes = data if binary_data else _hex_to_bytes(data)
if toc is not None:
@@ -246,7 +248,8 @@ def add_segment(self, toc, metadata, data, incomplete=False, binary_data=False,
raise ValueError("Unrecognised TOC value: %s" % toc_item)
lead_in += struct.pack('<i', toc_mask)
lead_in += struct.pack('<l', version)
next_segment_offset = len(metadata_bytes) + len(data_bytes)
data_len = data_size_override if data_size_override is not None else len(data_bytes)
next_segment_offset = len(metadata_bytes) + data_len
raw_data_offset = len(metadata_bytes)
if incomplete:
lead_in += _hex_to_bytes('FF' * 8)
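
A hypothetical minimal use of the new data_size_override argument, following the pattern of the tests above (the channel path and hex values here are illustrative, not taken from the PR): declaring four int32 values in the lead-in while writing only two makes next_segment_offset point past the end of the generated file, which is exactly the condition the reader.py change handles.

test_file = GeneratedFile()
test_file.add_segment(
    ("kTocMetaData", "kTocRawData", "kTocNewObjList"),
    segment_objects_metadata(
        channel_metadata("/'group'/'channel1'", 3, 4),  # declares 4 int32 values
    ),
    "01 00 00 00" "02 00 00 00",                        # but only 2 values are written
    data_size_override=4 * 4,                           # lead-in claims 16 bytes of raw data
)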