From 1a118e02a7b347830f1023b2f530cf9c1f1636da Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Tue, 5 Mar 2024 09:02:49 +1300 Subject: [PATCH] Fix reading truncated data when the next segment offset is set (#326) --- nptdms/reader.py | 50 +++++++++++++++++++++++------------ nptdms/test/test_tdms_file.py | 44 ++++++++++++++++++++++++++++++ nptdms/test/util.py | 7 +++-- 3 files changed, 82 insertions(+), 19 deletions(-) diff --git a/nptdms/reader.py b/nptdms/reader.py index b64d27e..12f284b 100644 --- a/nptdms/reader.py +++ b/nptdms/reader.py @@ -70,6 +70,11 @@ def __init__(self, tdms_file): self._index_file_path = filepath self._index_file = open(self._index_file_path, "rb") + if self._file is not None: + self._data_file_size = _get_file_size(self._file) + else: + self._data_file_size = None + def close(self): if self._file is None and self._index_file is None: # Already closed @@ -314,21 +319,31 @@ def _read_lead_in(self, file, segment_position, is_index_file=False): segment_incomplete = next_segment_offset == 0xFFFFFFFFFFFFFFFF if segment_incomplete: # Segment size is unknown. This can happen if LabVIEW crashes. - next_segment_pos = self._get_data_file_size() + next_segment_pos = self._data_file_size + else: + next_segment_pos = ( + segment_position + next_segment_offset + lead_size) + if self._data_file_size is not None and next_segment_pos > self._data_file_size: + # The raw data offset is incorrect, and there is less data than expected in this segment + next_segment_pos = self._data_file_size + segment_incomplete = True + + if segment_incomplete: if next_segment_pos < data_position: # Metadata wasn't completely written and don't have any data in this segment, # don't try to read any metadata log.warning("Last segment metadata is incomplete") raise EOFError - # Try to read until the end of the file if we have complete metadata - log.warning( - "Last segment of file has unknown size, " - "will attempt to read to the end of the file") - else: - log.debug("Next segment offset = %d, raw data offset = %d, data size = %d b", - next_segment_offset, raw_data_offset, next_segment_offset - raw_data_offset) - next_segment_pos = ( - segment_position + next_segment_offset + lead_size) + else: + # Try to read until the end of the file if we have complete metadata + log.warning( + "Last segment of file has less data than expected, " + "will attempt to read to the end of the file") + + log.debug("Next segment offset = %d, raw data offset = %d, expected data size = %d b, actual data size = %d b", + next_segment_offset, raw_data_offset, + next_segment_offset - raw_data_offset, + next_segment_pos - data_position) return segment_position, toc_mask, data_position, next_segment_pos, segment_incomplete @@ -346,13 +361,6 @@ def _verify_segment_start(self, segment): position) + "Check that the tdms_index file matches the tdms data file.") - def _get_data_file_size(self): - current_pos = self._file.tell() - self._file.seek(0, os.SEEK_END) - end_pos = self._file.tell() - self._file.seek(current_pos, os.SEEK_SET) - return end_pos - def _update_object_metadata(self, segment): """ Update object metadata using the metadata read from a single segment """ @@ -509,3 +517,11 @@ def _array_equal(a, b, chunk_size=100): if not (a[offset:offset+chunk_size] == b[offset:offset+chunk_size]).all(): return False return True + + +def _get_file_size(file): + current_pos = file.tell() + file.seek(0, os.SEEK_END) + end_pos = file.tell() + file.seek(current_pos, os.SEEK_SET) + return end_pos diff --git a/nptdms/test/test_tdms_file.py b/nptdms/test/test_tdms_file.py index 956a430..e4b1cc6 100644 --- a/nptdms/test/test_tdms_file.py +++ b/nptdms/test/test_tdms_file.py @@ -482,6 +482,20 @@ def test_read_with_index_file(test_file, expected_data): compare_arrays(channel_obj.data, expected_channel_data) +def test_read_index_file_only(): + """ Test reading the index file directly + """ + test_file, expected_data = scenarios.single_segment_with_two_channels().values + with test_file.get_tempfile_with_index() as tdms_file_path: + with TdmsFile.open(tdms_file_path + "_index") as tdms_file: + for ((group, channel), expected_channel_data) in expected_data.items(): + channel_obj = tdms_file[group][channel] + assert len(channel_obj) == len(expected_channel_data) + with pytest.raises(RuntimeError) as exc_info: + channel_obj[:] + assert "Data cannot be read from index file only" in str(exc_info.value) + + @pytest.mark.skipif(sys.version_info < (3, 4), reason="pathlib only available in stdlib since 3.4") def test_read_file_passed_as_pathlib_path(): """ Test reading a file when using a pathlib Path object @@ -764,6 +778,36 @@ def test_incomplete_segment_with_string_data(): assert len(channel) == 0 +def test_truncated_interleaved_data(): + """ + Test when a segment is truncated within a row of interleaved data, + and the next segment offset is set but is beyond the end of the file. + """ + test_file = GeneratedFile() + test_file.add_segment( + ("kTocMetaData", "kTocRawData", "kTocNewObjList", "kTocInterleavedData"), + segment_objects_metadata( + channel_metadata("/'group'/'channel1'", 3, 4), + channel_metadata("/'group'/'channel2'", 3, 4), + ), + "01 00 00 00" "02 00 00 00" + "03 00 00 00" "04 00 00 00" + "05 00 00 00" "06 00 00 00" + "07 00 00 00", + data_size_override=4 * 2 * 4 + ) + with test_file.get_tempfile() as temp_file: + with TdmsFile.open(temp_file.file) as tdms_file: + group = tdms_file['group'] + chan1 = group['channel1'] + chan2 = group['channel2'] + for chan in [chan1, chan2]: + chan_data = chan[:] + assert chan[-1] == chan_data[-1] + assert len(chan) == 3 + assert len(chan_data) == 3 + + def test_truncated_metadata_in_last_segment(): """ Test the scenario where writing the file was aborted with part of the metadata written """ diff --git a/nptdms/test/util.py b/nptdms/test/util.py index a2c4b6d..7703965 100644 --- a/nptdms/test/util.py +++ b/nptdms/test/util.py @@ -223,7 +223,9 @@ class GeneratedFile(object): def __init__(self): self._content = [] - def add_segment(self, toc, metadata, data, incomplete=False, binary_data=False, version=4713): + def add_segment( + self, toc, metadata, data, incomplete=False, binary_data=False, version=4713, + data_size_override=None): metadata_bytes = _hex_to_bytes(metadata) data_bytes = data if binary_data else _hex_to_bytes(data) if toc is not None: @@ -246,7 +248,8 @@ def add_segment(self, toc, metadata, data, incomplete=False, binary_data=False, raise ValueError("Unrecognised TOC value: %s" % toc_item) lead_in += struct.pack('