
Commit 7d4e0b9

Improve read performance when reading data subsets with a large number of channels (#342)
1 parent: 88ea672
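For context: the slow path this commit targets is reading a subset of one channel's data from a file with many channels. Per chunk, the old code recomputed the segment chunk size, rebuilt the list of data objects, and seeked past every other channel's data one object at a time; the diff below caches those per-segment values and seeks directly to chunk boundaries. A minimal sketch of the access pattern this speeds up, using the npTDMS streaming API (the file, group, and channel names here are hypothetical):

    from nptdms import TdmsFile

    # Stream from disk rather than loading the whole file, then read
    # only a slice of one channel out of many.
    with TdmsFile.open("data.tdms") as tdms_file:
        channel = tdms_file["group"]["channel_0"]
        subset = channel[1000:2000]  # reads only the chunks covering this range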

File tree: 2 files changed (+51, −22)

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -13,6 +13,7 @@ coverage.xml
 .ipynb_checkpoints
 .vscode
 *.ipynb
+.idea

 # Wercker directories
 _builds

nptdms/tdms_segment.py

Lines changed: 50 additions & 22 deletions
@@ -45,6 +45,9 @@ class TdmsSegment(object):
         'final_chunk_lengths_override',
         'object_index',
         'segment_incomplete',
+        'has_daqmx_objects_cached',
+        'chunk_size_cached',
+        'data_objects_cached',
     ]

     def __init__(self, position, toc_mask, next_segment_pos, data_position, segment_incomplete):
@@ -57,6 +60,9 @@ def __init__(self, position, toc_mask, next_segment_pos, data_position, segment_
         self.ordered_objects = None
         self.object_index = None
         self.segment_incomplete = segment_incomplete
+        self.has_daqmx_objects_cached = None
+        self.chunk_size_cached = None
+        self.data_objects_cached = None

     def __repr__(self):
         return "<TdmsSegment at position %d>" % self.position
@@ -135,6 +141,7 @@ def read_segment_objects(self, file, previous_segment_objects, index_cache, prev

         if index_cache is not None:
             self.object_index = index_cache.get_index(self.ordered_objects)
+
         self._calculate_chunks()
         return properties

@@ -261,18 +268,18 @@ def read_raw_data_for_channel(self, f, channel_path, chunk_offset=0, num_chunks=

         f.seek(self.data_position)

-        data_objects = [o for o in self.ordered_objects if o.has_data]
         chunk_size = self._get_chunk_size()

         # Ensure we're working with Python ints as np.int32 values could overflow
         # (https://github.com/adamreeve/npTDMS/issues/338)
-        chunk_size = int(chunk_size)
         chunk_offset = int(chunk_offset)

         if chunk_offset > 0:
             f.seek(chunk_size * chunk_offset, os.SEEK_CUR)
         stop_chunk = self.num_chunks if num_chunks is None else num_chunks + chunk_offset
-        for chunk in self._read_channel_data_chunks(f, data_objects, channel_path, chunk_offset, stop_chunk):
+        for chunk in self._read_channel_data_chunks(
+                f, self._get_data_objects(), channel_path, chunk_offset, stop_chunk, chunk_size
+        ):
             yield chunk

     def _calculate_chunks(self):
@@ -351,11 +358,15 @@ def _new_segment_object(self, object_path, raw_data_index_header):
         return TdmsSegmentObject(object_path)

     def _get_chunk_size(self):
+        if self.chunk_size_cached is not None:
+            return self.chunk_size_cached
+
         if self._have_daqmx_objects():
-            return get_daqmx_chunk_size(self.ordered_objects)
-        return sum(
-            o.data_size
-            for o in self.ordered_objects if o.has_data)
+            self.chunk_size_cached = int(get_daqmx_chunk_size(self.ordered_objects))
+            return self.chunk_size_cached
+
+        self.chunk_size_cached = int(sum(o.data_size for o in self.ordered_objects if o.has_data))
+        return self.chunk_size_cached

     def _read_data_chunks(self, file, data_objects, num_chunks):
         """ Read multiple data chunks at once
@@ -365,13 +376,17 @@ def _read_data_chunks(self, file, data_objects, num_chunks):
         for chunk in reader.read_data_chunks(file, data_objects, num_chunks):
             yield chunk

-    def _read_channel_data_chunks(self, file, data_objects, channel_path, chunk_offset, stop_chunk):
+    def _read_channel_data_chunks(self, file, data_objects, channel_path, chunk_offset, stop_chunk, chunk_size):
         """ Read multiple data chunks for a single channel at once
         In the base case we read each chunk individually but subclasses can override this
         """
         reader = self._get_data_reader()
-        for chunk in reader.read_channel_data_chunks(file, data_objects, channel_path, chunk_offset, stop_chunk):
+        initial_position = file.tell()
+        for i, chunk in enumerate(reader.read_channel_data_chunks(
+                file, data_objects, channel_path, chunk_offset, stop_chunk
+        )):
             yield chunk
+            file.seek(initial_position + (i + 1) * chunk_size)

     def _get_data_reader(self):
         endianness = '>' if (self.toc_mask & toc_properties['kTocBigEndian']) else '<'
@@ -383,6 +398,9 @@ def _get_data_reader(self):
         return ContiguousDataReader(self.num_chunks, self.final_chunk_lengths_override, endianness)

     def _have_daqmx_objects(self):
+        if self.has_daqmx_objects_cached is not None:
+            return self.has_daqmx_objects_cached
+
         data_obj_count = 0
         daqmx_count = 0
         for o in self.ordered_objects:
@@ -391,12 +409,12 @@
             if isinstance(o, DaqmxSegmentObject):
                 daqmx_count += 1
         if daqmx_count == 0:
-            return False
-        if daqmx_count == data_obj_count:
-            return True
-        if daqmx_count > 0:
+            self.has_daqmx_objects_cached = False
+        elif daqmx_count == data_obj_count:
+            self.has_daqmx_objects_cached = True
+        elif daqmx_count > 0:
             raise Exception("Cannot read mixed DAQmx and non-DAQmx data")
-        return False
+        return self.has_daqmx_objects_cached

     def _have_interleaved_data(self):
         """ Whether data in this segment is interleaved. Assumes data is not DAQmx.
@@ -420,6 +438,13 @@ def _have_interleaved_data(self):
         else:
             raise ValueError("Cannot read interleaved segment containing channels with unsized types")

+    def _get_data_objects(self):
+        if self.data_objects_cached is not None:
+            return self.data_objects_cached
+
+        self.data_objects_cached = [o for o in self.ordered_objects if o.has_data]
+        return self.data_objects_cached
+

 class InterleavedDataReader(BaseDataReader):
     """ Reads data in a TDMS segment with interleaved data
@@ -492,24 +517,27 @@ def _read_channel_data_chunk(self, file, data_objects, chunk_index, channel_path
         """ Read data from a chunk for a single channel
         """
         channel_data = RawChannelDataChunk.empty()
+        current_position = file.tell()
         for obj in data_objects:
             number_values = self._get_channel_number_values(obj, chunk_index)
             if obj.path == channel_path:
+                file.seek(current_position)
                 channel_data = RawChannelDataChunk.channel_data(obj.read_values(file, number_values, self.endianness))
+                current_position = file.tell()
+                break
             elif number_values == obj.number_values:
                 # Seek over data for other channel data
-                file.seek(obj.data_size, os.SEEK_CUR)
-            else:
+                current_position += obj.data_size
+            elif obj.data_type.size is not None:
                 # In last chunk with reduced chunk size
-                if obj.data_type.size is None:
-                    # Type is unsized (eg. string), try reading number of values
-                    obj.read_values(file, number_values, self.endianness)
-                else:
-                    file.seek(obj.data_type.size * number_values, os.SEEK_CUR)
+                current_position += obj.data_type.size * number_values
+            else:
+                raise Exception("Cannot skip over channel with unsized type in a truncated segment")
+
         return channel_data

     def _get_channel_number_values(self, obj, chunk_index):
-        if chunk_index == (self.num_chunks - 1) and self.final_chunk_lengths_override is not None:
+        if self.final_chunk_lengths_override is not None and chunk_index == (self.num_chunks - 1):
             return self.final_chunk_lengths_override.get(obj.path, 0)
         else:
             return obj.number_values
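The cached-value changes above are plain memoization; the chunk-reading change replaces one seek per channel per chunk with a single seek to each chunk boundary. A standalone sketch of that seek pattern under simplified assumptions (fixed-size chunks, contiguous non-interleaved channel layout; the helper below is illustrative, not npTDMS API):

    import io

    def read_channel_from_chunks(f, num_chunks, chunk_size, channel_offset, channel_nbytes):
        # Compute each chunk's start from the segment start position and
        # seek there directly, instead of seeking past every other
        # channel's data one object at a time.
        initial_position = f.tell()
        values = []
        for i in range(num_chunks):
            # One seek per chunk, independent of the number of channels
            f.seek(initial_position + i * chunk_size + channel_offset)
            values.append(f.read(channel_nbytes))
        return values

    # Toy segment: 3 chunks of 8 bytes, each holding 4 channels of 2 bytes;
    # channel_offset=2 selects the second channel.
    data = io.BytesIO(bytes(range(24)))
    print(read_channel_from_chunks(data, num_chunks=3, chunk_size=8,
                                   channel_offset=2, channel_nbytes=2))

With this layout the cost of skipping unwanted data no longer grows with the number of channels, which is why the commit helps most when reading subsets from files with many channels.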
