Skip to content

Commit

Permalink
Speeds up the TSVFile class. When running smaller sample sizes, just reading in the large TSV files is the bottleneck.
Browse files Browse the repository at this point in the history
  • Loading branch information
shorowit committed Jul 13, 2024
1 parent af08e72 commit 56db58d
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 40 deletions.
42 changes: 11 additions & 31 deletions resources/buildstock.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,10 @@ def get_file_data()
option_key = 'Option='
dep_key = 'Dependency='

full_header = nil
rows = []
CSV.foreach(@full_path, col_sep: "\t") do |row|
next if row[0].start_with? "\#"

row.delete_if { |x| x.nil? || (x.size == 0) } # purge trailing empty fields

# Store one header line
if full_header.nil?
full_header = row
next
end

rows << row
end
rows = File.readlines(@full_path).map { |row| row.split("\t") } # don't use CSV class for faster processing of large files
full_header = rows.shift
rows.delete_if { |row| row[0].start_with? "\#" }
rows.map { |row| row.delete_if { |x| x.to_s.empty? } } # purge trailing empty fields

if full_header.nil?
register_error("Could not find header row in #{@filename}.", @runner)
Expand Down Expand Up @@ -65,11 +54,9 @@ def get_file_data()
dependency_cols.each do |dependency, col|
dependency_options[dependency] = []
rows.each do |row|
next if row[0].start_with? "\#"
next if dependency_options[dependency].include? row[col]

dependency_options[dependency] << row[col]
end
dependency_options[dependency].uniq!
end

return rows, option_cols, dependency_cols, dependency_options, full_header, header
Expand All @@ -79,24 +66,20 @@ def cache_data
# Caches data for faster tsv lookups
rows_keys_s = {}
@rows.each_with_index do |row, rownum|
next if row[0].start_with? "\#"

row_key_values = {}
@dependency_cols.keys.each do |dep|
row_key_values[dep] = row[@dependency_cols[dep]]
@dependency_cols.each do |dep, col|
row_key_values[dep] = row[col]
end
key_s = hash_to_string(row_key_values)
key_s_downcase = key_s.downcase

if not rows_keys_s[key_s_downcase].nil?
if not rows_keys_s[row_key_values].nil?
if key_s.size > 0
register_error("Multiple rows found in #{@filename} with dependencies: #{key_s}.", @runner)
else
register_error("Multiple rows found in #{@filename}.", @runner)
end
end

rows_keys_s[key_s_downcase] = rownum
rows_keys_s[row_key_values] = rownum
end
return rows_keys_s
end
Expand All @@ -111,13 +94,10 @@ def get_option_name_from_sample_number(sample_value, dependency_values)
dependency_values = {}
end

key_s = hash_to_string(dependency_values)
key_s_downcase = key_s.downcase

rownum = @rows_keys_s[key_s_downcase]
rownum = @rows_keys_s[dependency_values]
if rownum.nil?
if key_s.size > 0
register_error("Could not determine appropriate option in #{@filename} for sample value #{sample_value} with dependencies: #{key_s}.", @runner)
register_error("Could not determine appropriate option in #{@filename} for sample value #{sample_value} with dependencies: #{hash_to_string(dependency_values)}.", @runner)
else
register_error("Could not determine appropriate option in #{@filename} for sample value #{sample_value}.", @runner)
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -407,14 +407,11 @@ def run_measure(model, measure, argument_map, runner)
end

# Serializes a hash into one delimited string; with the default arguments
# {'a' => 1, 'b' => 2} becomes "a=1,b=2".
#
# @param hash [Hash] key/value pairs to serialize, emitted in iteration order
# @param delim [String] text placed between each key and its value
# @param separator [String] text placed between consecutive key/value pairs
# @return [String] the joined pairs, or an empty string when hash is empty
def hash_to_string(hash, delim = '=', separator = ',')
  # Collect each "key<delim>value" pair and join once. This avoids repeated
  # string reallocation and the trailing-separator trim, which could also
  # strip newline characters belonging to the last value.
  vals = hash.map { |k, v| "#{k}#{delim}#{v}" }
  return vals.join(separator.to_s)
end

def register_error(msg, runner = nil)
Expand Down
4 changes: 1 addition & 3 deletions resources/run_sampling_lib.rb
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,7 @@ def get_tsvrow_with_dependency_values(tsvfile, dep_hash)
return tsvfile.rows[0]
end

key_s = hash_to_string(dep_hash)
key_s_downcase = key_s.downcase
rownum = tsvfile.rows_keys_s[key_s_downcase]
rownum = tsvfile.rows_keys_s[dep_hash]

if rownum.nil?
register_error("Could not find row in #{tsvfile.filename} with dependency values: #{dep_hash}.", nil)
Expand Down

0 comments on commit 56db58d

Please sign in to comment.