Skip to content
This repository has been archived by the owner on Dec 13, 2024. It is now read-only.

Commit

Permalink
Fix bug with search inconsistencies caused by iterator chaining
Browse files Browse the repository at this point in the history
The iterator chaining was not implemented correctly. A correctly
implemented solution with iterator chaining does not yield
significant memory benefits. Therefore, an iterator is now used only
where it is actually beneficial. Otherwise, the lists are copied
for simplicity.
  • Loading branch information
ra1nb0rn committed Nov 29, 2023
1 parent eb3f42c commit 0fffd2b
Showing 1 changed file with 7 additions and 7 deletions.
14 changes: 7 additions & 7 deletions cpe_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import argparse
from collections import Counter
from itertools import chain
import math
import os
import pprint
Expand Down Expand Up @@ -420,34 +419,35 @@ def words_in_line(words, line):
continue

cpe_entry_ids = cpe_entry_ids[0][0].split(',')
all_cpe_entry_ids.append(cpe_entry_ids[0])
all_cpe_entry_ids.append(int(cpe_entry_ids[0]))

for eid in cpe_entry_ids[1:]:
if '-' in eid:
eid = eid.split('-')
all_cpe_entry_ids += list(range(int(eid[0]), int(eid[1])+1))
else:
all_cpe_entry_ids.append(eid)
all_cpe_entry_ids.append(int(eid))

# iterate over all retrieved CPE infos and find best matching CPEs for queries
iterator = []
max_results_per_query = 250000
remaining = len(all_cpe_entry_ids)
is_one_iter_enough = remaining <= max_results_per_query
while remaining > 0:
if remaining > max_results_per_query:
count_params_in_str = max_results_per_query
else:
count_params_in_str = remaining
param_in_str = ('?,' * count_params_in_str)[:-1]
if keep_data_in_memory:
if keep_data_in_memory or not is_one_iter_enough:
db_query = 'SELECT cpe, term_frequencies, abs_term_frequency FROM cpe_entries WHERE entry_id IN (%s)' % param_in_str
cpe_infos = db_cursor.execute(db_query, all_cpe_entry_ids[remaining-count_params_in_str:remaining]).fetchall()
relevant_cpe_infos = cpe_infos
iterator = chain(iterator, relevant_cpe_infos)
iterator += cpe_infos
else:
db_query = 'SELECT cpe, term_frequencies, abs_term_frequency FROM cpe_entries WHERE entry_id IN (%s)' % param_in_str
db_cursor.execute(db_query, all_cpe_entry_ids[remaining-count_params_in_str:remaining])
iterator = chain(iterator, db_cursor)
iterator = db_cursor

remaining -= max_results_per_query

for cpe_info in iterator:
Expand Down

0 comments on commit 0fffd2b

Please sign in to comment.