Skip to content

Commit 9c0bb3e

Browse files
Merge pull request #64 from francois-drielsma/develop
Truth building + LArCV scripts
2 parents 927f656 + 67bc5fb commit 9c0bb3e

8 files changed

+536
-134
lines changed

bin/larcv_find_duplicates.py

+109
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
#!/usr/bin/env python3
2+
"""Finds duplicated files."""
3+
4+
import argparse
5+
6+
import numpy as np
7+
from tqdm import tqdm
8+
from ROOT import TFile # pylint: disable=E0611
9+
from larcv import larcv # pylint: disable=W0611
10+
11+
12+
def _read_file_signature(file_path, tree_name):
    """Reads the entry count and the first-entry event ID of a single file.

    Parameters
    ----------
    file_path : str
        Path to the ROOT file to inspect
    tree_name : str, optional
        Name of the reference tree, without its `_tree` suffix. If `None`,
        the first key listed in the file is used.

    Returns
    -------
    Tuple[int, int, int, int]
        Number of entries in the reference tree, followed by the run, subrun
        and event numbers of its first entry
    """
    root_file = TFile(file_path, 'r')
    try:
        # Pick the tree used as a reference and derive its branch name
        if tree_name is None:
            key = [k.GetName() for k in root_file.GetListOfKeys()][0]
        else:
            key = f'{tree_name}_tree'
        branch_key = key.replace('_tree', '_branch')

        # Count the entries and load the first one
        tree = getattr(root_file, key)
        num_entries = tree.GetEntries()
        tree.GetEntry(0)
        branch = getattr(tree, branch_key)

        # Convert to plain integers before the file is closed
        return (int(num_entries), int(branch.run()),
                int(branch.subrun()), int(branch.event()))
    finally:
        # Always release the file handle, even if the file is malformed
        root_file.Close()


def main(source, source_list, output, tree_name):
    """Loops over a list of files and identifies files which contain the same
    set of (run, subrun, event) triplets.

    In order to save time, this script only checks if:
    1. The number of entries in the files are the same
    2. The run, subrun and event numbers in the first entry are the same

    Within each group of matching files, the first file (in input order) is
    kept and every subsequent one is reported as a duplicate.

    Parameters
    ----------
    source : Union[str, List[str]]
        Path or list of paths to the input files
    source_list : str
        Path to a text file containing a list of data file paths (one per
        line). If provided, it takes precedence over `source`.
    output : str
        Path to the output text file with the list of duplicates
    tree_name : str
        Name of the tree to use as a reference to count the number of entries
        (the `_tree` suffix is appended automatically). If not specified,
        takes the first tree in the list.
    """
    # If using source list, read it in (blank lines are not file paths)
    if source_list is not None:
        with open(source_list, 'r', encoding='utf-8') as list_file:
            source = [line.strip() for line in list_file if line.strip()]

    # Initialize the output text file; it is closed even if a file fails
    with open(output, 'w', encoding='utf-8') as out_file:
        # Loop over the list of files in the input
        print(f"\nGathering information from {len(source)} files:")
        values = np.empty((len(source), 4), dtype=int)
        for idx, file_path in enumerate(tqdm(source)):
            values[idx] = _read_file_signature(file_path, tree_name)

        # Group the files which share the same signature
        print(f"\nChecking for duplicates among {len(source)} files:")
        _, inverse, counts = np.unique(
            values, axis=0, return_inverse=True, return_counts=True)

        # Loop over the signatures shared by more than one file
        duplicate_files = []
        for idx in tqdm(np.where(counts > 1)[0]):
            # Indexes of the files which belong to this class of duplicates
            index = np.where(inverse == idx)[0]

            # All the files which are not the first in this class are
            # duplicates
            for i in index[1:]:
                file_path = source[i]
                duplicate_files.append(file_path)
                out_file.write(f'{file_path}\n')
                tqdm.write(f"- Duplicate file: {file_path}")

        print(f"\nFound {len(duplicate_files)} duplicate files.")
84+
85+
86+
if __name__ == "__main__":
    # Parse the command-line arguments
    parser = argparse.ArgumentParser(
        description="Find duplicated files in a dataset")

    # The input is either a list of paths or a text file listing them
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--source', '-s',
                       help='Path or list of paths to data files',
                       type=str, nargs="+")
    group.add_argument('--source-list', '-S',
                       help='Path to a text file of data file paths',
                       type=str)

    parser.add_argument('--output', '-o',
                        help='Path to the output text file with the duplicate list',
                        type=str, required=True)

    # Both spellings are accepted: the hyphenated one matches the other
    # options, the underscored one is kept for backward compatibility
    parser.add_argument('--tree-name', '--tree_name',
                        help='TTree name used to count the entries.',
                        type=str, dest='tree_name')

    args = parser.parse_args()

    # Execute the main function
    main(args.source, args.source_list, args.output, args.tree_name)

bin/larcv_find_run.py

+94
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
#!/usr/bin/env python3
2+
"""Builds a list of file which make a data run."""
3+
4+
import argparse
5+
6+
from tqdm import tqdm
7+
from ROOT import TFile # pylint: disable=E0611
8+
from larcv import larcv # pylint: disable=W0611
9+
10+
11+
def main(source, source_list, output, run_number, tree_name):
    """Loops over a list of files and finds those which belong to a certain run.

    Only the first entry of each file is inspected: a file is selected if
    the run number of that entry matches the requested one.

    Parameters
    ----------
    source : Union[str, List[str]]
        Path or list of paths to the input files
    source_list : str
        Path to a text file containing a list of data file paths (one per
        line). If provided, it takes precedence over `source`.
    output : str
        Path to the output text file with the list of run files
    run_number : int
        Run number to look for
    tree_name : str
        Name of the tree to use as a reference to get the run number from
        (the `_tree` suffix is appended automatically). If not specified,
        takes the first tree in the list.
    """
    # If using source list, read it in (blank lines are not file paths)
    if source_list is not None:
        with open(source_list, 'r', encoding='utf-8') as list_file:
            source = [line.strip() for line in list_file if line.strip()]

    # Initialize the output text file; it is closed even if a file fails
    with open(output, 'w', encoding='utf-8') as out_file:
        # Loop over the list of files in the input
        print(f"\nLooking for run {run_number} in {len(source)} files:")
        run_files = []
        for file_path in tqdm(source):
            root_file = TFile(file_path, 'r')
            try:
                # Get the tree to read the run number from
                if tree_name is None:
                    key = [k.GetName()
                           for k in root_file.GetListOfKeys()][0]
                else:
                    key = f'{tree_name}_tree'
                branch_key = key.replace('_tree', '_branch')

                # Check the run number of the first entry in the file
                tree = getattr(root_file, key)
                tree.GetEntry(0)
                run = getattr(tree, branch_key).run()
            finally:
                # Always release the file handle, even on a malformed file
                root_file.Close()

            # If the file contains entries from the correct run, append
            if run == run_number:
                tqdm.write(f"- Good file: {file_path}")
                run_files.append(file_path)
                out_file.write(f'{file_path}\n')

        print(f"\nFound {len(run_files)} run {run_number} files.")
64+
65+
66+
if __name__ == "__main__":
    # Parse the command-line arguments
    parser = argparse.ArgumentParser(
        description="Find the files which belong to a data run")

    # The input is either a list of paths or a text file listing them
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--source', '-s',
                       help='Path or list of paths to data files',
                       type=str, nargs="+")
    group.add_argument('--source-list', '-S',
                       help='Path to a text file of data file paths',
                       type=str)

    parser.add_argument('--output', '-o',
                        help='Path to the output text file with the run file list',
                        type=str, required=True)

    parser.add_argument('--run-number',
                        help='Run number to look for',
                        type=int, required=True)

    # Both spellings are accepted: the hyphenated one matches the other
    # options, the underscored one is kept for backward compatibility
    parser.add_argument('--tree-name', '--tree_name',
                        help='TTree name used to get the run number from.',
                        type=str, dest='tree_name')

    args = parser.parse_args()

    # Execute the main function
    main(args.source, args.source_list, args.output, args.run_number,
         args.tree_name)

bin/larcv_inject_run_number.py

+171
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
"""Script which injects a run number in every event of every tree in a file or
2+
a list of files.
3+
"""
4+
5+
import os
6+
import argparse
7+
import tempfile
8+
9+
import numpy as np
10+
from tqdm import tqdm
11+
from ROOT import TFile # pylint: disable=E0611
12+
from larcv import larcv # pylint: disable=W0611
13+
14+
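# LArCV IOManager configuration template. The INPUT_PATH and OUTPUT_PATH
# placeholders are replaced with actual file paths before it is used.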
15+
CFG = """
16+
IOManager: {
17+
Verbosity : 4
18+
Name : "OutIO"
19+
IOMode : 2
20+
InputFiles : [INPUT_PATH]
21+
OutFileName : OUTPUT_PATH
22+
}
23+
"""
24+
25+
26+
def initialize_manager(file_path, dest, overwrite, suffix):
    """Initialize an IOManager object which reads one file and writes another.

    The output path is built from the input path as follows:
    - If `dest` is provided, the file is placed in that directory;
    - If `suffix` is provided, `_{suffix}` is inserted before the extension;
    - Otherwise, if `overwrite` is `True`, `_tmp` is inserted before the
      extension (the caller is expected to move the file back afterwards).

    Parameters
    ----------
    file_path : str
        Path to the input file
    dest : str, optional
        Destination folder to write the output file to
    overwrite : bool
        If `True`, the output is meant to replace the input file
    suffix : str, optional
        String to insert before the extension of the output file name

    Returns
    -------
    larcv.IOManager
        Initialized IOManager object
    str
        Path to the output file

    Raises
    ------
    ValueError
        If a suffix is provided along with `overwrite`, or if the output path
        resolves to the input path
    """
    # Validate with an exception rather than an assert, which is stripped
    # when python runs with -O
    if suffix is not None and overwrite:
        raise ValueError(
            "No point in providing a suffix if the original file is "
            "overwritten.")

    # If the destination is provided, direct the output file there
    out_path = file_path
    if dest is not None:
        out_path = os.path.join(dest, os.path.basename(file_path))

    # Insert the tag before the extension only; a plain string replacement
    # would also alter any directory name which contains '.root'
    tag = suffix
    if tag is None and overwrite:
        tag = 'tmp'
    if tag is not None:
        stem, ext = os.path.splitext(out_path)
        out_path = f'{stem}_{tag}{ext}'

    # Check that the output file is not the same as the original file.
    # Compare absolute paths so that 'a.root' and './a.root' are caught.
    if os.path.abspath(file_path) == os.path.abspath(out_path):
        raise ValueError(
            "The input file name and the output file name are the same. "
            "This is not allowed by the LArCV IOManager.")

    # Update the configuration with the input/output file names
    cfg = CFG.replace('INPUT_PATH', file_path)
    cfg = cfg.replace('OUTPUT_PATH', out_path)

    # Write the configuration to a temporary text file, which only needs to
    # exist for as long as it takes to set up the manager
    with tempfile.NamedTemporaryFile('w', encoding='utf-8') as tmp:
        tmp.write(cfg)
        tmp.flush()

        # Initialize the IOManager
        manager = larcv.IOManager(tmp.name)
        manager.initialize()

    return manager, out_path
74+
75+
76+
def main(source, source_list, dest, overwrite, run_number, suffix):
    """Injects a run number in every event of a list of LArCV files.

    The script loops over the input files and, for each entry in each file,
    sets the run number to the requested value. The subrun number is set
    to 0 and the event number to the entry index plus one.

    .. code-block:: bash

        $ python3 bin/larcv_inject_run_number.py -S file_list.txt
          --overwrite --run-number 123

    Parameters
    ----------
    source : List[str]
        List of paths to the input files
    source_list : str
        Path to a text file containing a list of data file paths (one per
        line). If provided, it takes precedence over `source`.
    dest : str
        Destination folder to write the files to
    overwrite : bool
        If `True`, overwrite the original files
    run_number : int
        Run number to inject in the input file list. If it is negative
        (e.g. -1), each file is assigned its index in the file list as a
        run number.
    suffix : str
        String to append to the end of the input file names to form the name
        of the output file with the updated run numbers
    """
    # If using source list, read it in (blank lines are not file paths)
    if source_list is not None:
        with open(source_list, 'r', encoding='utf-8') as list_file:
            source = [line.strip() for line in list_file if line.strip()]

    # Loop over the list of files in the input
    print("\nUpdating the run numbers of input files.")
    for idx, file_path in enumerate(tqdm(source)):
        # Initialize the input/output processes
        manager, out_path = initialize_manager(
            file_path, dest, overwrite, suffix)

        # Loop over entries, set the run number for every data product
        run = run_number if run_number > -1 else idx
        for entry in range(manager.get_n_entries()):
            # Read existing content
            manager.read_entry(entry)

            # Update the event ID (run, subrun, event)
            manager.set_id(run, 0, entry + 1)

            # Save
            manager.save_entry()

        # Finalize
        manager.finalize()

        # If needed, move the output file to where the input file is.
        # os.replace overwrites an existing destination on every platform.
        if overwrite:
            os.replace(out_path, file_path)
137+
138+
139+
if __name__ == "__main__":
    # Parse the command-line arguments
    parser = argparse.ArgumentParser(
        description="Inject a run number in LArCV files")

    # The input is either a list of paths or a text file listing them
    source_group = parser.add_mutually_exclusive_group(required=True)
    source_group.add_argument('--source', '-s',
                              help='Path or list of paths to data files',
                              type=str, nargs="+")
    source_group.add_argument('--source-list', '-S',
                              help='Path to a text file of data file paths',
                              type=str)

    # The output either goes to a new folder or replaces the input
    output_group = parser.add_mutually_exclusive_group(required=True)
    output_group.add_argument('--dest',
                              help='Destination folder for the output file',
                              type=str)
    output_group.add_argument('--overwrite',
                              help='Overwrite the input file with the output file',
                              action='store_true')

    parser.add_argument('--run-number',
                        help='Run number to assign to every input file',
                        type=int, required=True)

    parser.add_argument('--suffix',
                        help='Suffix to append to the input file names',
                        type=str)

    args = parser.parse_args()

    # Fail before touching any file: a suffix is meaningless when the output
    # replaces the input
    if args.overwrite and args.suffix is not None:
        parser.error(
            "--suffix cannot be used together with --overwrite")

    # Execute the main function
    main(args.source, args.source_list, args.dest, args.overwrite,
         args.run_number, args.suffix)

0 commit comments

Comments
 (0)