forked from hartogss/cxx-langstat
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathRunner.cpp
339 lines (309 loc) · 14.6 KB
/
Runner.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
#include <iostream>
#include <future>
#include <filesystem>
#include "llvm/Support/CommandLine.h"
#include "cxx-langstat/Options.h"
#include "cxx-langstat/Driver.h"
using StringRef = llvm::StringRef;
using namespace clang::tooling;
using namespace llvm;
namespace fs = std::filesystem;
//-----------------------------------------------------------------------------
// Global variables
// Option categories. Every option declared below attaches itself to one of
// these two via llvm::cl::cat(...): CXXLangstatCategory for options specific
// to cxx-langstat, IOCategory for the input/output path options.
llvm::cl::OptionCategory CXXLangstatCategory("cxx-langstat options", "");
llvm::cl::OptionCategory IOCategory("cxx-langstat i/o options", "");
// Extra help text, currently disabled:
// llvm::cl::extrahelp CommonHelp(CommonOptionsParser::HelpMessage);
// llvm::cl::extrahelp MoreHelp("\nMore help text coming soon...\n");
// CL options
// --analyses: accepts a comma-separated string of analyses. main() forwards
// the string as-is to ParallelEmitFeatures/CXXLangstatMain; it is not split
// or validated in this file.
llvm::cl::opt<std::string> AnalysesOption(
"analyses",
llvm::cl::desc("Comma-separated list of analyses"),
llvm::cl::cat(CXXLangstatCategory));
// Pipeline stage selector. The option is declared without a name of its own;
// the two enumerators listed in cl::values are what the user passes:
// --emit-features: analysis stops after writing features to file after reading in .ast
// --emit-statistics: read in JSON with features and compute statistics
// FIXME: if none of the above flags are given, emit features and statistics
// in one go.
// NOTE(review): no llvm::cl::init is given, so the value when neither flag is
// passed depends on how Stage is declared — confirm against Options.h.
llvm::cl::opt<Stage> PipelineStage(
llvm::cl::desc("Stage: "),
llvm::cl::values(
clEnumValN(emit_features, "emit-features",
"Reads in C/C++ source or AST files and\n "
"stops after emitting features found and\n "
"stores them in a JSON file. For each \n "
"input file, a file containing the output\n "
"will be created."),
clEnumValN(emit_statistics, "emit-statistics",
"Read in JSON files generated by -emit-features\n "
"and outputs a single JSON file containing\n "
"statistics.")),
llvm::cl::cat(CXXLangstatCategory));
// --in: explicit input file(s). May be given zero or more times
// (cl::ZeroOrMore) and each occurrence needs a value (cl::ValueRequired).
// main() rejects the combination of --in and --indir.
llvm::cl::list<std::string> InputFilesOption(
"in",
llvm::cl::desc("<src0> [... <srcN>], where srcI can be either a C/C++ source "
"or AST file"),
llvm::cl::ValueRequired,
llvm::cl::ZeroOrMore,
llvm::cl::cat(IOCategory));
// --out: use when a single output is expected (a single --in with
// --emit-features, or --emit-statistics). main() rejects it when several
// outputs are expected; --outdir has to be used in that case.
llvm::cl::opt<std::string> OutputFileOption(
"out",
llvm::cl::desc("[<json>]. Use this option if you expect there "
"to be only a single \noutput file. "
"Notably, use this when -emit-statistics is enabled."),
llvm::cl::ValueRequired,
llvm::cl::cat(IOCategory));
// --indir: run the tool on a directory. The directory is searched recursively
// (see getFiles) for files whose extension suits the selected stage. main()
// appends a trailing "/" to the value if it is missing.
llvm::cl::opt<std::string> InputDirOption(
"indir",
llvm::cl::desc("<dir>. Use this option if you want to analyze multiple files."),
llvm::cl::ValueRequired,
llvm::cl::cat(IOCategory));
// --outdir: use when multiple output files are expected (--indir or multiple
// --in used). main() appends a trailing "/" to the value if it is missing and,
// together with --indir, recreates the input directory tree below it.
llvm::cl::opt<std::string> OutputDirOption(
"outdir",
llvm::cl::desc("<dir>. Use this option if you expect there to be multiple "
"output files, \ne.g. when -emit-features is used on multiple "
"files or a directory"),
llvm::cl::ValueRequired,
llvm::cl::cat(IOCategory));
// -p: build path of the directory containing the compilation database.
// The value is only forwarded to CXXLangstatMain; this file does not open the
// database itself.
// Open question from the original author: what to do with this? Some -p
// option is already there by default, but the parser fails on it.
llvm::cl::opt<std::string> BuildPath(
"p",
llvm::cl::desc("Build path to dir containing compilation database in JSON format."),
llvm::cl::Optional, llvm::cl::cat(CXXLangstatCategory));
// -parallel <n>: option to run --emit-features in parallel. Defaults to 1
// (cl::init). Will garble terminal output, since the jobs print concurrently.
// ParallelEmitFeatures clamps the value to the range [1, number of inputs].
llvm::cl::opt<unsigned> ParallelismOption(
"parallel",
llvm::cl::desc("Number of threads to use for -emit-features."),
llvm::cl::init(1),
llvm::cl::Optional,
llvm::cl::ValueRequired,
llvm::cl::cat(CXXLangstatCategory));
// -j: short alias that sets the same underlying option as -parallel.
llvm::cl::alias ParallelismOptionAlias(
"j",
llvm::cl::desc("Alias for -parallel"),
llvm::cl::aliasopt(ParallelismOption),
llvm::cl::NotHidden,
llvm::cl::cat(CXXLangstatCategory));
//-----------------------------------------------------------------------------
// Decides whether a file extension (leading dot included, e.g. ".ast") marks
// a file that the given pipeline stage reads as input.
//   emit_features   -> only ".ast"
//   emit_statistics -> only ".json"
// Any other stage value accepts nothing.
bool isSuitableExtension(llvm::StringRef s, Stage Stage){
    switch(Stage){
    case emit_features:
        // TODO: change back to the full list of source extensions:
        //   .cpp .cc .cxx .C          (C++ sources)
        //   .hpp .hh .hxx .H          (C++ headers)
        //   .c++ .h++
        //   .c .h                     (C file formats)
        //   .ast                      (AST file)
        return s == ".ast";
    case emit_statistics:
        return s == ".json";
    default:
        return false;
    }
}
// Walks Directory recursively and returns the paths of all regular files
// whose extension isSuitableExtension() accepts for the given stage.
// The paths are returned in the order the directory iterator yields them.
std::vector<std::string> getFiles(const std::string& Directory, Stage Stage){
    std::vector<std::string> Matches;
    for(const auto& Entry : fs::recursive_directory_iterator(Directory)){
        // Directories, sockets, etc. are never inputs.
        if(!Entry.is_regular_file())
            continue;
        std::string Path = Entry.path().string();
        if(!isSuitableExtension(llvm::sys::path::extension(Path), Stage))
            continue;
        Matches.emplace_back(std::move(Path));
    }
    return Matches;
}
// Recreates the directory tree found below `source` underneath `destination`.
// Only directories are created; no files are copied.
//
// - If `source` is missing or is not a directory, a message is printed and
//   nothing is created.
// - `destination` (and any missing parents) is created if necessary.
// - If `destination` lies inside `source`, its subtree is skipped. Otherwise
//   the walk would descend into the directories it has just created and keep
//   nesting copies of the tree inside itself.
//
// Filesystem errors other than the cases above propagate as
// std::filesystem::filesystem_error.
void copyDirectoryStructure(const fs::path& source, const fs::path& destination) {
    // is_directory() is already false for a non-existent path.
    if (!fs::is_directory(source)) {
        std::cout << "Source directory does not exist or is not a directory\n";
        return;
    }

    // No-op when the destination exists already.
    fs::create_directories(destination);

    for (auto It = fs::recursive_directory_iterator(source);
         It != fs::recursive_directory_iterator(); ++It) {
        if (!It->is_directory())
            continue;

        // Do not mirror the output tree into itself.
        std::error_code Ignored;
        if (fs::equivalent(It->path(), destination, Ignored)) {
            It.disable_recursion_pending();
            continue;
        }

        // Same position relative to the root, but below the destination.
        fs::create_directories(destination / fs::relative(It->path(), source));
    }
}
//-----------------------------------------------------------------------------
// Runs up to 'Jobs' parallel instances of CXXLangstatMain when -emit-features
// is enabled.
//
// InputFiles/OutputFiles: parallel lists; OutputFiles[i] is the output that
//     belongs to InputFiles[i]. Both must have the same length.
// stage, Analyses, BuildPath, db: forwarded unchanged to every
//     CXXLangstatMain call.
// Jobs: requested number of concurrent jobs. Clamped to
//     [1, InputFiles.size()].
//
// Returns the result of CXXLangstatMain when a single job is used. With
// several jobs, returns 0 if every job returned 0 and 1 otherwise (also when
// the two file lists differ in length).
int ParallelEmitFeatures(std::vector<std::string> InputFiles,
        std::vector<std::string> OutputFiles, Stage stage, std::string Analyses,
        std::string BuildPath,
        std::shared_ptr<clang::tooling::CompilationDatabase> db,
        unsigned Jobs){
    // The distribution loop below indexes both lists with the same index.
    if(InputFiles.size() != OutputFiles.size()){
        std::cout << "Number of input and output files differs" << std::endl;
        return 1;
    }

    // Never more jobs than files, but always at least one.
    Jobs = std::min(Jobs, static_cast<unsigned>(InputFiles.size()));
    Jobs = std::max(Jobs, 1U);
    if(Jobs == 1)
        return CXXLangstatMain(InputFiles, OutputFiles, stage,
            Analyses, BuildPath, db);

    // Deal the files out round-robin, so job k gets files k, k+Jobs, ...
    std::vector<std::vector<std::string>> futures_InputFiles(Jobs);
    std::vector<std::vector<std::string>> futures_OutputFiles(Jobs);
    for(unsigned idx = 0; idx < InputFiles.size(); idx++){
        futures_InputFiles[idx % Jobs].push_back(InputFiles[idx]);
        futures_OutputFiles[idx % Jobs].push_back(OutputFiles[idx]);
    }

    // Need to store futures from std::async in a vector, otherwise ~future
    // blocks parallelism.
    // https://stackoverflow.com/questions/36920579/how-to-use-stdasync-to-call-a-function-in-a-mutex-protected-loop
    std::vector<std::future<int>> futures;
    futures.reserve(Jobs);
    for(unsigned idx = 0; idx < Jobs; idx++){
        // Pass the plain Stage value, not the cl::opt object: the original
        // author found that handing the option itself to std::async failed.
        futures.emplace_back(std::async(std::launch::async, CXXLangstatMain,
            futures_InputFiles[idx], futures_OutputFiles[idx], stage,
            Analyses, BuildPath, db));
    }

    // Wait for every job, even after one has failed.
    bool AllSucceeded = true;
    for(auto& f : futures){
        if(f.get() != 0)
            AllSucceeded = false;
    }
    if(!AllSucceeded){
        std::cout << "Some futures failed" << std::endl;
        return 1;
    }
    return 0;
}
//-----------------------------------------------------------------------------
//
// Entry point of cxx-langstat.
//
// 1. Reads the compile flags given after "--" into a fixed compilation
//    database.
// 2. Parses the cxx-langstat command line options.
// 3. Works out the list of input files (from --in or --indir) and the
//    matching list of output files (from --out or --outdir).
// 4. Runs the requested stage: -emit-features (possibly in parallel) or
//    -emit-statistics.
//
// Returns 1 on invalid option combinations, otherwise the return code of the
// stage that was run.
int main(int argc, char** argv){
    // CommonOptionsParser is deliberately not used: the way it handles
    // input/source files does not suit this tool, so the options are rolled
    // by hand.

    // Is this good use of shared_ptr?
    std::shared_ptr<CompilationDatabase> db = nullptr;
    // First try to get string of cxxflags after '--', as loadFromCommandLine requires
    std::string ErrorMessage;
    db = FixedCompilationDatabase::loadFromCommandLine(argc, argv, ErrorMessage);
    if(db) {
        std::cout << "COMPILE COMMAND: ";
        for(const auto& cc : db->getCompileCommands("")){
            for(const auto& s : cc.CommandLine)
                std::cout << s << " ";
            std::cout << std::endl;
        }
    } else {
        std::cout << "Could not load compile command from command line \n" +
            ErrorMessage;
    }

    // Only now can call this method, otherwise compile command could be interpreted
    // as input or output file since those are positional
    // This usage is encouraged this way according to
    // https://clang.llvm.org/doxygen/classclang_1_1tooling_1_1FixedCompilationDatabase.html#a1443b7812e6ffb5ea499c0e880de75fc
    llvm::cl::ParseCommandLineOptions(argc, argv, "cxx-langstat is a clang-based "
        "tool for computing statistics on C/C++ code on the clang AST level");

    // Invalid option combinations are user errors, not programmer errors, so
    // they must be reported in release builds too (assert is compiled out
    // under NDEBUG).
    const auto UsageError = [](const char* Message){
        std::cerr << "error: " << Message << std::endl;
        return 1;
    };

    std::vector<std::string> InputFiles;
    std::vector<std::string> OutputFiles;
    const bool Files = !InputFilesOption.empty();
    const bool Dir = !InputDirOption.empty();
    if(Files && Dir)
        return UsageError("Don't specify input files and directory at the same time");
    if(!Files && !Dir)
        return UsageError("Please specify input files (--in) or an input dir (--indir)");

    // Ensure dirs end with "/"
    if(Dir && InputDirOption.getValue().back() != '/')
        InputDirOption += "/";
    if(!OutputDirOption.empty() && OutputDirOption.getValue().back() != '/')
        OutputDirOption += "/";

    // Walking a missing directory would throw out of getFiles().
    std::error_code DirCheckError;
    if(Dir && !fs::is_directory(InputDirOption.getValue(), DirCheckError))
        return UsageError("Input dir does not exist or is not a directory");

    // Outputs mirror the layout of the input dir, so the sub-directories
    // have to exist before any output file is written.
    if(Dir && !OutputDirOption.empty())
        copyDirectoryStructure(InputDirOption.getValue(), OutputDirOption.getValue());

    if(Files){
        InputFiles = InputFilesOption;
    } else {
        InputFiles = getFiles(InputDirOption, PipelineStage);
        // sort by file size
        std::sort(InputFiles.begin(), InputFiles.end(), [](const std::string& a, const std::string& b){
            return fs::file_size(a) < fs::file_size(b);
        });
    }

    // When multiple output files are a fact (multiple input files) or very
    // likely (input dir), require OutputDirOption instead of OutputFileOption to be used.
    // OutputFiles can only be used when it is guaranteed to be only a single output file.
    // emit-features creates one output per input.
    if(PipelineStage == emit_features){
        if(Files && InputFiles.size() == 1){ // single file
            if(!OutputDirOption.empty()) // may not specify output dir
                return UsageError("Don't specify an output dir for a single output");
            if(!OutputFileOption.empty()){ // place at output file specified
                OutputFiles.emplace_back(OutputFileOption);
            } else { // create output file if none specified
                StringRef filename = llvm::sys::path::filename(InputFiles[0]);
                OutputFiles.emplace_back(filename.str() + ".json");
            }
        } else { // multiple --in files, or an input dir
            if(!OutputFileOption.empty()) // may not specify output file
                return UsageError("Don't specify an output file when multiple outputs are expected");
            if(OutputDirOption.empty()) // obliged to specify output dir
                return UsageError("Please specify an output dir");
            if(!Dir){
                // Explicit files: all outputs go directly into the output dir.
                for(const auto& File : InputFiles){
                    StringRef filename = llvm::sys::path::filename(File);
                    OutputFiles.emplace_back(OutputDirOption + filename.str() + ".json");
                }
            } else {
                // Input dir: keep each file's path relative to the input dir.
                // TODO: copy structure if there are multiple -in flags
                for(const auto& File : InputFiles){
                    StringRef filepath = StringRef(File).drop_front(InputDirOption.size());
                    OutputFiles.emplace_back(OutputDirOption + filepath.str() + ".json");
                }
            }
        }
        assert(InputFiles.size() == OutputFiles.size());
    // When -emit-statistics option is used, only zero or one output file is ok.
    // Output dir is not ok, since the output is guaranteed to be only a single file.
    } else if(PipelineStage == emit_statistics){
        if(!OutputDirOption.empty())
            return UsageError("Don't specify an output dir for a single output");
        if(OutputFileOption.empty())
            OutputFiles.emplace_back("./stats.json");
        else
            OutputFiles.emplace_back(OutputFileOption);
        assert(OutputFiles.size() == 1);
    }

    // Now we know all input and output files
    std::cout << "input files(" << InputFiles.size() << "): ";
    for(const auto& InputFile : InputFiles){
        std::cout << InputFile << '\n';
        // A trailing "/" means a directory was passed where a file belongs.
        if(!InputFile.empty() && InputFile.back() == '/'){
            std::cout << "Specified input dir, quitting..." << std::endl;
            return 1;
        }
    }
    std::cout << std::endl;

    // Depending on what stage is requested, emit features in parallel
    // or emit statistics sequentially
    int return_code = 0;
    if(PipelineStage == emit_features){
        return_code = ParallelEmitFeatures(InputFiles, OutputFiles, PipelineStage,
            AnalysesOption, BuildPath, db, ParallelismOption);
    } else if(PipelineStage == emit_statistics){
        return_code = CXXLangstatMain(InputFiles, OutputFiles, PipelineStage,
            AnalysesOption, BuildPath, db);
    }
    std::cout << "cxx-langstat finished with return code " << return_code << "." << std::endl;
    return return_code;
}