From 92efe24581bc06fd5f72b39da6d9e383ae8977c1 Mon Sep 17 00:00:00 2001 From: ClimbsRocks Date: Wed, 7 Oct 2015 08:07:09 -0700 Subject: [PATCH] cleans up code and logs --- neuralNet/controllerNN.js | 1 - neuralNet/dataFormatting/readAndFormatData.js | 5 +---- neuralNet/utils/makeExtendedTrainingObj.js | 1 - ppLib.js | 10 ++++------ pySetup/controllerPython.js | 2 +- pySetup/makePredictions.py | 4 ---- pySetup/parameterMakers/rfParamMaker.py | 4 ++-- pySetup/processes.js | 11 ++++++++--- pySetup/training.py | 13 ++++--------- 9 files changed, 20 insertions(+), 31 deletions(-) diff --git a/neuralNet/controllerNN.js b/neuralNet/controllerNN.js index 8ef036b..774373f 100644 --- a/neuralNet/controllerNN.js +++ b/neuralNet/controllerNN.js @@ -23,7 +23,6 @@ module.exports = { }, startTraining: function() { utils.setGlobalVars(); - console.log('dataFile:',argv.dataFile); readAndFormatData(function() { // nn.dataSummary just got set by readAndFormatData, asynchronously; diff --git a/neuralNet/dataFormatting/readAndFormatData.js b/neuralNet/dataFormatting/readAndFormatData.js index e761311..33f7f5c 100644 --- a/neuralNet/dataFormatting/readAndFormatData.js +++ b/neuralNet/dataFormatting/readAndFormatData.js @@ -24,7 +24,6 @@ module.exports = function( callback) { // NOTE: your data must be formatted using UTF-8. 
If you're getting weird errors and you're not sure how to do that, check out this blog post: // TODO: add in info on how to make sure your data is formatted using UTF-8 var readStream = fs.createReadStream(path.join(global.rootDir, argv.dataFile), {encoding: 'utf8'}); - console.log('we have created the write and read streams to format our data') var tStream1 = formattingUtils.summarizeDataTransformStream(); @@ -36,7 +35,6 @@ module.exports = function( callback) { readStream.pipe(tStream1).pipe(writeStream1); writeStream1.on('finish', function() { - console.log('heard a finish event to writeSream'); // to deal with asynch issues, we are attaching the dataSummary object to tStream1 itself. // set the average property on each dataSummary key @@ -56,7 +54,6 @@ module.exports = function( callback) { writeStream2.on('finish', function() { - console.log('finished the second transform!'); for(var column in nn.dataSummary) { var columnObj = nn.dataSummary[column]; @@ -86,10 +83,10 @@ module.exports = function( callback) { readStream3.pipe(tStream3).pipe(writeStream3); writeStream3.on('finish', function() { - console.log('finished the third transform!'); var trainingTime = (Date.now() - t2Start) / 1000; console.log('third transformStream took:',trainingTime); + // delete the intermediate files we have created fs.unlink(path.join(nn.location,'/formattingData.txt')); fs.unlink(path.join(nn.location,'/formattingData2.txt')); if(argv.copyData) { diff --git a/neuralNet/utils/makeExtendedTrainingObj.js b/neuralNet/utils/makeExtendedTrainingObj.js index be4f654..d8f199d 100644 --- a/neuralNet/utils/makeExtendedTrainingObj.js +++ b/neuralNet/utils/makeExtendedTrainingObj.js @@ -4,7 +4,6 @@ var argv = global.argv; module.exports = function ( hlArray) { - console.log('nn.bestNetObj:',nn.bestNetObj); // NOTE: these are the max training time parameters we can set. we will use other processes to decide when to kill off the net. 
var trainingObj = { errorThresh: 0.0, // error threshold to reach diff --git a/ppLib.js b/ppLib.js index 7ad33df..63cc26d 100644 --- a/ppLib.js +++ b/ppLib.js @@ -1,9 +1,9 @@ -// global.neuralNetwork = {}; +global.neuralNetwork = {}; global.argv = require('minimist')(process.argv.slice(1)); var path = require('path'); global.rootDir = path.dirname(__filename); -// var controllerNN = require('./neuralNet/controllerNN.js'); +var controllerNN = require('./neuralNet/controllerNN.js'); var controllerPython = require('./pySetup/controllerPython.js'); var controllerEnsemble = require('./ensembling/controller.js'); var dataFile = process.argv[2]; @@ -37,17 +37,15 @@ if (argv.devEnsemble) { // Here is where we invoke the method with the path to the data // we pass in a callback function that will make the dataSummary a global variable // and invoke parallelNets once formatting the data is done. - // argv.numCPUs = argv.computerTotalCPUs/2; - // controllerNN.startTraining(); + controllerNN.startTraining(); // ********************************************************************************** - // argv.numCPUs = argv.computerTotalCPUs/2; controllerPython.startTraining(argv); controllerEnsemble.startListeners(2, argv); } var ppLibShutdown = function() { - // controllerNN.killAll(); + controllerNN.killAll(); controllerPython.killAll(); }; // kills off all the child processes if the parent process faces an uncaught exception and crashes. 
diff --git a/pySetup/controllerPython.js b/pySetup/controllerPython.js index 07e60d0..c983ee3 100644 --- a/pySetup/controllerPython.js +++ b/pySetup/controllerPython.js @@ -29,7 +29,7 @@ module.exports = { processes.kickOffForestTraining( function() { // TODO: add in next step in chain here module.exports.makePredictions(); - }); + }, 'clRandomForest'); }); // } diff --git a/pySetup/makePredictions.py b/pySetup/makePredictions.py index 9c29c7d..2e783e9 100644 --- a/pySetup/makePredictions.py +++ b/pySetup/makePredictions.py @@ -12,7 +12,6 @@ fileNames = json.loads(sys.argv[4]) classifierName = sys.argv[5] -obviousPrint('classifierName',classifierName) y_file_name = fileNames['y_predict'] X_file_name = fileNames['X_predict'] @@ -71,14 +70,11 @@ rowID = int(float(inputRow[idIndex])) try: len(prediction) - # printParent('we are in the try block') csvwriter.writerow([rowID,prediction[1]]) except: csvwriter.writerow([rowID,prediction]) - # printParent('we are in the exception block') -# write those predictions to a single, standalone, centralized file that ONLY holds the ID for that row, and then the predictions for each model. # Nope. Each classifier writes it's own predictions to it's own file. 
# we will keep an array in ppLib.js that has references to all the file names # the files will all be in a predictions folder, that will hold nothing but these files holding the predictions from a single classifier diff --git a/pySetup/parameterMakers/rfParamMaker.py b/pySetup/parameterMakers/rfParamMaker.py index 85069b4..c5e10d7 100644 --- a/pySetup/parameterMakers/rfParamMaker.py +++ b/pySetup/parameterMakers/rfParamMaker.py @@ -9,8 +9,8 @@ def makeParams(X, y, globalArgs): parameters_to_try = { - # 'max_features': max_features_to_try, - # 'min_samples_leaf':[1,2,5,25,50,100,150], + 'max_features': max_features_to_try, + 'min_samples_leaf':[1,2,5,25,50,100,150], 'criterion': ['gini','entropy'] } diff --git a/pySetup/processes.js b/pySetup/processes.js index 29fc2b3..758cd91 100644 --- a/pySetup/processes.js +++ b/pySetup/processes.js @@ -42,12 +42,17 @@ module.exports = { module.exports.formatData( callback, 'train'); }, - kickOffForestTraining: function( callback) { + kickOffForestTraining: function( callback, classifierName) { // console.log('fileNames:',module.exports.fileNames); - var pythonOptions = utils.generatePythonOptions(argv.dataFile, [JSON.stringify(argv), JSON.stringify(module.exports.fileNames), 'clRandomForest']); + var pythonOptions = utils.generatePythonOptions(argv.dataFile, [JSON.stringify(argv), JSON.stringify(module.exports.fileNames), classifierName]); - utils.startPythonShell('training.py', callback, pythonOptions); + var pyShell = utils.startPythonShell('training.py', callback, pythonOptions); + pyShell.on('message', function(message) { + if(message.type === 'trainingResults') { + global.trainedAlgos[classifierName] = message.text; + } + }); }, makePredictions: function( callback, rfPickle) { diff --git a/pySetup/training.py b/pySetup/training.py index 68d366e..f75d47b 100644 --- a/pySetup/training.py +++ b/pySetup/training.py @@ -63,29 +63,25 @@ classifier = classifierCreater[classifierName] # create features that are custom to the size 
of the input data. -# this will definitely have to be done individually. -# i don't see any harm in making each of these into their own file, because aside from the dev check, everything here will be custom to each classifier. +# Each individual parameterMaker file sits in the parameterMakers folder. If you want to modify what the parameters are, or submit a PR with a better combination of parameters to try, that is the place to start. allParams = paramMakers.makeAll(X,y,globalArgs) parameters_to_try = allParams[classifierName] -# here is where we start to do very similar things all over again. everything from here forwards can be generalized. printParent('we are about to run a grid search over the following space:') printParent(parameters_to_try) gridSearch = GridSearchCV(classifier, parameters_to_try, cv=10, n_jobs=globalArgs['numCPUs']) gridSearch.fit(X_train, y_train) - -printParent('we have used grid search to explore the entire parameter space and find the best possible version of a random forest for your particular data set!') - +printParent('\n') printParent('*********************************************************************************************************') printParent("this estimator's best prediction is:") printParent(gridSearch.best_score_) printParent('*********************************************************************************************************') printParent("this estimator's best parameters are:") printParent(gridSearch.best_params_) -printParent('now that we have figured this out, we are going to train a random forest with considerably more trees. 
more trees means a better fit, but they also take significantly longer to train, so we kept the number of trees relatively low while searching through the parameter space to make sure you were not stuck here until python6 comes out.') +printParent('\n') if extendedTraining: # create a dict with mappings from algo name ('clRandomForest') to a function that will return a newly instantiated version of that algo (with the proper n_estimators and other custom parameters for that classifier) @@ -95,7 +91,6 @@ # note: we are testing grid search on 50% of the data (X_train and y_train), but fitting bigClassifier on the entire dataset (X,y) bigClassifier.fit(X, y) - printParent('we have trained an even more powerful random forest!') bigClassifierscore = bigClassifier.score(X, y) printParent('the bigger randomForest has a score of') @@ -104,4 +99,4 @@ joblib.dump(bigClassifier, 'pySetup/bestClassifiers/best' + classifierName + '/best' + classifierName + '.pkl') else: joblib.dump(gridSearch.best_estimator_, 'pySetup/bestClassifiers/best' + classifierName + '/best' + classifierName + '.pkl') -printParent('wrote the best estimator to a file') +printParent('we have written the best estimator to a file')