Skip to content

Commit

Permalink
Merge pull request #279 from madgik/feat/fault_tolerance
Browse files Browse the repository at this point in the history
Feat/fault tolerance
  • Loading branch information
sofiakarb authored Sep 14, 2020
2 parents 6d57c07 + 3e50727 commit 6b56339
Show file tree
Hide file tree
Showing 78 changed files with 1,193 additions and 1,327 deletions.
529 changes: 234 additions & 295 deletions Exareme-Docker/files/root/exareme/bootstrap.sh

Large diffs are not rendered by default.

13 changes: 2 additions & 11 deletions Exareme-Docker/files/root/exareme/exareme-admin.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,12 @@ if [[ -z ${EXAREME_HOME} ]]; then
export EXAREME_HOME="$(pwd)";
fi
fi
echo "EXAREME HOME DIR: $EXAREME_HOME";

# Load environment variables such as JAVA and python
. ./exareme-env.sh &> /dev/null

# TODO: consider simply passing MASTER_IP in from bootstrap instead of deriving it here
EXAREME_MASTER=`/sbin/ifconfig eth0 | grep "inet" | awk -F: '{print $2}' | cut -d ' ' -f 1`;
echo "EXAREME_HOST : $EXAREME_MASTER";
echo "EXAREME_USER: $EXAREME_USER";

####################################################################################################
# parse command line arguments
####################################################################################################
Expand Down Expand Up @@ -106,18 +103,12 @@ function start_exareme(){ #Starts exareme daemon
-Dcom.sun.management.jmxremote.ssl=false \
-Djava.security.egd=file:///dev/urandom "

DESC="exareme-master"
EXAREME_ADMIN_CLASS=${EXAREME_ADMIN_MASTER_CLASS}

echo ${EXAREME_ADMIN_CLASS_PATH}
echo ${EXAREME_JAVA}
echo ${EXAREME_ADMIN_CLASS}
echo ${EXAREME_MASTER}

mkdir -p /tmp/exareme/var/log /tmp/exareme/var/run

$EXAREME_JAVA -cp $EXAREME_ADMIN_CLASS_PATH \
$EXAREME_ADMIN_OPTS $EXAREME_ADMIN_CLASS > /var/log/exareme.log 2>&1 & echo $! > /tmp/exareme/var/run/$DESC.pid #-cp requires class path specification
$EXAREME_ADMIN_OPTS $EXAREME_ADMIN_CLASS > /var/log/exareme.log 2>&1 & echo $! > /tmp/exareme/var/run/exareme-master.pid #-cp requires class path specification

exit 0

Expand Down
2 changes: 1 addition & 1 deletion Exareme-Docker/files/root/exareme/set-local-datasets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ do

pathology=$(basename ${PATHOLOGY})

curl -s -X PUT -d @- ${CONSULURL}/v1/kv/${DATA}/${NODE_NAME}/${pathology} <<< ${PATHOLOGY_DATASETS}
curl -s -X PUT -d @- ${CONSULURL}/v1/kv/${CONSUL_DATA_PATH}/${NODE_NAME}/${pathology} <<< ${PATHOLOGY_DATASETS}

PATHOLOGY_DATASETS=''
fi
Expand Down
14 changes: 3 additions & 11 deletions Exareme-Docker/files/root/exareme/start-worker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ EXAREME_ADMIN_WORKER_CLASS="madgik.exareme.worker.admin.StartWorker"
EXAREME_ADMIN_OPTS="${EXAREME_JAVA_OPTS} \
-Djava.rmi.server.codebase=file:$EXAREME_HOME/lib/exareme/ \
-Djava.security.policy=$EXAREME_HOME/etc/exareme/art.policy\
-Djava.rmi.server.hostname=$MY_IP \
-Djava.rmi.server.hostname=$NODE_IP \
-Dsun.rmi.activation.execTimeout=$NODE_COMMUNICATION_TIMEOUT \
-Dsun.rmi.activation.groupTimeout=$NODE_COMMUNICATION_TIMEOUT \
-Dsun.rmi.dgc.ackTimeout=$NODE_COMMUNICATION_TIMEOUT \
Expand All @@ -20,22 +20,14 @@ EXAREME_ADMIN_OPTS="${EXAREME_JAVA_OPTS} \
-Dcom.sun.management.jmxremote.ssl=false \
-Djava.security.egd=file:///dev/urandom "

DESC="exareme-worker"

EXAREME_ADMIN_CLASS=${EXAREME_ADMIN_WORKER_CLASS}
EXAREME_ADMIN_CLASS_ARGS=${MASTER_IP}

echo ${EXAREME_ADMIN_CLASS_PATH}
echo ${EXAREME_JAVA}
echo ${EXAREME_ADMIN_CLASS}
echo ${EXAREME_ADMIN_CLASS_ARGS}


mkdir -p /tmp/exareme/var/log /tmp/exareme/var/run

${EXAREME_JAVA} -cp ${EXAREME_ADMIN_CLASS_PATH} \
${EXAREME_ADMIN_OPTS} ${EXAREME_ADMIN_CLASS} \
${EXAREME_ADMIN_CLASS_ARGS} > /var/log/exareme.log 2>&1 & echo $! > /tmp/exareme/var/run/${DESC}.pid
${EXAREME_ADMIN_CLASS_ARGS} > /var/log/exareme.log 2>&1 & echo $! > /tmp/exareme/var/run/exareme-worker.pid


echo "${DESC} started."
echo "Worker started."
8 changes: 8 additions & 0 deletions Exareme-Docker/src/exareme/exareme-master/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,14 @@
</execution>
</executions>
</plugin>
<!-- Configure the Maven compiler plugin to use Java 8 for both the source and target levels. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public static void main(String[] args) throws Exception {

int registryPort = AdpProperties.getArtProps().getInt("art.registry.rmi.defaultPort");
int dataTransferPort = AdpProperties.getArtProps().getInt("art.container.data.port");
String logLevel = AdpProperties.getArtProps().getString("art.log.level");
String logLevel = System.getenv("LOG_LEVEL");
Logger.getRootLogger().setLevel(Level.toLevel(logLevel));

try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ private boolean updateProgressStatistics() throws RemoteException {
statsOldOP = operatorsCompleted;
return true;
}
} catch (UnmarshalException _) {
} catch (UnmarshalException e) {
log.error("Cannot decode information ...");
}
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ public void run() throws Exception {
log.debug("Skip saving tables (" + dbOp.getQuery().getOutputTable().getTable().getName()
+ ") ... ");
}
log.info("Currently executing: \n " + state.toString());
log.debug("Currently executing: \n " + state.toString());
exit(0, state.getExitMessage());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
import madgik.exareme.master.engine.iterations.state.IterationsStateManager;
import madgik.exareme.master.engine.iterations.state.IterationsStateManagerImpl;
import madgik.exareme.master.engine.iterations.state.IterativeAlgorithmState;
import madgik.exareme.master.queryProcessor.composer.AlgorithmProperties;
import madgik.exareme.master.queryProcessor.composer.Composer;
import madgik.exareme.master.queryProcessor.composer.Exceptions.ComposerException;
import madgik.exareme.master.queryProcessor.HBP.AlgorithmProperties;
import madgik.exareme.master.queryProcessor.HBP.Composer;
import madgik.exareme.master.queryProcessor.HBP.Exceptions.ComposerException;
import madgik.exareme.worker.art.container.ContainerProxy;
import org.apache.log4j.Logger;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package madgik.exareme.master.engine.iterations.handler;

import madgik.exareme.common.consts.HBPConstants;
import madgik.exareme.master.queryProcessor.composer.Algorithms;
import org.apache.log4j.Logger;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import madgik.exareme.master.client.AdpDBClientQueryStatus;
import madgik.exareme.master.connector.DataSerialization;
import madgik.exareme.master.engine.iterations.state.IterativeAlgorithmState;
import madgik.exareme.master.gateway.async.handler.HBP.HBPQueryHelper;
import org.apache.http.entity.BasicHttpEntity;
import org.apache.http.nio.ContentEncoder;
import org.apache.http.nio.IOControl;
Expand Down Expand Up @@ -124,24 +125,24 @@ public void produceContent(ContentEncoder encoder, IOControl ioctrl) throws IOEx
String result = iterativeAlgorithmState.getAlgorithmError();
if (result.contains("ExaremeError:")) {
String data = result.substring(result.lastIndexOf("ExaremeError:") + "ExaremeError:".length()).replaceAll("\\s", " ");
String type = user_error;
String output = defaultOutputFormat(data,type);
String type = HBPQueryHelper.ErrorResponse.ErrorResponseTypes.user_error;
String output = HBPQueryHelper.ErrorResponse.createErrorResponse(data, type);
logErrorMessage(output);
channel = Channels.newChannel(
new ByteArrayInputStream(output.getBytes(StandardCharsets.UTF_8)));

} else if (result.contains("PrivacyError")) {
String data = "The Experiment could not run with the input provided because there are insufficient data.";
String type = warning;
String output = defaultOutputFormat(data,type);
String type = HBPQueryHelper.ErrorResponse.ErrorResponseTypes.warning;
String output = HBPQueryHelper.ErrorResponse.createErrorResponse(data, type);
logErrorMessage(output);
channel = Channels.newChannel(
new ByteArrayInputStream(output.getBytes(StandardCharsets.UTF_8)));

} else if (result.matches("java.rmi.RemoteException: Containers:.*not responding")) {
String data = "One or more containers are not responding. Please inform the system administrator.";
String type = error;
String output = defaultOutputFormat(data,type);
String type = HBPQueryHelper.ErrorResponse.ErrorResponseTypes.error;
String output = HBPQueryHelper.ErrorResponse.createErrorResponse(data, type);
logErrorMessage(output);
channel = Channels.newChannel(
new ByteArrayInputStream(output.getBytes(StandardCharsets.UTF_8)));
Expand All @@ -150,8 +151,8 @@ public void produceContent(ContentEncoder encoder, IOControl ioctrl) throws IOEx
String data = "Something went wrong with the execution of algorithm: ["
+ iterativeAlgorithmState.getAlgorithmKey()
+ "]. Please inform your system administrator to consult the logs.";
String type = error;
String output = defaultOutputFormat(data,type);
String type = HBPQueryHelper.ErrorResponse.ErrorResponseTypes.error;
String output = HBPQueryHelper.ErrorResponse.createErrorResponse(data, type);
logErrorMessage(output);
channel = Channels.newChannel(
new ByteArrayInputStream(output.getBytes(StandardCharsets.UTF_8)));
Expand Down Expand Up @@ -184,11 +185,7 @@ public boolean isRepeatable() {
return false;
}

/**
 * Builds the default JSON response body: a {@code result} array holding a single
 * object with a {@code data} message and a {@code type} label.
 *
 * <p>Both values are escaped as JSON strings so that quotes, backslashes or control
 * characters inside them cannot produce a malformed document. For values without
 * such characters the output is identical to plain concatenation.
 *
 * @param data the message placed in the {@code data} field; {@code null} is rendered as the text "null"
 * @param type the label placed in the {@code type} field; {@code null} is rendered as the text "null"
 * @return the JSON document as a string
 */
private String defaultOutputFormat(String data, String type) {
    return "{\"result\" : [{\"data\":\"" + escapeJsonString(data)
            + "\",\"type\":\"" + escapeJsonString(type) + "\"}]}";
}

/**
 * Escapes a value for use inside a double-quoted JSON string.
 *
 * @param text the raw value; {@code null} is treated as the text "null"
 * @return the value with quotes, backslashes and control characters escaped
 */
private String escapeJsonString(String text) {
    // String.valueOf keeps the "null" rendering that string concatenation would give.
    String raw = String.valueOf(text);
    StringBuilder escaped = new StringBuilder(raw.length() + 16);
    for (int i = 0; i < raw.length(); i++) {
        char c = raw.charAt(i);
        switch (c) {
            case '"':
                escaped.append("\\\"");
                break;
            case '\\':
                escaped.append("\\\\");
                break;
            case '\n':
                escaped.append("\\n");
                break;
            case '\r':
                escaped.append("\\r");
                break;
            case '\t':
                escaped.append("\\t");
                break;
            case '\b':
                escaped.append("\\b");
                break;
            case '\f':
                escaped.append("\\f");
                break;
            default:
                if (c < 0x20) {
                    // Remaining control characters must use the \\uXXXX form in JSON.
                    escaped.append(String.format("\\u%04x", (int) c));
                } else {
                    escaped.append(c);
                }
        }
    }
    return escaped.toString();
}

private void logErrorMessage(String error){
/**
 * Writes the error output returned by the algorithm to the log at INFO level.
 *
 * @param error the error output to record
 */
private void logErrorMessage(String error) {
    final String message = "Algorithm exited with error and returned:\n " + error;
    log.info(message);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import madgik.exareme.master.engine.iterations.handler.IterationsConstants;
import madgik.exareme.master.engine.iterations.handler.IterationsHandlerDFLUtils;
import madgik.exareme.master.engine.iterations.state.exceptions.IterationsStateFatalException;
import madgik.exareme.master.queryProcessor.composer.AlgorithmProperties;
import madgik.exareme.master.queryProcessor.HBP.AlgorithmProperties;
import org.apache.commons.lang3.text.StrSubstitutor;
import org.apache.http.nio.IOControl;
import org.apache.log4j.Logger;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import madgik.exareme.master.gateway.ExaremeGateway;
import madgik.exareme.master.gateway.ExaremeGatewayUtils;
import madgik.exareme.master.gateway.async.handler.*;
import madgik.exareme.master.gateway.async.handler.HBP.HBPQueryHandler;
import madgik.exareme.master.gateway.control.handler.HttpAsyncCheckWorker;
import madgik.exareme.master.gateway.control.handler.HttpAsyncRemoveWorkerHandler;
import org.apache.http.config.ConnectionConfig;
Expand Down Expand Up @@ -61,8 +62,8 @@ public HttpAsyncExaremeGateway(AdpDBManager manager) throws Exception {
registry.register(ExaremeGatewayUtils.GW_API_QUERY, new HttpAsyncQueryHandler());
registry.register(ExaremeGatewayUtils.GW_API_TABLE, new HttpAsyncTableHandler());
registry.register(ExaremeGatewayUtils.GW_API_MINING_ALGORITHMS, new HttpAsyncMiningAlgorithmsHandler());
registry.register(ExaremeGatewayUtils.GW_API_MINING_QUERY, new HttpAsyncMiningQueryHandler());
registry.register("/v1/mining/*", new HttpAsyncMiningQueryHandler());
registry.register(ExaremeGatewayUtils.GW_API_MINING_QUERY, new HBPQueryHandler());
registry.register("/v1/mining/*", new HBPQueryHandler());

final HttpAsyncService handler = new HttpAsyncService(httpproc, null, null, registry, null, null);

Expand Down

This file was deleted.

This file was deleted.

Loading

0 comments on commit 6b56339

Please sign in to comment.