-
Notifications
You must be signed in to change notification settings - Fork 285
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
chore(satp-hermes): crash recovery architecture
Signed-off-by: Rafael Belchior <rafael.belchior@tecnico.ulisboa.pt>
- Loading branch information
1 parent
69a2ea2
commit 0de9744
Showing
14 changed files
with
569 additions
and
99 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 2 additions & 0 deletions
2
...ages/cactus-plugin-satp-hermes/src/main/typescript/blo/recover/recover-handler-service.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
// handler to allow a user application to communicate a gateway it crashed and needs to be recovered. It "forces" and update of status with a counterparty gateway | ||
// TODO update the spec with a RecoverForce message that is handled by this handler |
2 changes: 2 additions & 0 deletions
2
...ges/cactus-plugin-satp-hermes/src/main/typescript/blo/recover/rollback-handler-service.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
// handler to allow a user application to force a rollback | ||
// TODO update the spec with RollbackForce message that is handled by this handler |
199 changes: 199 additions & 0 deletions
199
packages/cactus-plugin-satp-hermes/src/main/typescript/core/recovery/crash-manager.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,199 @@ | ||
import { | ||
Logger, | ||
LoggerProvider, | ||
Checks, | ||
LogLevelDesc, | ||
} from "@hyperledger/cactus-common"; | ||
import { SessionData } from "../../generated/proto/cacti/satp/v02/common/session_pb"; | ||
import { CrashRecoveryHandler } from "./crash-recovery-handler"; | ||
import { SATPSession } from "../satp-session"; | ||
import { | ||
RollbackState, | ||
RollbackStrategy, | ||
RollbackStrategyFactory, | ||
} from "./rollback/rollback-strategy-factory"; | ||
|
||
enum CrashStatus { | ||
IN_RECOVERY = "IN_RECOVERY", | ||
RECOVERED = "RECOVERED", | ||
NO_CRASH = "NO_CRASH", | ||
} | ||
|
||
class CrashOccurrence { | ||
constructor( | ||
public status: CrashStatus, | ||
public time: Date, | ||
public lastUpdate: Date, | ||
) {} | ||
} | ||
|
||
export interface ICrashRecoveryManagerOptions { | ||
logLevel?: LogLevelDesc; | ||
instanceId: string; | ||
} | ||
|
||
export class CrashRecoveryManager { | ||
public static readonly CLASS_NAME = "CrashRecoveryManager"; | ||
private readonly log: Logger; | ||
private readonly instanceId: string; | ||
private readonly sessions: Map<string, SessionData>; | ||
private crashRecoveryHandler: CrashRecoveryHandler; | ||
private factory: RollbackStrategyFactory; | ||
|
||
constructor(public readonly options: ICrashRecoveryManagerOptions) { | ||
const fnTag = `${CrashRecoveryManager.CLASS_NAME}#constructor()`; | ||
Checks.truthy(options, `${fnTag} arg options`); | ||
|
||
const level = this.options.logLevel || "DEBUG"; | ||
const label = this.className; | ||
this.log = LoggerProvider.getOrCreate({ level, label }); | ||
this.instanceId = options.instanceId; | ||
this.sessions = this.getSessions() || new Map<string, SessionData>(); | ||
this.log.info(`Instantiated ${this.className} OK`); | ||
this.factory = new RollbackStrategyFactory(); | ||
} | ||
|
||
get className(): string { | ||
return CrashRecoveryManager.CLASS_NAME; | ||
} | ||
|
||
private getSessions(): Map<string, SessionData> { | ||
// todo read from local log to get session data | ||
return new Map<string, SessionData>(); | ||
} | ||
|
||
// todo create util functoin that retrieves sessionid and checks if it is valid; i believe it is implemented in the satp services, refactor making it reusable | ||
private checkCrash(sessionId: SATPSession): Promise<boolean> { | ||
// todo implement crash check - check logs and understsands if there was a crash; might use timouts, etc | ||
return Promise.resolve(false); | ||
} | ||
|
||
public async setupCrashManager() { | ||
// todo setup handler, need to create services | ||
this.crashRecoveryHandler = new CrashRecoveryHandler({ | ||
loggerOptions: { | ||
label: "CrashRecoveryHandler", | ||
level: "DEBUG", | ||
}, | ||
serverService: null, | ||
clientService: null, | ||
sessions: this.sessions, | ||
}); | ||
} | ||
|
||
public async checkAndResolveCrash(sessionId: SATPSession): Promise<boolean> { | ||
const fnTag = `${this.className}#checkAndResolveCrash()`; | ||
this.log.info(`${fnTag} Checking crash status for session ${sessionId}`); | ||
|
||
try { | ||
const didCrash = await this.checkCrash(sessionId); | ||
if (didCrash) { | ||
// create new occurrence | ||
const crashOccurrence = new CrashOccurrence( | ||
CrashStatus.IN_RECOVERY, | ||
new Date(), | ||
new Date(), | ||
); | ||
this.log.debug(crashOccurrence); | ||
// todo manage occurrence | ||
// call corresponding services via handler for crash recovery | ||
return false; | ||
} else { | ||
this.log.error( | ||
`${fnTag} Failed to resolve crash for session ${sessionId}`, | ||
); | ||
// should panic and stop the server, for manual inspection | ||
return false; | ||
} | ||
} catch (error) { | ||
this.log.error( | ||
`${fnTag} Error during crash check and resolution: ${error}`, | ||
); | ||
return false; | ||
} | ||
} | ||
|
||
public async initiateRollback( | ||
session: SATPSession, | ||
forceRollback?: boolean, | ||
): Promise<boolean> { | ||
const fnTag = `CrashRecoveryManager#initiateRollback()`; | ||
this.log.info( | ||
`${fnTag} Initiating rollback for session ${session.getSessionId()}`, | ||
); | ||
|
||
try { | ||
// Implement check for rollback (needs to read logs, etc) OR we assume that at satp handler/service layer this check is done and rollback is good to do | ||
const shouldRollback = true; // todo implement check | ||
|
||
if (forceRollback || shouldRollback) { | ||
// send bridge manager and possibly others to factory | ||
const strategy = this.factory.createStrategy(session); | ||
const rollbackState = await this.executeRollback(strategy, session); | ||
|
||
if (rollbackState) { | ||
const cleanupSuccess = await this.performCleanup( | ||
strategy, | ||
session, | ||
rollbackState, | ||
); | ||
return cleanupSuccess; | ||
} else { | ||
this.log.error( | ||
`${fnTag} Rollback execution failed for session ${session.getSessionId()}`, | ||
); | ||
return false; | ||
} | ||
} else { | ||
this.log.info( | ||
`${fnTag} Rollback not needed for session ${session.getSessionId()}`, | ||
); | ||
return true; | ||
} | ||
} catch (error) { | ||
this.log.error(`${fnTag} Error during rollback initiation: ${error}`); | ||
return false; | ||
} | ||
} | ||
|
||
private async executeRollback( | ||
strategy: RollbackStrategy, | ||
session: SATPSession, | ||
): Promise<RollbackState | undefined> { | ||
const fnTag = `CrashRecoveryManager#executeRollback`; | ||
this.log.debug( | ||
`${fnTag} Executing rollback strategy for session ${session.getSessionId()}`, | ||
); | ||
|
||
try { | ||
return await strategy.execute(session); | ||
} catch (error) { | ||
this.log.error(`${fnTag} Error executing rollback strategy: ${error}`); | ||
} | ||
} | ||
|
||
private async performCleanup( | ||
strategy: RollbackStrategy, | ||
session: SATPSession, | ||
state: RollbackState, | ||
): Promise<boolean> { | ||
const fnTag = `CrashRecoveryManager#performCleanup`; | ||
this.log.debug( | ||
`${fnTag} Performing cleanup after rollback for session ${session.getSessionId()}`, | ||
); | ||
|
||
try { | ||
const updatedState = await strategy.cleanup(session, state); | ||
|
||
// TODO: Handle the updated state, perhaps update session data or perform additional actions | ||
this.log.info( | ||
`${fnTag} Cleanup completed. Updated state: ${JSON.stringify(updatedState)}`, | ||
); | ||
|
||
return true; | ||
} catch (error) { | ||
this.log.error(`${fnTag} Error during cleanup: ${error}`); | ||
return false; | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.