diff --git a/.github/workflows/builddeps.yml b/.github/workflows/builddeps.yml index 75af4105..5313aa3a 100644 --- a/.github/workflows/builddeps.yml +++ b/.github/workflows/builddeps.yml @@ -6,7 +6,7 @@ on: env: GRPC_VER: 1.54.1 BOOST_VER: 1.85.0 - E2SAR_VER: 0.1.1 + E2SAR_VER: 0.1.2 jobs: build: diff --git a/.github/workflows/deploydebs.yml b/.github/workflows/deploydebs.yml index 062d284f..978d0242 100644 --- a/.github/workflows/deploydebs.yml +++ b/.github/workflows/deploydebs.yml @@ -6,7 +6,7 @@ name: Deploy E2sar debs and dependency debs to releases on: workflow_dispatch: env: - E2SAR_VER: 0.1.1 + E2SAR_VER: 0.1.2 E2SAR_DEP: 1.85.0-boost-1.54.1-grpc jobs: diff --git a/.github/workflows/deps2debs.yml b/.github/workflows/deps2debs.yml index 9e7da166..f0eed0ed 100644 --- a/.github/workflows/deps2debs.yml +++ b/.github/workflows/deps2debs.yml @@ -3,7 +3,7 @@ name: Build gRPC+Boost .deb and .rpms on: workflow_dispatch: env: - E2SAR_VER: 0.1.1 + E2SAR_VER: 0.1.2 DEPS_VER: 1.85.0-boost-1.54.1-grpc FINAL_INSTALL: /usr/local @@ -67,7 +67,7 @@ jobs: if [[ ${{ matrix.os }} == *"ubuntu"* ]]; then fpm -s tar -t deb -n e2sar-deps -v ${{ env.E2SAR_VER }} --prefix=/ e2sar-deps.tar.gz elif [[ ${{ matrix.os }} == *"rocky"* ]]; then - fpm -s tar -t deb -n e2sar-deps -v ${{ env.E2SAR_VER }} --prefix=/ e2sar-deps.tar.gz + fpm -s tar -t rpm -n e2sar-deps -v ${{ env.E2SAR_VER }} --prefix=/ e2sar-deps.tar.gz fi - name: Upload artifacts diff --git a/.github/workflows/distro.yml b/.github/workflows/distro.yml index 7e9f720c..a5a6c130 100644 --- a/.github/workflows/distro.yml +++ b/.github/workflows/distro.yml @@ -3,7 +3,7 @@ name: Build E2SAR .deb and .rpms on: workflow_dispatch: env: - E2SAR_VER: 0.1.1 + E2SAR_VER: 0.1.2 E2SAR_DEP: boost-1.85.0-grpc-1.54.1 FINAL_INSTALL: /usr/local @@ -110,7 +110,7 @@ jobs: if [[ ${{ matrix.os }} == *"ubuntu"* ]]; then fpm -s tar -t deb -n e2sar -v ${{ env.E2SAR_VER }} --prefix=/ e2sar.tar.gz elif [[ ${{ matrix.os }} == *"rocky"* ]]; then - fpm -s tar 
-t deb -n e2sar -v ${{ env.E2SAR_VER }} --prefix=/ e2sar.tar.gz + fpm -s tar -t rpm -n e2sar -v ${{ env.E2SAR_VER }} --prefix=/ e2sar.tar.gz fi - name: Upload artifacts diff --git a/Doxyfile b/Doxyfile index adecb190..9f0d377f 100644 --- a/Doxyfile +++ b/Doxyfile @@ -48,7 +48,7 @@ PROJECT_NAME = "E2SAR" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 0.1.0 +PROJECT_NUMBER = 0.1.2 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/bin/e2sar_perf.cpp b/bin/e2sar_perf.cpp index 54a8d5a2..df0f802e 100644 --- a/bin/e2sar_perf.cpp +++ b/bin/e2sar_perf.cpp @@ -32,15 +32,33 @@ bool threadsRunning = true; u_int16_t reportThreadSleepMs{1000}; Reassembler *reasPtr{nullptr}; Segmenter *segPtr{nullptr}; +LBManager *lbmPtr{nullptr}; +std::vector senders; void ctrlCHandler(int sig) { std::cout << "Stopping threads" << std::endl; - if (segPtr != nullptr) + if (segPtr != nullptr) { + if (lbmPtr != nullptr) { + std::cout << "Removing senders: "; + for (auto s: senders) + std::cout << s << " "; + std::cout << std::endl; + auto rmres = lbmPtr->removeSenders(senders); + if (rmres.has_error()) + std::cerr << "Unable to remove sender from list on exit: " << rmres.error().message() << std::endl; + } segPtr->stopThreads(); + } if (reasPtr != nullptr) + { + std::cout << "Deregistering worker" << std::endl; + auto deregres = reasPtr->deregisterWorker(); + if (deregres.has_error()) + std::cerr << "Unable to deregister worker on exit: " << deregres.error().message() << std::endl; reasPtr->stopThreads(); + } threadsRunning = false; // instead of join boost::chrono::milliseconds duration(1000); @@ -130,10 +148,18 @@ result sendEvents(Segmenter &s, EventNum_t startEventNum, size_t numEvents, evtBufferPool->free(item); boost::this_thread::sleep_until(until); } + // sleep to allow small number of frames to 
leave + boost::chrono::seconds duration(1); + boost::this_thread::sleep_for(duration); + auto stats = s.getSendStats(); evtBufferPool->purge_memory(); std::cout << "Completed, " << stats.get<0>() << " frames sent, " << stats.get<1>() << " errors" << std::endl; + if (stats.get<1>() != 0) + { + std::cout << "Last error encountered: " << strerror(stats.get<2>()) << std::endl; + } return 0; } @@ -146,6 +172,24 @@ result recvEvents(Reassembler &r, int durationSec) { EventNum_t evtNum; u_int16_t dataId; + // register the worker (will be NOOP if withCP is set to false) + auto hostname_res = NetUtil::getHostName(); + if (hostname_res.has_error()) + { + return E2SARErrorInfo{hostname_res.error().code(), hostname_res.error().message()}; + } + auto regres = r.registerWorker(hostname_res.value()); + if (regres.has_error()) + { + return E2SARErrorInfo{E2SARErrorc::RPCError, + "Unable to register worker node due to " + regres.error().message()}; + } + if (regres.value() == 1) + std::cout << "Registered the worker" << std::endl; + + // NOTE: if we switch the order of registerWorker and openAndStart + // you get into a race condition where the sendState thread starts and tries + // to send queue updates, however session token is not yet available... 
auto openRes = r.openAndStart(); if (openRes.has_error()) return openRes; @@ -188,11 +232,21 @@ result recvEvents(Reassembler &r, int durationSec) { void recvStatsThread(Reassembler *r) { + std::vector> lostEvents; + while(threadsRunning) { auto nowT = boost::chrono::high_resolution_clock::now(); auto stats = r->getStats(); + + while(true) + { + auto res = r->get_LostEvent(); + if (res.has_error()) + break; + lostEvents.push_back(res.value()); + } /* * - EventNum_t enqueueLoss; // number of events received and lost on enqueue * - EventNum_t eventSuccess; // events successfully processed @@ -211,6 +265,13 @@ void recvStatsThread(Reassembler *r) if (stats.get<5>() != E2SARErrorc::NoError) std::cout << "\tLast E2SARError code: " << stats.get<5>() << std::endl; + std::cout << "\tEvents lost so far: "; + for(auto evt: lostEvents) + { + std::cout << "<" << evt.first << ":" << evt.second << "> "; + } + std::cout << std::endl; + auto until = nowT + boost::chrono::milliseconds(reportThreadSleepMs); boost::this_thread::sleep_until(until); } @@ -228,10 +289,14 @@ int main(int argc, char **argv) u_int16_t mtu; u_int32_t eventSourceId; u_int16_t dataId; - size_t numThreads; + size_t numThreads, numSockets; float rateGbps; int sockBufSize; int durationSec; + bool withCP; + std::string sndrcvIP; + std::string iniFile; + u_int16_t recvStartPort; // parameters opts("send,s", "send traffic"); @@ -244,11 +309,20 @@ int main(int argc, char **argv) opts("mtu,m", po::value(&mtu)->default_value(1500), "MTU (default 1500) [s]"); opts("src", po::value(&eventSourceId)->default_value(1234), "Event source (default 1234) [s]"); opts("dataid", po::value(&dataId)->default_value(4321), "Data id (default 4321) [s]"); - opts("threads,t", po::value(&numThreads)->default_value(1), "number of receive threads (defaults to 1) [r]"); + opts("threads", po::value(&numThreads)->default_value(1), "number of receive threads (defaults to 1) [r]"); + opts("sockets", po::value(&numSockets)->default_value(4), 
"number of send sockets (defaults to 4) [r]"); opts("rate", po::value(&rateGbps)->default_value(1.0), "send rate in Gbps (defaults to 1.0)"); opts("period,p", po::value(&reportThreadSleepMs)->default_value(1000), "receive side reporting thread sleep period in ms (defaults to 1000) [r]"); opts("bufsize,b", po::value(&sockBufSize)->default_value(1024*1024*3), "send or receive socket buffer size (default to 3MB)"); - opts("duration,d", po::value(&durationSec)->default_value(0), "duration for receiver to run for (defaults to 0 - until Ctrl-C is presses)"); + opts("duration,d", po::value(&durationSec)->default_value(0), "duration for receiver to run for (defaults to 0 - until Ctrl-C is pressed)"); + opts("withcp,c", po::bool_switch()->default_value(false), "enable control plane interactions"); + opts("ini,i", po::value(&iniFile)->default_value(""), "INI file to initialize SegmenterFlags [s]] or ReassemblerFlags [r]." + " Values found in the file override --withcp, --mtu and --bufsize"); + opts("ip", po::value(&sndrcvIP)->default_value("127.0.0.1"), "IP address (IPv4 or IPv6) from which sender sends from or on which receiver listens. Defaults to 127.0.0.1. [s,r]"); + opts("port", po::value(&recvStartPort)->default_value(10000), "Starting UDP port number on which receiver listens. Defaults to 10000. 
[r] "); + opts("ipv6,6", "force using IPv6 control plane address if URI specifies hostname (disables cert validation) [s,r]"); + opts("ipv4,4", "force using IPv4 control plane address if URI specifies hostname (disables cert validation) [s,r]"); + opts("novalidate,v", "don't validate server certificate"); po::variables_map vm; @@ -270,6 +344,11 @@ int main(int argc, char **argv) conflicting_options(vm, "recv", "rate"); conflicting_options(vm, "send", "threads"); conflicting_options(vm, "send", "period"); + conflicting_options(vm, "ipv4", "ipv6"); + option_dependency(vm, "recv", "ip"); + option_dependency(vm, "recv", "port"); + option_dependency(vm, "send", "ip"); + conflicting_options(vm, "send", "port"); } catch (const std::logic_error &le) { @@ -284,14 +363,31 @@ int main(int argc, char **argv) return 0; } + withCP = vm["withcp"].as(); + + bool preferV6 = false; + if (vm.count("ipv6")) + { + preferV6 = true; + } + + // if ipv4 or ipv6 requested explicitly + bool preferHostAddr = false; + if (vm.count("ipv6") || vm.count("ipv4")) + preferHostAddr = true; + + bool validate = true; + if (vm.count("novalidate")) + validate = false; + // make sure the token is interpreted as the correct type, depending on the call EjfatURI::TokenType tt{EjfatURI::TokenType::instance}; std::string ejfat_uri; if (vm.count("send") || vm.count("recv")) { - auto uri_r = (vm.count("uri") ? EjfatURI::getFromString(vm["uri"].as(), tt) : - EjfatURI::getFromEnv("EJFAT_URI"s, tt)); + auto uri_r = (vm.count("uri") ? 
EjfatURI::getFromString(vm["uri"].as(), tt, preferV6) : + EjfatURI::getFromEnv("EJFAT_URI"s, tt, preferV6)); if (uri_r.has_error()) { std::cerr << "Error in parsing URI from command-line, error "s + uri_r.error().message() << std::endl; @@ -299,10 +395,49 @@ int main(int argc, char **argv) } auto uri = uri_r.value(); if (vm.count("send")) { + // if using control plane + if (withCP) + { + // add to senders list of 1 element + senders.push_back(sndrcvIP); + + // create LBManager + lbmPtr = new LBManager(uri, validate, preferHostAddr); + + // register senders + std::cout << "Adding senders to LB: "; + for (auto s: senders) + std::cout << s << " "; + std::cout << std::endl; + auto addres = lbmPtr->addSenders(senders); + if (addres.has_error()) + { + std::cerr << "Unable to add a sender due to error " << addres.error().message() + << ", exiting" << std::endl; + return -1; + } + } + Segmenter::SegmenterFlags sflags; - sflags.useCP = false; // turn off CP sync - sflags.mtu = mtu; - sflags.sndSocketBufSize = sockBufSize; + if (!iniFile.empty()) + { + std::cout << "Loading SegmenterFlags from " << iniFile << std::endl; + auto sflagsRes = Segmenter::SegmenterFlags::getFromINI(iniFile); + if (sflagsRes.has_error()) + { + std::cerr << "Unable to parse SegmenterFlags INI file " << iniFile << std::endl; + return -1; + } + sflags = sflagsRes.value(); + } else { + sflags.useCP = withCP; + sflags.mtu = mtu; + sflags.sndSocketBufSize = sockBufSize; + sflags.numSendSockets = numSockets; + } + std::cout << "Control plane will be " << (sflags.useCP ? "ON" : "OFF") << std::endl; + std::cout << (sflags.useCP ? "*** Make sure the LB has been reserved and the URI reflects the reserved instance information." 
: + "*** Make sure the URI reflects proper data address, other parts are ignored.") << std::endl; try { Segmenter seg(uri, dataId, eventSourceId, sflags); @@ -319,11 +454,31 @@ int main(int argc, char **argv) } else if (vm.count("recv")) { Reassembler::ReassemblerFlags rflags; - rflags.useCP = false; // turn off CP gRPC - rflags.withLBHeader = true; // no LB - rflags.rcvSocketBufSize = sockBufSize; + if (!iniFile.empty()) + { + std::cout << "Loading ReassemblerFlags from " << iniFile << std::endl; + auto rflagsRes = Reassembler::ReassemblerFlags::getFromINI(iniFile); + if (rflagsRes.has_error()) + { + std::cerr << "Unable to parse ReassemblerFlags INI file " << iniFile << std::endl; + return -1; + } + rflags = rflagsRes.value(); + } else + { + rflags.useCP = withCP; + rflags.withLBHeader = not withCP; + rflags.rcvSocketBufSize = sockBufSize; + rflags.useHostAddress = preferHostAddr; + rflags.validateCert = validate; + } + std::cout << "Control plane will be " << (rflags.useCP ? "ON" : "OFF") << std::endl; + std::cout << (rflags.useCP ? "*** Make sure the LB has been reserved and the URI reflects the reserved instance information." 
: + "*** Make sure the URI reflects proper data address, other parts are ignored.") << std::endl; + try { - Reassembler reas(uri, numThreads, rflags); + ip::address ip = ip::make_address(sndrcvIP); + Reassembler reas(uri, ip, recvStartPort, numThreads, rflags); reasPtr = &reas; boost::thread statT(&recvStatsThread, &reas); auto res = recvEvents(reas, durationSec); diff --git a/bin/lbadm.cpp b/bin/lbadm.cpp index bb2be917..e58d2a90 100644 --- a/bin/lbadm.cpp +++ b/bin/lbadm.cpp @@ -51,7 +51,8 @@ result reserveLB(LBManager &lbman, if(!suppress) { std::cout << "Reserving a new load balancer " << std::endl; - std::cout << " Contacting: " << static_cast(lbman.get_URI()) << " on IP " << lbman.get_URI().get_cpAddr().value().first << std::endl; + std::cout << " Contacting: " << static_cast(lbman.get_URI()) << " using address: " << + lbman.get_AddrString() << std::endl; std::cout << " LB Name: " << lbname << std::endl; std::cout << " Allowed senders: "; std::for_each(senders.begin(), senders.end(), [](const std::string& s) { std::cout << s << ' '; }); @@ -85,7 +86,8 @@ result reserveLB(LBManager &lbman, result freeLB(LBManager &lbman, const std::string &lbid = "") { std::cout << "Freeing a load balancer " << std::endl; - std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::admin) << " on IP " << lbman.get_URI().get_cpAddr().value().first << std::endl; + std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::admin) << " using address: " << + lbman.get_AddrString() << std::endl; std::cout << " LB ID: " << (lbid.empty() ? 
lbman.get_URI().get_lbId() : lbid) << std::endl; result res{0}; @@ -116,7 +118,8 @@ result registerWorker(LBManager &lbman, const std::string &node_name, if(!suppress) { std::cout << "Registering a worker " << std::endl; - std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::instance) << " on IP " << lbman.get_URI().get_cpAddr().value().first << std::endl; + std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::instance) << " using address: " << + lbman.get_AddrString() << std::endl; std::cout << " Worker details: " << node_name << " at "s << node_ip << ":"s << node_port << std::endl; std::cout << " CP parameters: " << "w="s << weight << ", source_count="s << src_cnt << std::endl; @@ -148,7 +151,8 @@ result registerWorker(LBManager &lbman, const std::string &node_name, result deregisterWorker(LBManager &lbman) { std::cout << "De-Registering a worker " << std::endl; - std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " on IP " << lbman.get_URI().get_cpAddr().value().first << std::endl; + std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " using address: " << + lbman.get_AddrString() << std::endl; auto res = lbman.deregisterWorker(); @@ -167,7 +171,8 @@ result deregisterWorker(LBManager &lbman) result getLBStatus(LBManager &lbman, const std::string &lbid) { std::cout << "Getting LB Status " << std::endl; - std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " on IP " << lbman.get_URI().get_cpAddr().value().first << std::endl; + std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " using address: " << + lbman.get_AddrString() << std::endl; std::cout << " LB ID: " << (lbid.empty() ? 
lbman.get_URI().get_lbId() : lbid) << std::endl; auto res = lbman.getLBStatus(lbid); @@ -202,7 +207,8 @@ result getLBStatus(LBManager &lbman, const std::string &lbid) result overview(LBManager &lbman) { std::cout << "Getting Overview " << std::endl; - std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " on IP " << lbman.get_URI().get_cpAddr().value().first << std::endl; + std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " using address: " << + lbman.get_AddrString() << std::endl; auto res = lbman.overview(); @@ -243,7 +249,8 @@ result overview(LBManager &lbman) result sendState(LBManager &lbman, float fill_percent, float ctrl_signal, bool is_ready) { std::cout << "Sending Worker State " << std::endl; - std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " on IP " << lbman.get_URI().get_cpAddr().value().first << std::endl; + std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " using address: " << + lbman.get_AddrString() << std::endl; std::cout << " LB Name: " << (lbman.get_URI().get_lbName().empty() ? "not set"s : lbman.get_URI().get_lbId()) << std::endl; auto res = lbman.sendState(fill_percent, ctrl_signal, is_ready); @@ -264,7 +271,8 @@ result sendState(LBManager &lbman, float fill_percent, float ctrl_signal, b result removeSenders(LBManager &lbman, const std::vector& senders) { std::cout << "Removing senders to CP " << std::endl; - std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " on IP " << lbman.get_URI().get_cpAddr().value().first << std::endl; + std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " using address: " << + lbman.get_AddrString() << std::endl; std::cout << " LB Name: " << (lbman.get_URI().get_lbName().empty() ? 
"not set"s : lbman.get_URI().get_lbId()) << std::endl; std::cout << " Sender list: "; std::for_each(senders.begin(), senders.end(), [](const std::string& s) { std::cout << s << ' '; }); @@ -288,7 +296,8 @@ result removeSenders(LBManager &lbman, const std::vector& send result addSenders(LBManager &lbman, const std::vector& senders) { std::cout << "Adding senders to CP " << std::endl; - std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " on IP " << lbman.get_URI().get_cpAddr().value().first << std::endl; + std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " using address: " << + lbman.get_AddrString() << std::endl; std::cout << " LB Name: " << (lbman.get_URI().get_lbName().empty() ? "not set"s : lbman.get_URI().get_lbId()) << std::endl; std::cout << " Sender list: "; std::for_each(senders.begin(), senders.end(), [](const std::string& s) { std::cout << s << ' '; }); @@ -313,7 +322,8 @@ result version(LBManager &lbman) { std::cout << "Getting load balancer version " << std::endl; - std::cout << " Contacting: " << static_cast(lbman.get_URI()) << " on IP " << lbman.get_URI().get_cpAddr().value().first << std::endl; + std::cout << " Contacting: " << static_cast(lbman.get_URI()) << " using address: " << + lbman.get_AddrString() << std::endl; auto res = lbman.version(); @@ -339,26 +349,32 @@ int main(int argc, char **argv) po::options_description od("Command-line options"); auto opts = od.add_options()("help,h", "show this help message"); + std::string duration; + float weight; + u_int16_t count; + float queue, ctrl, minfactor, maxfactor; + bool ready; // parameters opts("lbname,l", po::value(), "specify name of the load balancer"); opts("lbid,i", po::value(), "override/provide id of the loadbalancer"); opts("address,a", po::value>()->multitoken(), "node IPv4/IPv6 address, can be used multiple times for 'reserve' call"); - opts("duration,d", po::value(), "specify duration as 
'[hh[:mm[:ss]]]'"); + opts("duration,d", po::value(&duration)->default_value("02:00:00"), "specify duration as '[hh[:mm[:ss]]]'"); opts("uri,u", po::value(), "specify EJFAT_URI on the command-line instead of the environment variable"); opts("name,n", po::value(), "specify node name for registration"); opts("port,p", po::value(), "node starting listening port number"); - opts("weight,w", po::value(), "node weight"); - opts("count,c", po::value(), "node source count"); + opts("weight,w", po::value(&weight)->default_value(1.0), "node weight"); + opts("count,c", po::value(&count)->default_value(1), "node source count"); opts("session,s", po::value(), "override/provide session id"); - opts("queue,q", po::value(), "queue fill"); - opts("ctrl,t", po::value(), "control signal value"); - opts("ready,r", po::value(), "worker ready state (1 or 0)"); + opts("queue,q", po::value(&queue)->default_value(0.0), "queue fill"); + opts("ctrl,t", po::value(&ctrl)->default_value(0.0), "control signal value"); + opts("ready,r", po::value(&ready)->default_value(true), "worker ready state (1 or 0)"); opts("root,o", po::value(), "root cert for SSL communications"); opts("novalidate,v", "don't validate server certificate (conflicts with 'root')"); - opts("minfactor", po::value(), "node min factor, multiplied with the number of slots that would be assigned evenly to determine min number of slots for example, 4 nodes with a minFactor of 0.5 = (512 slots / 4) * 0.5 = min 64 slots"); - opts("maxfactor", po::value(), "multiplied with the number of slots that would be assigned evenly to determine max number of slots for example, 4 nodes with a maxFactor of 2 = (512 slots / 4) * 2 = max 256 slots set to 0 to specify no maximum"); - opts("ipv6,6", "prefer IPv6 control plane address if URI specifies hostname"); + opts("minfactor", po::value(&minfactor)->default_value(0.5), "node min factor, multiplied with the number of slots that would be assigned evenly to determine min number of slots for example, 
4 nodes with a minFactor of 0.5 = (512 slots / 4) * 0.5 = min 64 slots"); + opts("maxfactor", po::value(&maxfactor)->default_value(2.0), "multiplied with the number of slots that would be assigned evenly to determine max number of slots for example, 4 nodes with a maxFactor of 2 = (512 slots / 4) * 2 = max 256 slots set to 0 to specify no maximum"); + opts("ipv6,6", "force using IPv6 control plane address if URI specifies hostname (disables cert validation)"); + opts("ipv4,4", "force using IPv4 control plane address if URI specifies hostname (disables cert validation)"); opts("export,e", "suppresses other messages and prints out 'export EJFAT_URI=' returned by the LB"); // commands opts("reserve", "reserve a load balancer (-l, -a, -d required). Uses admin token."); @@ -372,6 +388,8 @@ int main(int argc, char **argv) opts("addsenders","add 'safe' sender IP addresses to CP (one or more -a required). Uses instance token."); opts("removesenders","remove 'safe' sender IP addresses from CP (one or more -a required). Uses instance token."); + std::vector commands{"reserve", "free", "version", "register", + "deregister", "status", "state", "overview", "addsenders", "removesenders"}; po::variables_map vm; @@ -395,8 +413,18 @@ int main(int argc, char **argv) option_dependency(vm, "state", "ctrl"); option_dependency(vm, "state", "ready"); conflicting_options(vm, "root", "novalidate"); + conflicting_options(vm, "ipv4", "ipv6"); option_dependency(vm,"addsenders", "address"); option_dependency(vm,"removesenders", "address"); + + for (auto c1: commands) + { + for (auto c2: commands) + { + if (c1.compare(c2)) + conflicting_options(vm, c1, c2); + } + } } catch (const std::logic_error &le) { @@ -437,6 +465,11 @@ int main(int argc, char **argv) preferV6 = true; } + // if ipv4 or ipv6 requested explicitly + bool preferHostAddr = false; + if (vm.count("ipv6") || vm.count("ipv4")) + preferHostAddr = true; + std::string ejfat_uri; auto uri_r = (vm.count("uri") ? 
EjfatURI::getFromString(vm["uri"].as(), tt, preferV6) : EjfatURI::getFromEnv("EJFAT_URI"s, tt, preferV6)); @@ -455,7 +488,7 @@ int main(int argc, char **argv) if (vm.count("lbid")) uri.set_lbId(vm["lbid"].as()); - auto lbman = LBManager(uri); + auto lbman = LBManager(uri, true, preferHostAddr); if (vm.count("root") && !uri.get_useTls()) { @@ -471,14 +504,14 @@ int main(int argc, char **argv) std::cerr << "Unable to read server root certificate file "s; return -1; } - lbman = LBManager(uri, true, opts_res.value()); + lbman = LBManager(uri, true, preferHostAddr, opts_res.value()); } else { if (vm.count("novalidate")) { std::cerr << "Skipping server certificate validation" << std::endl; - lbman = LBManager(uri, false); + lbman = LBManager(uri, false, preferHostAddr); } } } @@ -489,8 +522,7 @@ int main(int argc, char **argv) // execute command auto uri_r = reserveLB(lbman, vm["lbname"].as(), vm["address"].as>(), - vm["duration"].as(), - suppress); + duration, suppress); if (uri_r.has_error()) { std::cerr << "There was an error reserving LB: " << uri_r.error().message() << std::endl; @@ -524,10 +556,10 @@ int main(int argc, char **argv) vm["name"].as(), vm["address"].as>()[0], vm["port"].as(), - vm["weight"].as(), - vm["count"].as(), - vm["minfactor"].as(), - vm["maxfactor"].as(), + weight, + count, + minfactor, + maxfactor, suppress ); @@ -561,7 +593,7 @@ int main(int argc, char **argv) } else if (vm.count("state")) { - auto int_r = sendState(lbman, vm["queue"].as(), vm["ctrl"].as(), vm["ready"].as()); + auto int_r = sendState(lbman, queue, ctrl, ready); if (int_r.has_error()) { std::cerr << "There was an error getting sending worker state update: " << int_r.error().message() << std::endl; diff --git a/bin/lbmonitor.cpp b/bin/lbmonitor.cpp index a6bc66b5..5052976e 100644 --- a/bin/lbmonitor.cpp +++ b/bin/lbmonitor.cpp @@ -13,7 +13,8 @@ using namespace e2sar; result getLBStatus(LBManager &lbman, const std::string &lbid) { std::cout << "Getting LB Status " << std::endl; 
- std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " on IP " << lbman.get_URI().get_cpAddr().value().first << std::endl; + std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " using address: " << + lbman.get_AddrString() << std::endl; std::cout << " LB Name: " << (lbman.get_URI().get_lbName().empty() ? "not set"s : lbman.get_URI().get_lbId()) << std::endl; std::cout << " LB ID: " << lbid << std::endl; @@ -49,7 +50,8 @@ result getLBStatus(LBManager &lbman, const std::string &lbid) result getLBOverview(LBManager &lbman) { std::cout << "Getting Overview " << std::endl; - std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " on IP " << lbman.get_URI().get_cpAddr().value().first << std::endl; + std::cout << " Contacting: " << lbman.get_URI().to_string(EjfatURI::TokenType::session) << " using address: " << + lbman.get_AddrString() << std::endl; auto res = lbman.overview(); @@ -101,7 +103,8 @@ int main(int argc, char **argv) opts("lbid,i", po::value(), "specify id of the loadbalancer as issued by reserve call instead of using what is in EJFAT_URI"); opts("root,o", po::value(), "root cert for SSL communications"); opts("novalidate,v", "don't validate server certificate (conflicts with 'root')"); - opts("ipv6,6", "prefer IPv6 control plane address if URI specifies hostname"); + opts("ipv6,6", "prefer IPv6 control plane address if URI specifies hostname (disables cert validation)"); + opts("ipv4,4", "prefer IPv4 control plane address if URI specifies hostname (disables cert validation)"); opts("uri,u", po::value(), "specify EJFAT_URI on the command-line instead of the environment variable"); opts("time,t", po::value(), "specify refresh time in ms (default is 5000ms)"); @@ -125,6 +128,11 @@ int main(int argc, char **argv) preferV6 = true; } + // if ipv4 or ipv6 requested explicitly + bool preferHostAddr = false; + if (vm.count("ipv6") || vm.count("ipv4")) + 
preferHostAddr = true; + std::string ejfat_uri; auto uri_r = (vm.count("uri") ? EjfatURI::getFromString(vm["uri"].as(), tt, preferV6) : EjfatURI::getFromEnv("EJFAT_URI"s, tt, preferV6)); @@ -135,7 +143,7 @@ int main(int argc, char **argv) } auto uri = uri_r.value(); - auto lbman = LBManager(uri); + auto lbman = LBManager(uri, true, preferHostAddr); if (vm.count("root") && !uri.get_useTls()) { @@ -151,14 +159,14 @@ int main(int argc, char **argv) std::cerr << "Unable to read server root certificate file "s; return -1; } - lbman = LBManager(uri, true, opts_res.value()); + lbman = LBManager(uri, true, preferHostAddr, opts_res.value()); } else { if (vm.count("novalidate")) { std::cerr << "Skipping server certificate validation" << std::endl; - lbman = LBManager(uri, false); + lbman = LBManager(uri, false, preferHostAddr); } } } diff --git a/docs b/docs index f6c7612a..cc336b83 160000 --- a/docs +++ b/docs @@ -1 +1 @@ -Subproject commit f6c7612a9449a5827d883cc1171fafb0d3876a19 +Subproject commit cc336b8346707de63fb93017ecfd43abb500789b diff --git a/include/e2sarCP.hpp b/include/e2sarCP.hpp index 74c341aa..ee275e29 100644 --- a/include/e2sarCP.hpp +++ b/include/e2sarCP.hpp @@ -120,6 +120,7 @@ namespace e2sar { private: EjfatURI _cpuri; + std::string addr_string; std::unique_ptr _stub; std::shared_ptr _channel; @@ -131,16 +132,22 @@ namespace e2sar * use makeSslOptions[FromFiles]() methods and pass the output to this constructor. 
* @param cpuri an EjfatURI object parsed from configuration data * @param validateServer if false, skip server certificate validation (useful for self-signed testing) + * @param useHostAddress even if hostname is provided, use host address as resolved by URI object (with preference for + * IPv4 by default or for IPv6 if explicitly requested) * @param opts grpc::SslCredentialsOptions containing some combination of server root certs, client key and client cert * use of SSL/TLS is governed by the URI scheme ('ejfat' vs 'ejfats') */ - LBManager(const EjfatURI &cpuri, bool validateServer = true, + LBManager(const EjfatURI &cpuri, bool validateServer = true, bool useHostAddress = false, grpc::SslCredentialsOptions opts = grpc::SslCredentialsOptions()) : _cpuri(cpuri) { auto cp_host_r = cpuri.get_cpHost(); - std::string addr_string; - if (!cp_host_r.has_error()) + + // using host address automatically disables cert validation + if (useHostAddress) + validateServer = false; + + if (!useHostAddress && !cp_host_r.has_error()) { // try hostname auto cp_host_v = cp_host_r.value(); @@ -153,7 +160,10 @@ namespace e2sar if (cp_addr_r.has_error()) throw E2SARException("Unable to initialize LBManager due to missing CP address in URI"); auto cp_addr_v = cp_addr_r.value(); - addr_string = cp_addr_v.first.to_string() + ":" + std::to_string(cp_addr_v.second); + if (cp_addr_v.first.is_v4()) + addr_string = "ipv4:///" + cp_addr_v.first.to_string() + ":" + std::to_string(cp_addr_v.second); + else + addr_string = "ipv6:///[" + cp_addr_v.first.to_string() + "]:" + std::to_string(cp_addr_v.second); } if (cpuri.get_useTls()) @@ -500,7 +510,7 @@ namespace e2sar */ static inline result makeSslOptions(const std::string &pem_root_certs, const std::string &pem_private_key, - const std::string &pem_cert_chain) + const std::string &pem_cert_chain) noexcept { return grpc::SslCredentialsOptions{std::move(pem_root_certs), std::move(pem_private_key), @@ -519,14 +529,24 @@ namespace e2sar static result 
makeSslOptionsFromFiles( std::string_view pem_root_certs, std::string_view pem_private_key, - std::string_view pem_cert_chain); + std::string_view pem_cert_chain) noexcept; /** * Generate gRPC-compliant custom SSL Options object with just the server root cert * @param pem_root_certs - The file name containing the PEM encoding of the server root certificate. */ static result makeSslOptionsFromFiles( - std::string_view pem_root_certs); + std::string_view pem_root_certs) noexcept; + + /** + * Return the address string used by gRPC to connect to control plane. Can be + * in the format of hostname:port or ipv4:///W.X.Y.Z:port or ipv6:///[XXXX::XX:XXXX]:port + * + * @return the string containing the address + */ + inline std::string get_AddrString() { + return addr_string; + } }; /** diff --git a/include/e2sarDPReassembler.hpp b/include/e2sarDPReassembler.hpp index 34af96ad..7ea217c6 100644 --- a/include/e2sarDPReassembler.hpp +++ b/include/e2sarDPReassembler.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -129,6 +130,8 @@ namespace e2sar std::atomic dataErrCnt{0}; // last e2sar error std::atomic lastE2SARError{E2SARErrorc::NoError}; + // a limited queue to push lost event numbers to + boost::lockfree::queue*> lostEventsQueue{20}; }; AtomicStats recvStats; @@ -137,16 +140,20 @@ namespace e2sar boost::lockfree::queue eventQueue{QSIZE}; std::atomic eventQueueDepth{0}; - // push event on the event queue - inline void enqueue(EventQueueItem* item) noexcept + // push event on the common event queue + // return 1 if event is lost, 0 on success + inline int enqueue(EventQueueItem* item) noexcept { + int ret = 0; if (eventQueue.push(item)) eventQueueDepth++; else - recvStats.enqueueLoss++; // event lost, queue was full + ret = 1; // event lost, queue was full // queue is lock free so we don't lock recvThreadCond.notify_all(); + return ret; } + // pop event off the event queue inline EventQueueItem* dequeue() noexcept { @@ -166,6 +173,9 @@ namespace e2sar 
const float Kp; // PID proportional const float Ki; // PID integral const float Kd; // PID derivative + const float weight; // processing power factor + const float min_factor; + const float max_factor; struct PIDSample { UnixTimeMicro_t sampleTime; // in usec since epoch float error; @@ -206,6 +216,8 @@ namespace e2sar // segments are transmitted, they are guarangeed to go // to the same port boost::unordered_map, EventQueueItem*, pair_hash, pair_equal> eventsInProgress; + // thread local instance of events we lost + boost::container::flat_set> lostEvents; // CPU core ids std::vector cpuCoreList; @@ -225,15 +237,29 @@ namespace e2sar result _close(); // thread loop void _threadBody(); + + // log a lost event via a set of known lost + // and add to lost queue for external inspection + inline void logLostEvent(std::pair evt) + { + if (lostEvents.contains(evt)) + return; + // this is thread-local + lostEvents.insert(evt); + // lockfree queue (only takes trivial types) + std::pair *evtPtr = new std::pair(evt.first, evt.second); + reas.recvStats.lostEventsQueue.push(evtPtr); + // this is atomic + reas.recvStats.enqueueLoss++; + } }; friend struct RecvThreadState; std::list recvThreadState; // receive related parameters const std::vector cpuCoreList; - const ip::address dataIP; // from URI - const u_int16_t dataPort; // starting receive port from URI - const bool dpV6; // prefer V6 over v4 address from URI + const ip::address dataIP; + const u_int16_t dataPort; const int portRange; // translates into 2^portRange - 1 ports we listen to const size_t numRecvThreads; const size_t numRecvPorts; @@ -274,14 +300,12 @@ namespace e2sar boost::thread threadObj; const u_int16_t period_ms; - // flags - const bool useV6; // UDP sockets int socketFd{0}; - inline SendStateThreadState(Reassembler &r, bool v6, u_int16_t period_ms): - reas{r}, period_ms{period_ms}, useV6{v6} + inline SendStateThreadState(Reassembler &r, u_int16_t period_ms): + reas{r}, period_ms{period_ms} {} // thread 
loop. all important behavior is encapsulated inside LBManager @@ -319,9 +343,8 @@ namespace e2sar public: /** * Structure for flags governing Reassembler behavior with sane defaults - * - dpV6 - prefer the IPv6 address/port in the URI data address. Reassembler will bind to IPv6 instead of IPv4 address. {false} - * - cpV6 - use IPv6 address if cp node specified by name and has IPv4 and IPv6 resolution {false} * - useCP - whether to use the control plane (sendState, registerWorker) {true} + * - useHostAddress - use IPv4 or IPv6 address for gRPC even if hostname is specified (disables cert validation) {false} * - period_ms - period of the send state thread in milliseconds {100} * - epoch_ms - period of one epoch in milliseconds {1000} * - Ki, Kp, Kd - PID gains (integral, proportional and derivative) {0., 0., 0.} @@ -336,12 +359,16 @@ namespace e2sar * - eventTimeout_ms - how long (in ms) we allow events to remain in assembly before we give up {500} * - rcvSocketBufSize - socket buffer size for receiving set via SO_RCVBUF setsockopt. Note * that this requires systemwide max set via sysctl (net.core.rmem_max) to be higher. 
{3MB} + * - weight - weight given to this node in terms of processing power + * - min_factor - multiplied with the number of slots that would be assigned evenly to determine min number of slots + * for example, 4 nodes with a minFactor of 0.5 = (512 slots / 4) * 0.5 = min 64 slots + * - max_factor - multiplied with the number of slots that would be assigned evenly to determine max number of slots + * for example, 4 nodes with a maxFactor of 2 = (512 slots / 4) * 2 = max 256 slots set to 0 to specify no maximum */ struct ReassemblerFlags { - bool dpV6; - bool cpV6; bool useCP; + bool useHostAddress; u_int16_t period_ms; bool validateCert; float Ki, Kp, Kd, setPoint; @@ -350,10 +377,16 @@ namespace e2sar bool withLBHeader; int eventTimeout_ms; int rcvSocketBufSize; - ReassemblerFlags(): dpV6{false}, cpV6{false}, useCP{true}, + float weight, min_factor, max_factor; + ReassemblerFlags(): useCP{true}, useHostAddress{false}, period_ms{100}, validateCert{true}, Ki{0.}, Kp{0.}, Kd{0.}, setPoint{0.}, epoch_ms{1000}, portRange{-1}, withLBHeader{false}, eventTimeout_ms{500}, - rcvSocketBufSize{1024*1024*3} {} + rcvSocketBufSize{1024*1024*3}, weight{1.0}, min_factor{0.5}, max_factor{2.0} {} + /** + * Initialize flags from an INI file + * @param iniFile - path to the INI file + */ + static result getFromINI(const std::string &iniFile) noexcept; }; /** * Create a reassembler object to run receive on a specific set of CPU cores @@ -362,20 +395,25 @@ namespace e2sar * the number of cores on the list. For the started receive threads affinity will be * set to these cores. 
* @param uri - EjfatURI with lb_id and instance token, so we can register a worker and then SendState + * @param data_ip - IP address (v4 or v6) on which we are listening + * @param starting_port - starting port number on which we are listening * @param cpuCoreList - list of core identifiers to be used for receive threads * @param rflags - optional ReassemblerFlags structure with additional flags */ - Reassembler(const EjfatURI &uri, std::vector cpuCoreList, + Reassembler(const EjfatURI &uri, ip::address data_ip, u_int16_t starting_port, + std::vector cpuCoreList, const ReassemblerFlags &rflags = ReassemblerFlags()); /** * Create a reassembler object to run on a specified number of receive threads * without taking into account thread-to-CPU and CPU-to-NUMA affinity. * @param uri - EjfatURI with lb_id and instance token, so we can register a worker and then SendState + * @param data_ip - IP address (v4 or v6) on which we are listening + * @param starting_port - starting port number on which we are listening * @param numRecvThreads - number of threads * @param rflags - optional ReassemblerFlags structure with additional flags */ - Reassembler(const EjfatURI &uri, size_t numRecvThreads = 1, - const ReassemblerFlags &rflags = ReassemblerFlags()); + Reassembler(const EjfatURI &uri, ip::address data_ip, u_int16_t starting_port, + size_t numRecvThreads = 1, const ReassemblerFlags &rflags = ReassemblerFlags()); Reassembler(const Reassembler &r) = delete; Reassembler & operator=(const Reassembler &o) = delete; ~Reassembler() @@ -398,14 +436,9 @@ namespace e2sar /** * Register a worker with the control plane * @param node_name - name of this node (any unique string) - * @param weight - weight given to this node in terms of processing power - * @param min_factor - multiplied with the number of slots that would be assigned evenly to determine min number of slots - * for example, 4 nodes with a minFactor of 0.5 = (512 slots / 4) * 0.5 = min 64 slots - * @param max_factor - 
multiplied with the number of slots that would be assigned evenly to determine max number of slots - * for example, 4 nodes with a maxFactor of 2 = (512 slots / 4) * 2 = max 256 slots set to 0 to specify no maximum * @return - 0 on success or an error condition */ - result registerWorker(const std::string &node_name, float weight, float min_factor, float max_factor) noexcept; + result registerWorker(const std::string &node_name) noexcept; /** * Deregister this worker @@ -460,6 +493,23 @@ namespace e2sar recvStats.lastE2SARError); } + /** + * Try to pop an event number of a lost event from the queue that stores them + * @return result with either (eventNumber,dataId) or E2SARErrorc::NotFound if queue is empty + */ + inline result> get_LostEvent() noexcept + { + std::pair *res = nullptr; + if (recvStats.lostEventsQueue.pop(res)) + { + auto ret{*res}; + delete res; + return ret; + } + else + return E2SARErrorInfo{E2SARErrorc::NotFound, "Lost event queue is empty"}; + } + /** * Get the number of threads this Reassembler is using */ diff --git a/include/e2sarDPSegmenter.hpp b/include/e2sarDPSegmenter.hpp index 91b5e407..0e555325 100644 --- a/include/e2sarDPSegmenter.hpp +++ b/include/e2sarDPSegmenter.hpp @@ -286,6 +286,11 @@ namespace e2sar SegmenterFlags(): dpV6{false}, zeroCopy{false}, connectedSocket{true}, useCP{true}, syncPeriodMs{1000}, syncPeriods{2}, mtu{1500}, numSendSockets{4}, sndSocketBufSize{1024*1024*3} {} + /** + * Initialize flags from an INI file + * @param iniFile - path to the INI file + */ + static result getFromINI(const std::string &iniFile) noexcept; }; /** * Initialize segmenter state. Call openAndStart() to begin operation. 
diff --git a/include/e2sarHeaders.hpp b/include/e2sarHeaders.hpp index 11692962..7952d92f 100644 --- a/include/e2sarHeaders.hpp +++ b/include/e2sarHeaders.hpp @@ -163,7 +163,7 @@ namespace e2sar struct REHdr re; } __attribute__((__packed__)); - constexpr u_int8_t synchdrVersion = 1; + constexpr u_int8_t synchdrVersion = 2; /** The Syncr Header. You should always use the provided methods to set and interrogate fields as the structure maintains Big-Endian order diff --git a/include/e2sarNetUtil.hpp b/include/e2sarNetUtil.hpp index f60df460..461aa64c 100644 --- a/include/e2sarNetUtil.hpp +++ b/include/e2sarNetUtil.hpp @@ -29,6 +29,11 @@ namespace e2sar * @return MTU or 1500 as the best guess */ static u_int16_t getMTU(const std::string &interfaceName); + /** + * Get the hostname of the host + */ + static result getHostName(); + #ifdef NETLINK_CAPABLE /** * Get the outgoing interface and its MTU for a given IPv4 or IPv6 diff --git a/include/e2sarUtil.hpp b/include/e2sarUtil.hpp index 6c9f8348..e33556bf 100644 --- a/include/e2sarUtil.hpp +++ b/include/e2sarUtil.hpp @@ -78,7 +78,7 @@ namespace e2sar * @param uri - the URI string * @param tt - convert to this token type (admin, instance, session) * @param preferV6 - when connecting to the control plane, prefer IPv6 address - * (defaults to v4) + * if the name resolves to both (defaults to v4) */ EjfatURI(const std::string &uri, TokenType tt=TokenType::admin, bool preferV6=false); diff --git a/meson.build b/meson.build index c4f25c5e..a817d36d 100644 --- a/meson.build +++ b/meson.build @@ -1,5 +1,5 @@ project('E2SAR', 'cpp', - version: '0.1.1', default_options : ['cpp_std=c++17']) + version: '0.1.2', default_options : ['cpp_std=c++17']) compiler = meson.get_compiler('cpp') diff --git a/reassembler_config.ini b/reassembler_config.ini new file mode 100644 index 00000000..de7938ce --- /dev/null +++ b/reassembler_config.ini @@ -0,0 +1,44 @@ +[general] +; whether to use the control plane (gRPC sendState, registerWorker) +useCP 
= true + +[control-plane] +; validate control plane TLS certificate in gRPC communications +validateCert = true +; force using address (v4 or v6) even if hostname specified in the URI +useHostAddress = false + +[data-plane] +; 2^portRange (0<=portRange<=14) listening ports will be open starting from dataPort. +; If -1, then the number of ports matches either the number of CPU cores or the number of threads. Normally +; this value is calculated based on the number of cores or threads requested, but +; it can be overridden here. Use with caution. +portRange = -1 +; expect LB header to be included (mainly for testing when withCP==false, +; as normally LB strips it off in normal operation) +withLBHeader = false +; how long (in ms) we allow events to remain in assembly before we give up +eventTimeoutMS = 500 +; socket buffer size for receiving set via SO_RCVBUF setsockopt. Note +; that this requires systemwide max set via sysctl (net.core.rmem_max) to be higher. +rcvSocketBufSize = 3145728 +; period of one epoch in milliseconds +epochMS = 1000 +; period of the send state thread in milliseconds +periodMS = 100 + +[pid] +; setPoint queue occupied percentage to which to drive the PID controller +setPoint = 0.0 +; PID gains (integral, proportional and derivative) +Ki = 0.0 +Kp = 0.0 +Kd = 0.0 +; schedule weight parameters +weight = 1.0 +; multiplied with the number of slots that would be assigned evenly to determine min number of slots +; for example, 4 nodes with a minFactor of 0.5 = (512 slots / 4) * 0.5 = min 64 slots +min_factor = 0.5 +; multiplied with the number of slots that would be assigned evenly to determine max number of slots +; for example, 4 nodes with a maxFactor of 2 = (512 slots / 4) * 2 = max 256 slots set to 0 to specify no maximum +max_factor = 2.0 diff --git a/scripts/notebooks/EJFAT/E2SAR-development-tester.ipynb b/scripts/notebooks/EJFAT/E2SAR-development-tester.ipynb index 4b7f575b..65076414 100644 --- 
a/scripts/notebooks/EJFAT/E2SAR-development-tester.ipynb +++ b/scripts/notebooks/EJFAT/E2SAR-development-tester.ipynb @@ -54,7 +54,7 @@ "\n", "# branches for UDPLBd and E2SAR that we want checked out on the VMs\n", "udplbd_branch = 'develop'\n", - "e2sar_branch = 'main'\n", + "e2sar_branch = 'e2sar-perf-with-cp'\n", "\n", "# which of the available config files to use with UDPLBd\n", "udplbd_config = 'lb_mock-tls.yml'\n", @@ -104,6 +104,8 @@ "from ipaddress import ip_address, IPv4Address, IPv6Address, IPv4Network, IPv6Network\n", "import ipaddress\n", "\n", + "import json\n", + "\n", "fablib = fablib_manager() \n", "fablib.show_config();\n", "\n", @@ -142,7 +144,8 @@ "# this is the NIC to use\n", "nic_model = 'NIC_Basic'\n", "# the subnet should match IPs\n", - "subnet = IPv4Network(\"192.168.1.0/24\")\n", + "subnet_str = \"192.168.0.0/24\" \n", + "subnet = IPv4Network(subnet_str)\n", "\n", "def execute_single_node(node, commands):\n", " for command in commands:\n", @@ -157,7 +160,32 @@ " for n in node:\n", " execute_single_node(n, commands)\n", " else:\n", - " execute_single_node(node, commands)\n" + " execute_single_node(node, commands)\n", + "\n", + "# until fablib fixes this\n", + "def get_management_os_interface(node) -> str or None:\n", + " \"\"\"\n", + " Gets the name of the management interface used by the node's\n", + " operating system. 
\n", + "\n", + " :return: interface name\n", + " :rtype: String\n", + " \"\"\"\n", + " stdout, stderr = node.execute(\"sudo ip -j route list\", quiet=True)\n", + " stdout_json = json.loads(stdout)\n", + "\n", + " for i in stdout_json:\n", + " if i[\"dst\"] == \"default\":\n", + " return i[\"dev\"]\n", + "\n", + " stdout, stderr = node.execute(\"sudo ip -6 -j route list\", quiet=True)\n", + " stdout_json = json.loads(stdout)\n", + "\n", + " for i in stdout_json:\n", + " if i[\"dst\"] == \"default\":\n", + " return i[\"dev\"]\n", + "\n", + " return None\n" ] }, { @@ -493,7 +521,7 @@ " f\"cd E2SAR/build; EJFAT_URI='ejfats://udplbd@{cpnode_addr}:18347/lb/1?data=127.0.0.1&sync=192.168.88.199:1234' PATH=$HOME/.local/bin:/usr/local/bin:$PATH LD_LIBRARY_PATH=/usr/local/lib/ meson test {e2sar_test_suite} --suite live --timeout 0 -j 1\"\n", "]\n", " \n", - "execute_commands([sender], commands)" + "execute_commands([sender, recver], commands)" ] }, { @@ -630,7 +658,7 @@ " f\"sudo sysctl net.core.wmem_max=536870912\",\n", " f\"sysctl net.core.wmem_max net.core.rmem_max\"\n", "]\n", - "execute_commands([sender, recver], commands);" + "execute_commands([sender, recver], commands)" ] }, { @@ -653,6 +681,32 @@ "stdout, stderr = sender.execute(f\"sudo ping -f -s 8972 -c 10 -M do {recver_addr}\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "725b9489-2ddb-4f94-b761-6e2eafc7f876", + "metadata": {}, + "outputs": [], + "source": [ + "# We need to setup the firewall to allow traffic to pass to the receiver\n", + "\n", + "mgmt_iface_name = get_management_os_interface(recver)\n", + "data_iface = recver.get_interface(network_name=net_name)\n", + "data_iface_name = data_iface.get_os_interface()\n", + "\n", + "print(f'Adding {mgmt_iface_name} and lo and data interface to trusted zone')\n", + "commands = [\n", + " f'sudo firewall-cmd --permanent --zone=trusted --add-interface={data_iface_name}',\n", + " f'sudo firewall-cmd --permanent --zone=trusted 
--add-interface=lo',\n", + " f'sudo firewall-cmd --permanent --zone=trusted --add-interface={mgmt_iface_name}',\n", + " f'for i in $(sudo firewall-cmd --zone=public --list-services); do sudo firewall-cmd --zone=public --permanent --remove-service=$i; done',\n", + "]\n", + "commands.append(f'sudo firewall-cmd --reload')\n", + "commands.append(f'sudo firewall-cmd --list-all --zone=public')\n", + "\n", + "execute_commands([recver], commands)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -672,8 +726,8 @@ "numEvents = 10000 # number of events to send\n", "bufSize = 300 * 1024 * 1024 # 100MB send and receive buffers\n", "\n", - "recv_command = f\"cd E2SAR; PATH=$HOME/.local/bin:/usr/local/bin:$PATH LD_LIBRARY_PATH=/usr/local/lib/ ./build/bin/e2sar_perf -r -u '{e2sarPerfURI}' -d {recverDuration} -b {bufSize}\"\n", - "send_command = f\"cd E2SAR; PATH=$HOME/.local/bin:/usr/local/bin:$PATH LD_LIBRARY_PATH=/usr/local/lib/ ./build/bin/e2sar_perf -s -u '{e2sarPerfURI}' --mtu {mtu} --rate {rate} --length {length} -n {numEvents} -b {bufSize}\"\n", + "recv_command = f\"cd E2SAR; PATH=$HOME/.local/bin:/usr/local/bin:$PATH LD_LIBRARY_PATH=/usr/local/lib/ ./build/bin/e2sar_perf -r -u '{e2sarPerfURI}' -d {recverDuration} -b {bufSize} --ip {recver_addr} --port 19522\"\n", + "send_command = f\"cd E2SAR; PATH=$HOME/.local/bin:/usr/local/bin:$PATH LD_LIBRARY_PATH=/usr/local/lib/ ./build/bin/e2sar_perf -s -u '{e2sarPerfURI}' --mtu {mtu} --rate {rate} --length {length} -n {numEvents} -b {bufSize} --ip {sender_addr}\"\n", "\n", "# start the receiver for 10 seconds and log its output\n", "print(f'Executing command {recv_command} on receiver')\n", @@ -761,7 +815,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.8" + "version": "3.12.3" } }, "nbformat": 4, diff --git a/scripts/notebooks/EJFAT/E2SAR-live-lb-tester.ipynb b/scripts/notebooks/EJFAT/E2SAR-live-lb-tester.ipynb new file mode 100644 index 00000000..1987d815 --- 
/dev/null +++ b/scripts/notebooks/EJFAT/E2SAR-live-lb-tester.ipynb @@ -0,0 +1,1253 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "80a4ca6f-ab03-4de5-8488-8b67ba0ec4d1", + "metadata": {}, + "source": [ + "# Live LB on FABRIC \n", + "\n", + "This notebook helps sets up a sender and multiple receiver nodes on FABRIC such that they can communicate with the production LB. One of the nodes is designated as a sender and the rest as worker-receivers.\n", + "\n", + "See the following diagram:\n", + "\n", + "
\n", + " \n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "id": "ce5948ab-7637-4fd7-a6c1-0c31e555928e", + "metadata": {}, + "source": [ + "## Preamble\n", + "\n", + "This code should *always* be executed regardless of whether you are starting a new slice or returning to an existing slice." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d876e2c-21b6-4ac0-9c56-eb04771da7c5", + "metadata": {}, + "outputs": [], + "source": [ + "#\n", + "# EDIT THIS\n", + "#\n", + "\n", + "# GitHub SSH key file (private) registered using the GitHubSSH.ipynb notebook referenced above\n", + "github_key = '/home/fabric/work/fabric_confi/github_ecdsa'\n", + "\n", + "# Note for best management network IPv4 connectivity pick from\n", + "# 'UCSD', 'SRI', 'FIU' or 'TOKY' - these sites have\n", + "# IPv4. Other sites use IPv6 management and have trouble\n", + "# retrieving git-lfs artifacts.\n", + "\n", + "# ESnet-FABRIC gateway is at STAR, so the closer we are to it, the lower\n", + "# the latency and loss.\n", + "\n", + "# site_list_override = None\n", + "\n", + "# if you want to force a site list instead of using random\n", + "#site_list_override = ['SRI', 'UCSD', 'CLEM']\n", + "\n", + "# (super)core sites - should be low loss\n", + "site_list_override = ['STAR', 'SALT', 'KANS', 'NEWY', 'WASH', 'LOSA', 'DALL', 'ATLA']\n", + "\n", + "# grouped around STAR with optical connections to the backbone - should be low loss\n", + "#site_list_override = ['STAR', 'INDI', 'NCSA', 'MICH']\n", + "\n", + "# high capacity sites (may have losses at high bandwidth)\n", + "# site_list_override = ['STAR', 'INDI', 'NCSA', 'TACC', 'UCSD', 'PSC']\n", + "\n", + "# these we always exclude\n", + "site_exclude_list = ['EDUKY', 'EDC']\n", + "\n", + "# how many workers do we want? 
(in addition to one sender)\n", + "number_of_workers = 3\n", + "\n", + "# base distro 'ubuntu2[012]' or 'rocky[89]'\n", + "distro_name = 'ubuntu22'\n", + "distro_version = distro_name[-2:]\n", + "\n", + "# map from distro to image name\n", + "images = {\n", + " 'ubuntu20': 'default_ubuntu_20',\n", + " 'ununtu21': 'default_ubuntu_21',\n", + " 'ubuntu22': 'default_ubuntu_22',\n", + " 'rocky8': 'default_rocky_8',\n", + " 'rocky9': 'default_rocky_9',\n", + "}\n", + "\n", + "# note that the below is distribution specific ('ubuntu' for ubuntu and so on)\n", + "home_location = {\n", + " 'ubunt': '/home/ubuntu',\n", + " 'rocky' : '/home/rocky'\n", + "}[distro_name[:5]]\n", + "\n", + "vm_key_location = f'{home_location}/.ssh/github_ecdsa'\n", + "\n", + "# worker dimensions\n", + "node_attribs = {\n", + " 'cores': 8,\n", + " 'disk': 100,\n", + " 'ram': 24,\n", + " 'image': images[distro_name]\n", + "}\n", + "\n", + "# slice name\n", + "slice_name = f'{number_of_workers + 1}-node LB Tester Slice using {distro_name}'\n", + "\n", + "# these are subnets we want to be able to route to/from\n", + "# The list has the form ['192.168.100.0/24', '10.100.1.0/24']\n", + "external_subnets = []\n", + "\n", + "# these are the lists of destination ports we allow to be open on the FABNet interface\n", + "# for incoming traffic from different subnets. The dictionary has the form\n", + "# { '192.168.100.0/24': [22, 443] } - the key is the source subnet and the value\n", + "# is a list of destination ports allowed from that subnet\n", + "open_ports = {\n", + "}\n", + "\n", + "# additional accounts and their public keys - they get sudo rights and docker,\n", + "# their public keys are expected to reside under ssh-keys/ in a file\n", + "# named after the account.\n", + "# The list has the form of ['user1', 'user2'] where user1 and user2 accounts\n", + "# will be created on the system. 
Under ssh-keys/ there should be two files\n", + "# named 'user1' and 'user2' each containing the SSH public key for that user. \n", + "accounts = []\n", + "\n", + "# url of e2sar deps. Find the appropriate version for the OS at https://github.com/JeffersonLab/E2SAR/releases\n", + "e2sar_branch = \"e2sar-perf-with-cp\"\n", + "static_release_url = 'https://github.com/JeffersonLab/E2SAR/releases/download/' # don't need to change this\n", + "e2sar_dep_artifcat = 'e2sar-deps_0.1.1_amd64.deb'\n", + "e2sar_release_ver = 'E2SAR-0.1.1'\n", + "e2sar_dep_url = static_release_url + e2sar_release_ver + \"-\" + distro_name[:-2] + \"-\" + distro_version + \".04/\" + e2sar_dep_artifcat\n", + "\n", + "#\n", + "# SHOULDN'T NEED TO EDIT BELOW\n", + "#\n", + "# Preamble\n", + "import json\n", + "import time\n", + "from datetime import datetime\n", + "from datetime import timezone\n", + "from datetime import timedelta\n", + "\n", + "from fabrictestbed_extensions.fablib.fablib import FablibManager as fablib_manager\n", + "\n", + "from ipaddress import ip_address, IPv4Address, IPv6Address, IPv4Network, IPv6Network\n", + "import ipaddress\n", + "\n", + "fablib = fablib_manager() \n", + "fablib.show_config();\n", + "\n", + "# gets prepended to site name - this network is per site\n", + "net_name_prefix = 'fabnetv4ext'\n", + "\n", + "# this is the NIC to use\n", + "nic_model = 'NIC_Basic'\n", + "\n", + "def execute_single_node(node, commands):\n", + " for command in commands:\n", + " print(f'\\tExecuting \"{command}\" on node {node.get_name()}')\n", + " #stdout, stderr = node.execute(command, quiet=True, output_file=node.get_name() + '_install.log')\n", + " stdout, stderr = node.execute(command)\n", + " if not stderr and len(stderr) > 0:\n", + " print(f'Error encountered with \"{command}\": {stderr}')\n", + " \n", + "def execute_commands(node, commands):\n", + " if isinstance(node, list):\n", + " for n in node:\n", + " execute_single_node(n, commands)\n", + " else:\n", + " 
execute_single_node(node, commands)\n", + "\n", + "def execute_single_node_on_thread(node, commands):\n", + " # concatenate the commands using ';' and execute\n", + " allcommands = ';'.join(commands)\n", + " node.execute_thread(allcommands, output_file=node.get_name() + '_thread.log')\n", + "\n", + "def execute_commands_on_threads(node, commands):\n", + " if isinstance(node, list):\n", + " for n in node:\n", + " execute_single_node_on_thread(n, commands)\n", + " else:\n", + " execute_single_node_on_thread(node, commands)\n", + "\n", + "def make_node_name(site_name, node_idx):\n", + " return '_'.join([f\"Worker{node_idx}\", site_name])\n", + "\n", + "def make_net_name(site_name):\n", + " return '_'.join([net_name_prefix, site_name])\n", + "\n", + "# return slice with one node on one site\n", + "def starter_slice(site_name):\n", + " #node_name = make_node_name(site_name, 1)\n", + " node_name = '_'.join([\"Sender\", site_name])\n", + " net_name = make_net_name(site_name)\n", + "\n", + " slice = fablib.new_slice(name=slice_name)\n", + " node = slice.add_node(name=node_name, site=site_name, **node_attribs)\n", + "\n", + " # postboot configuration is under 'post-boot' directory\n", + " node.add_post_boot_upload_directory('post-boot','.')\n", + " node.add_post_boot_execute(f'chmod +x post-boot/sender.sh && ./post-boot/sender.sh')\n", + " \n", + " # attach to network\n", + " nic_interface = node.add_component(model=nic_model, name='_'.join([node_name, nic_model, 'nic'])).get_interfaces()[0]\n", + " net = slice.add_l3network(name=net_name, interfaces=[nic_interface], type='IPv4Ext')\n", + "\n", + " return slice\n", + "\n", + "def add_node_to_slice(site_name, node_idx, inc, slice):\n", + "\n", + " net_name = make_net_name(site_name)\n", + "\n", + " while inc > 0:\n", + " node_name = make_node_name(site_name, node_idx)\n", + " node_idx += 1\n", + " \n", + " node = slice.add_node(name=node_name, site=site_name, **node_attribs)\n", + " \n", + " # postboot configuration is under 
'post-boot' directory\n", + " node.add_post_boot_upload_directory('post-boot','.')\n", + " node.add_post_boot_execute(f'chmod +x post-boot/recver.sh && ./post-boot/recver.sh')\n", + " \n", + " nic_interface = node.add_component(model=nic_model, name='_'.join([node_name, nic_model, 'nic'])).get_interfaces()[0]\n", + " \n", + " # attach to a network, create network if needed\n", + " net = slice.get_network(name=net_name)\n", + " if net is None:\n", + " net = slice.add_l3network(name=net_name, type='IPv4Ext')\n", + " \n", + " net.add_interface(nic_interface)\n", + " inc -= 1\n", + "\n", + " return None\n", + "\n", + "def check_modify(slice, selected_site_list, nodes_in_slice, expected_to_add):\n", + "\n", + " success = True\n", + " idx = 1\n", + " while(expected_to_add >= idx):\n", + " # find sliver reservation for new node\n", + " node_sliver = slice.list_slivers(fields=['name', 'state'], \n", + " filter_function=lambda x: x['type'] == 'node' and \n", + " x['name'] == make_node_name(selected_site_list[0], nodes_in_slice + idx) and \n", + " x['state'] == 'Active')\n", + " # if it is none - it failed\n", + " if node_sliver is None:\n", + " success = False\n", + " break\n", + " else:\n", + " idx += 1\n", + "\n", + " return success\n", + "\n", + "# until fablib fixes this\n", + "def get_management_os_interface(node) -> str or None:\n", + " \"\"\"\n", + " Gets the name of the management interface used by the node's\n", + " operating system. 
\n", + "\n", + " :return: interface name\n", + " :rtype: String\n", + " \"\"\"\n", + " stdout, stderr = node.execute(\"sudo ip -j route list\", quiet=True)\n", + " stdout_json = json.loads(stdout)\n", + "\n", + " for i in stdout_json:\n", + " if i[\"dst\"] == \"default\":\n", + " return i[\"dev\"]\n", + "\n", + " stdout, stderr = node.execute(\"sudo ip -6 -j route list\", quiet=True)\n", + " stdout_json = json.loads(stdout)\n", + "\n", + " for i in stdout_json:\n", + " if i[\"dst\"] == \"default\":\n", + " return i[\"dev\"]\n", + "\n", + " return None" + ] + }, + { + "cell_type": "markdown", + "id": "6c66e2c1-84d0-4228-b552-601b2909c237", + "metadata": {}, + "source": [ + "## Helpers\n", + "\n", + "If you ever forget which images are available, run this cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38130b75-8dc8-43c8-83cb-497a4768fc53", + "metadata": {}, + "outputs": [], + "source": [ + "# List available images (this step is optional)\n", + "available_images = fablib.get_image_names()\n", + "\n", + "print(f'Available images are: {available_images}')" + ] + }, + { + "cell_type": "markdown", + "id": "fbd114ff-be4f-4be6-8326-e7b088a27e3c", + "metadata": {}, + "source": [ + "## Prepare to create a new slice (skip if exists)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec766700-8245-4e4e-bf15-9630f4600456", + "metadata": {}, + "outputs": [], + "source": [ + "# list all slices I have running\n", + "output_dataframe = fablib.list_slices(output='pandas')\n", + "if output_dataframe:\n", + " print(output_dataframe)\n", + "else:\n", + " print('No active slices under this project')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c4249e1-42aa-4575-8053-0edf4d95c3fd", + "metadata": {}, + "outputs": [], + "source": [ + "# Identify sites in continental US we want to use (NOOP if override is set)\n", + "lon_west=-124.3993243\n", + "lon_east=-69.9721573\n", + "candidate_sites = 7\n", + 
"free_nodes_worth = 3 # how many nodes worth are we looking per site\n", + "\n", + "# get a list of random sites, avoiding thos on the exclude list\n", + "# unless there is an override\n", + "if site_list_override is None:\n", + " selected_site_list = fablib.get_random_sites(count=candidate_sites, avoid=site_exclude_list,\n", + " filter_function=lambda x: x['location'][1] < lon_east\n", + " and x['location'][1] > lon_west \n", + " and x['cores_available'] > free_nodes_worth * node_attribs['cores']\n", + " and x['ram_available'] > free_nodes_worth * node_attribs['ram'] \n", + " and x['disk_available'] > free_nodes_worth * node_attribs['disk']) \n", + "else:\n", + " selected_site_list = site_list_override\n", + "\n", + "if selected_site_list:\n", + " print(f'Selected sites are {selected_site_list}')\n", + "else:\n", + " print('Unable to find a sites matching the requirements')\n" + ] + }, + { + "cell_type": "markdown", + "id": "e7ebd074-4107-483e-b369-79ccd1f99184", + "metadata": {}, + "source": [ + "## Create slice iteratively (skip if exists)\n", + "\n", + "We may or may not get all the nodes we want immediately - we use iteration with slice modify to get to the max/desired number of nodes across the selected sites." 
+ ] + }, + { + "cell_type": "markdown", + "id": "2e9bb82e-6db0-472a-aa3d-66cba06c3d08", + "metadata": {}, + "source": [ + "### Create Starter Slice" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "332540f0-89f5-4038-a28c-d02a454215c4", + "metadata": {}, + "outputs": [], + "source": [ + "# we start by establishing a slice with one sender node at some site, we keep track which sites we failed \n", + "# and don't try those again\n", + "\n", + "keep_trying = True\n", + "succeeded = False\n", + "\n", + "site_list_iter = iter(selected_site_list)\n", + "failed_sites = {}\n", + "site_name = None\n", + "\n", + "while keep_trying:\n", + "\n", + " try:\n", + " site_name = next(site_list_iter)\n", + " print(f'Trying site {site_name} from {selected_site_list}')\n", + " \n", + " # define a starter slice\n", + " slice = starter_slice(site_name)\n", + "\n", + " print(f'Submitting starter slice \"{slice_name}\" with sender on site {site_name}')\n", + " slice_id = slice.submit()\n", + "\n", + " # check the state of this slice\n", + " slices = fablib.get_slices(excludes=[], slice_id=slice_id)\n", + " if slices[0].get_state() == 'Dead':\n", + " print(f'Failed on site {site_name}, proceeding')\n", + " else:\n", + " print(f'Succeeded on site {site_name} with state {slices[0].get_state()}')\n", + " keep_trying = False\n", + " succeeded = True\n", + " except StopIteration: \n", + " print('No more sites to look at, exiting')\n", + " keep_trying = False\n", + " except Exception as e:\n", + " print(f'Unexpected exception {e}, exiting')\n", + " keep_trying = False\n", + "\n", + "if succeeded:\n", + " print(f'Succeeded in creating a slice on {site_name}, will avoid sites {failed_sites}')\n", + " selected_site_list = list(filter(lambda x: x not in failed_sites, selected_site_list))\n", + " print(f'Proceeding with sites {selected_site_list}')" + ] + }, + { + "cell_type": "markdown", + "id": "c50070d1-b89f-427e-9a5d-7af7605c8610", + "metadata": {}, + "source": [ + "### 
Modify the Slice to add Workers" + ] + }, + { + "cell_type": "markdown", + "id": "b9270746-f34e-44d6-9c0a-ecc05542a9d0", + "metadata": {}, + "source": [ + "Now that the base with the sender slice is created we will iteratively add workers on sites one at a time using first-fit policy until we get to the desired number of workers or run out of sites." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "341fd051-3dee-4c0c-8df8-f4222f1068fd", + "metadata": {}, + "outputs": [], + "source": [ + "remaining_workers = number_of_workers\n", + "node_idx = 1\n", + "node_increment = 3\n", + "nodes_in_slice = 0 # we don't count sender in this case\n", + "\n", + "while remaining_workers > 0 and len(selected_site_list) > 0:\n", + " slice = fablib.get_slice(name=slice_name)\n", + " \n", + " try:\n", + " site_name = selected_site_list[0]\n", + " print(f'There are {remaining_workers} remaining workers to create. Trying site {site_name} from {selected_site_list}')\n", + " expected_to_add = node_increment if remaining_workers >= node_increment else remaining_workers\n", + " add_node_to_slice(site_name, node_idx, expected_to_add, slice)\n", + " \n", + " print(f'Submitting slice modification to \"{slice_name}\" to add {expected_to_add} nodes for site {site_name}')\n", + " slice_id = slice.modify()\n", + " \n", + " # check the state of this slice\n", + " slice = fablib.get_slice(name=slice_name)\n", + "\n", + " if check_modify(slice, selected_site_list, nodes_in_slice, expected_to_add):\n", + " print(f'Succeeded adding {expected_to_add} nodes on site {site_name}.')\n", + " # successfully provisioned\n", + " node_idx += expected_to_add\n", + " remaining_workers -= expected_to_add\n", + " nodes_in_slice += expected_to_add\n", + " else:\n", + " print(f'Failed to provision on site {site_name}.')\n", + " # this site is full, moving on\n", + " selected_site_list.remove(site_name) \n", + " except Exception as e:\n", + " remaining_workers = -1\n", + " print(f'Unexpected 
exception {e}, exiting')\n", + " break\n", + "\n", + "if remaining_workers == 0:\n", + " print('Succeeded in creating all workers')\n", + "else:\n", + " print(f'Unable to create {remaining_workers}')\n" + ] + }, + { + "cell_type": "markdown", + "id": "3fa05d6d-ac6d-47f0-b351-4401b14c05c3", + "metadata": {}, + "source": [ + "## Get Slice Details (always execute)\n", + "\n", + "The following code sets up data structures so all the follow up cells work properly. Execute it regardless of whether you just created the slice or coming back to an existing slice." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6698360b-1118-4fa2-ace0-09bdf3f99a1f", + "metadata": {}, + "outputs": [], + "source": [ + "def find_net(net_list, name):\n", + " for net in net_list:\n", + " if net.get_name() == name:\n", + " return net\n", + " return None\n", + "\n", + "# get slice details \n", + "slice = fablib.get_slice(name=slice_name)\n", + "\n", + "a = slice.show()\n", + "nets = slice.list_networks()\n", + "nodes = slice.list_nodes()\n", + "\n", + "# arrange nodes and network services by site for future convenience\n", + "net_objects = slice.get_networks()\n", + "node_objects = slice.get_nodes()\n", + "available_ip_cnt = 10\n", + "\n", + "slivers_by_site = dict()\n", + "\n", + "print('Arranging nodes and networks by site and getting available IP addresses')\n", + "for node in node_objects:\n", + " node_site = node.get_site()\n", + " if not slivers_by_site.get(node_site):\n", + " slivers_by_site[node_site] = dict()\n", + " slivers_by_site[node_site]['nodes'] = set()\n", + " slivers_by_site[node_site]['net'] = find_net(net_objects, make_net_name(node_site))\n", + " slivers_by_site[node_site]['nodes'].add(node)\n", + "\n", + "print('Listing public IP addresses per service')\n", + "for net in net_objects:\n", + " print(f'{net.get_name()} has {net.get_public_ips()}')\n" + ] + }, + { + "cell_type": "markdown", + "id": "0f34ea09-c6bb-4bed-b831-35e0f5cf65e8", + "metadata": {}, 
+ "source": [ + "## Perform Hardening and Network Configuration Opening to Outside World" + ] + }, + { + "cell_type": "markdown", + "id": "0f49f98c-d88f-4622-b7da-aa4cf797b100", + "metadata": {}, + "source": [ + "### Set up routing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d92b9b99-defb-4102-a412-fd388763d19d", + "metadata": {}, + "outputs": [], + "source": [ + "# allocate externally routable IP addresses in each site network services\n", + "# it is NORMAL to see 'IP addresses were updated due to conflicts'\n", + "for site_name, site_slivers in slivers_by_site.items():\n", + " print(f'Processing {site_name}')\n", + " site_net = site_slivers['net']\n", + " site_nodes = site_slivers['nodes']\n", + " site_slivers['ips'] = site_net.get_available_ips(count=len(site_nodes))\n", + " print(f'Requesting available IPs to be publicly routable: {site_slivers[\"ips\"]}')\n", + " site_net.make_ip_publicly_routable(ipv4=[str(x) for x in site_slivers['ips']])\n", + "\n", + "slice.submit()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34cc4504-46c7-4807-92cd-28053fc63a11", + "metadata": {}, + "outputs": [], + "source": [ + "# get slice details \n", + "slice = fablib.get_slice(name=slice_name)\n", + "\n", + "# check the results\n", + "for site_name, site_slivers in slivers_by_site.items():\n", + " print(f'Processing {site_name}')\n", + " site_net = site_slivers['net']\n", + " site_nodes = site_slivers['nodes']\n", + " print(f'Public IPs are: {site_net.get_public_ips()}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bafe7c3d-2d25-47b6-9403-c9b01b4492fb", + "metadata": {}, + "outputs": [], + "source": [ + "# configure node interfaces with these IP addresses\n", + "for site_name, site_slivers in slivers_by_site.items():\n", + " print(f'Processing {site_name}')\n", + " site_net = site_slivers['net']\n", + " site_nodes = site_slivers['nodes']\n", + " site_addrs = site_net.get_public_ips()\n", + " for 
node, addr in zip(site_nodes, site_addrs):\n", + " print(f' Adding address {addr} to node {node.get_name()} in subnet {site_net.get_subnet()}')\n", + " # make sure the interface is UP (in rare cases comes up in DOWN state)\n", + " node_iface = node.get_interface(network_name = site_net.get_name())\n", + " execute_single_node(node, [f'sudo ip link set {node_iface.get_os_interface()} up'])\n", + " node_iface.ip_addr_add(addr=addr, subnet=site_net.get_subnet())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c5a0e5c-f332-42be-9877-ee385eea2128", + "metadata": {}, + "outputs": [], + "source": [ + "# configure inter-site routing if you have multiple sites\n", + "for site_name_from, site_slivers_from in slivers_by_site.items():\n", + " for site_name_to, site_slivers_to in slivers_by_site.items():\n", + " if site_name_from == site_name_to:\n", + " continue\n", + " # make sure nodes in site_name_from have a route to site_name_to subnet\n", + " subnet = site_slivers_to['net'].get_subnet()\n", + " gateway = site_slivers_from['net'].get_gateway()\n", + " for node in site_slivers_from['nodes']:\n", + " print(f'Setting up route to {subnet} via {gateway} on node {node.get_name()}')\n", + " node.ip_route_add(subnet=subnet, gateway=gateway)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7808808c-6630-4368-8314-d10779781427", + "metadata": {}, + "outputs": [], + "source": [ + "# configure global routing to indicated subnets \n", + "for site_name, site_slivers in slivers_by_site.items():\n", + " gateway = site_slivers['net'].get_gateway()\n", + " for node in site_slivers['nodes']:\n", + " print(f'Setting up routes on {node.get_name()}')\n", + " for subnet in external_subnets:\n", + " print(f'Setting up route to {subnet} via {gateway} on node {node.get_name()}')\n", + " execute_single_node(node, [f'sudo ip route add {subnet} via {gateway}'])" + ] + }, + { + "cell_type": "markdown", + "id": "d188265a-7204-493c-acf8-4cdf34f10252", + 
"metadata": {}, + "source": [ + "### Setup Firewall (assuming firewalld is used regardless of distro)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fee04f45-5f8f-4d08-a3eb-177d79c920f2", + "metadata": {}, + "outputs": [], + "source": [ + "# walk the nodes, add lo and management interface to 'trusted' zone where everything is allowed\n", + "# add dataplane interface into 'public' zone where only 'open ports' from specific sources is allowed\n", + "\n", + "for site_name, site_slivers in slivers_by_site.items():\n", + " site_net = site_slivers['net']\n", + " for node in site_slivers['nodes']:\n", + " print(f'Setting up firewalld on node {node.get_name()}')\n", + " # note we are calling our own function - as of 1.7.0 fablib's node.get_management_os_interface()\n", + " # has a bug where it doesn't find management interface on IPv6 sites\n", + " mgmt_iface_name = get_management_os_interface(node)\n", + " if mgmt_iface_name is None:\n", + " print('Unable to determine management interface, skipping')\n", + " continue\n", + " data_iface = node.get_interface(network_name=site_net.get_name())\n", + " data_iface_name = data_iface.get_os_interface()\n", + " print(f' Adding {mgmt_iface_name} and lo to trusted zone and {data_iface_name} to public zone')\n", + " commands = [\n", + " f'sudo firewall-cmd --permanent --zone=public --add-interface={data_iface_name}',\n", + " f'sudo firewall-cmd --permanent --zone=trusted --add-interface=lo',\n", + " f'sudo firewall-cmd --permanent --zone=trusted --add-interface={mgmt_iface_name}',\n", + " f'for i in $(sudo firewall-cmd --zone=public --list-services); do sudo firewall-cmd --zone=public --permanent --remove-service=$i; done',\n", + " ]\n", + " for subnet, portlist in open_ports.items():\n", + " for port in portlist:\n", + " commands.append(f'sudo firewall-cmd --permanent --zone=public --add-rich-rule=\\'rule family=\\\"ipv4\\\" source address=\\\"{subnet}\\\" port protocol=\\\"tcp\\\" port=\\\"{port}\\\" 
accept\\'')\n", + " for subnet in external_subnets:\n", + " commands.append(f'sudo firewall-cmd --permanent --zone=public --add-rich-rule=\\'rule family=\\\"ipv4\\\" source address=\\\"{subnet}\\\" protocol value=\\\"udp\\\" accept\\'')\n", + " commands.append(f'sudo firewall-cmd --reload')\n", + " commands.append(f'sudo firewall-cmd --list-all --zone=public')\n", + " execute_single_node(node, commands)\n", + " \n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "45c614a4-b6a2-48bb-9f6e-4ce9b9677ddd", + "metadata": {}, + "source": [ + "## Tune Buffers and MTUs\n", + "\n", + "In order to have good performance we need to\n", + "- Make the UDP send/receive socket buffer size limit larger (applications are assumed to know how to make their buffers larger up to this limit)\n", + "- Set MTU to 9k and test with DF=0 ping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72c91079-9ce3-4d73-b945-182ffe85dd5a", + "metadata": {}, + "outputs": [], + "source": [ + "# setup UDP socket buffer sizes to 512M\n", + "commands = [\n", + " f\"sudo sysctl net.core.rmem_max=536870912\",\n", + " f\"sudo sysctl net.core.wmem_max=536870912\",\n", + " f\"sysctl net.core.wmem_max net.core.rmem_max\"\n", + "]\n", + "# walk the nodes\n", + "for site_name, site_slivers in slivers_by_site.items():\n", + " for node in site_slivers['nodes']:\n", + " execute_single_node(node, commands)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0472d539-252c-4d50-90f8-bb7f9cdd1beb", + "metadata": {}, + "outputs": [], + "source": [ + "# set 9k MTU on dataplane interfaces\n", + "mtu=9000\n", + "\n", + "for site_name, site_slivers in slivers_by_site.items():\n", + " site_net = site_slivers['net']\n", + " for node in site_slivers['nodes']:\n", + " data_iface = node.get_interface(network_name=site_net.get_name())\n", + " data_iface_name = data_iface.get_os_interface()\n", + " execute_single_node(node, [f\"sudo ip link set dev {data_iface_name} mtu {mtu}\"])" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f5ef68c-6da4-4ce7-baee-a8efe7d61d3e", + "metadata": {}, + "outputs": [], + "source": [ + "# run a no-DF test from every node to the first public address of the first site on the list\n", + "first_ip = list(slivers_by_site.items())[0][1]['net'].get_public_ips()[0]\n", + "# you can replace first_ip with the IP of a load balancer, but be careful not to interfere\n", + "# with a running experiment as this uses ping flood \n", + "first_ip = \"192.188.29.1\"\n", + "\n", + "for site_name, site_slivers in slivers_by_site.items():\n", + " for node in site_slivers['nodes']:\n", + " print(f'Node {node.get_name()} pinging {first_ip}')\n", + " execute_single_node(node, [f\"sudo ping -q -f -s 8972 -c 100 -M do {first_ip}\"])" + ] + }, + { + "cell_type": "markdown", + "id": "4551c7f3-c87c-4216-9f66-3e10eba38f90", + "metadata": {}, + "source": [ + "## Customize Nodes\n", + "\n", + "Customize node setup by adding E2SAR installation" + ] + }, + { + "cell_type": "markdown", + "id": "527b3798-4fb4-447a-a07f-3885936bc55f", + "metadata": {}, + "source": [ + "### Add E2SAR software" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84ad8f6b-d909-4076-803e-8adfb9ee9cc5", + "metadata": {}, + "outputs": [], + "source": [ + "# install github ssh key and set up build environment variables for interactive logins\n", + "commands = [\n", + " f\"chmod go-rwx {vm_key_location}\",\n", + " # Meson won't detect boost by merely setting cmake_prefix_path, instead set BOOST_ROOT env variable \n", + " # for gRPC it is enough to set -Dpkg_config_path option to meson\n", + " f\"echo 'export BOOST_ROOT=/usr/local/ LD_LIBRARY_PATH=/usr/local/lib' >> ~/.profile\",\n", + " f\"echo 'export BOOST_ROOT=/usr/local/ LD_LIBRARY_PATH=/usr/local/lib' >> ~/.bashrc\",\n", + "]\n", + "\n", + "for node in slice.get_nodes(): \n", + " # upload the GitHub SSH key onto the VM\n", + " result = node.upload_file(github_key, 
vm_key_location)\n", + " execute_commands(node, commands)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7200a034-e505-41d5-8f5e-6141b6b6cf38", + "metadata": {}, + "outputs": [], + "source": [ + "# download boost and grpc dependencies from releases\n", + "commands = [\n", + " f\"wget -q -O boost_grpc.deb {e2sar_dep_url}\",\n", + " #f\"sudo apt -yq install ./boost_grpc.deb\",\n", + " f\"sudo dpkg -i ./boost_grpc.deb\"\n", + "]\n", + " \n", + "execute_commands(slice.get_nodes(), commands)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "243ecc28-3208-43ff-9bd8-49bf74be79af", + "metadata": {}, + "outputs": [], + "source": [ + "# checkout E2SAR (including the right branch) using that key, install grpc and boost binary that is stored in the repo\n", + "commands = [\n", + " f\"GIT_SSH_COMMAND='ssh -i {vm_key_location} -o IdentitiesOnly=yes -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no' git clone --recurse-submodules --depth 1 -b {e2sar_branch} git@github.com:JeffersonLab/E2SAR.git\",\n", + "]\n", + " \n", + "execute_commands(slice.get_nodes(), commands)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89f7a098-c295-4a77-8096-f878ffb92015", + "metadata": {}, + "outputs": [], + "source": [ + "# compile and test E2SAR code\n", + "# note that most live tests only need the simplest URI - ejfats://token@ip:port/\n", + "# however the e2sar_reas_live_test requires data and sync addresses, and data address must\n", + "# be real (so we use loopback). 
Hence the long form of the URI for live tests \n", + "# (other tests simply ignore the parts of the URI they don't need.)\n", + "\n", + "commands = [\n", + " f\"cd E2SAR; rm -rf build; PATH=$HOME/.local/bin:/usr/local/bin:$PATH BOOST_ROOT=/usr/local/ LD_LIBRARY_PATH=/usr/local/lib/ meson setup -Dpkg_config_path=/usr/local/lib/pkgconfig/:/usr/lib/lib64/pkgconfig/ --prefix {home_location}/e2sar-install build && sed -i 's/-std=c++11//g' build/build.ninja\",\n", + " f\"PATH=$HOME/.local/bin:/usr/local/bin:$PATH LD_LIBRARY_PATH=/usr/local/lib/ meson compile -j 8 -C build\",\n", + " f\"PATH=$HOME/.local/bin:/usr/local/bin:$PATH LD_LIBRARY_PATH=/usr/local/lib/ meson test --suite unit --timeout 0 -C build\",\n", + "]\n", + "\n", + "# NOTE THIS EXECUTES ON THREADS IN PARALLEL, CHECK THE LOG FILES (NodeName_thread_log.log)\n", + "execute_commands_on_threads(slice.get_nodes(), commands)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "025be1fa-c926-4cf1-aec1-57c37e526f12", + "metadata": {}, + "outputs": [], + "source": [ + "#\n", + "# if you need to update already cloned repo\n", + "#\n", + "# update the code, compile and test\n", + "# note that most live tests only need the simplest URI - ejfats://token@ip:port/\n", + "# however the e2sar_reas_live_test requires data and sync addresses, and data address must\n", + "# be real (so we use loopback). 
Hence the long form of the URI for live tests \n", + "# (other tests simply ignore the parts of the URI they don't need.)\n", + "\n", + "commands = [\n", + " f\"cd E2SAR; GIT_SSH_COMMAND='ssh -i {vm_key_location} -o IdentitiesOnly=yes -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no' git pull origin {e2sar_branch}\",\n", + " f\"cd E2SAR; BOOST_ROOT=/usr/local/ PATH=$HOME/.local/bin:/usr/local/bin:$PATH LD_LIBRARY_PATH=/usr/local/lib/ meson setup -Dpkg_config_path=/usr/local/lib/pkgconfig/:/usr/lib/lib64/pkgconfig/ --prefix {home_location}/e2sar-install build --wipe && sed -i 's/-std=c++11//g' build/build.ninja\",\n", + " f\"cd E2SAR/build; PATH=$HOME/.local/bin:/usr/local/bin:$PATH LD_LIBRARY_PATH=/usr/local/lib/ meson compile -j 8\",\n", + "# f\"cd E2SAR/build; PATH=$HOME/.local/bin:/usr/local/bin:$PATH LD_LIBRARY_PATH=/usr/local/lib/ meson test {e2sar_test_suite} --suite unit --timeout 0\",\n", + "]\n", + " \n", + "execute_commands(slice.get_nodes(), commands)" + ] + }, + { + "cell_type": "markdown", + "id": "3e19f887-0a99-488e-8b9e-b72d5247f425", + "metadata": {}, + "source": [ + "## Run Tests\n", + "\n", + "### Run simple single-threaded performance test\n", + "\n", + "Start Segmenter on Sender node and one Reassembler on Worker1 and test throughput **without a real load balancer**. Reassembler is told that LB header will be present and it ignores it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3409e1dd-d1fc-4a2e-ae48-46bd76881bdf", + "metadata": {}, + "outputs": [], + "source": [ + "sender = list(filter(lambda n: n.get_name()[0:6] == \"Sender\", slice.get_nodes()))[0]\n", + "recver = list(filter(lambda n: n.get_name()[0:7] == \"Worker1\", slice.get_nodes()))[0]\n", + "\n", + "sender_addr = sender.get_interface(network_name=make_net_name(sender.get_site())).get_ip_addr()\n", + "recver_addr = recver.get_interface(network_name=make_net_name(recver.get_site())).get_ip_addr()\n", + "print(f\"Sender sending from {sender_addr}, receiver receiving on {recver_addr}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a9a3dcd-c367-4240-9cea-f2bbb45b2ce0", + "metadata": {}, + "outputs": [], + "source": [ + "# for e2sar_perf only the data= part of the query is meaningful. sync= must be present but is ignored\n", + "# same for gRPC token, address and port (and lb id)\n", + "e2sarPerfURI = f\"ejfat://useless@10.10.10.10:1234/lb/1?data={recver_addr}&sync=192.168.77.7:1234\"\n", + "recverDuration = 20\n", + "mtu = 9000\n", + "rate = 15 # Gbps\n", + "length = 1000000 # event length in bytes\n", + "numEvents = 10000 # number of events to send\n", + "bufSize = 300 * 1024 * 1024 # 300MB send and receive buffers\n", + "\n", + "recv_command = f\"cd E2SAR; PATH=$HOME/.local/bin:/usr/local/bin:$PATH LD_LIBRARY_PATH=/usr/local/lib/ ./build/bin/e2sar_perf -r -u '{e2sarPerfURI}' -d {recverDuration} -b {bufSize} --ip {recver_addr} --port 19522\"\n", + "send_command = f\"cd E2SAR; PATH=$HOME/.local/bin:/usr/local/bin:$PATH LD_LIBRARY_PATH=/usr/local/lib/ ./build/bin/e2sar_perf -s -u '{e2sarPerfURI}' --mtu {mtu} --rate {rate} --length {length} -n {numEvents} -b {bufSize} --ip {sender_addr}\"\n", + "\n", + "# start the receiver for 10 seconds and log its output\n", + "print(f'Executing command {recv_command} on receiver')\n", + "recver.execute_thread(recv_command, 
output_file=f\"{recver.get_name()}.perf.log\")\n", + "\n", + "# sleep 2 seconds to let receiver get going\n", + "time.sleep(2)\n", + "\n", + "# start the sender in the foreground\n", + "print(f'Executing command {send_command} on sender')\n", + "stdout_send, stderr_send = sender.execute(send_command, output_file=f\"{sender.get_name()}.perf.log\")\n", + "\n", + "print(f\"Inspect {recver.get_name()}.perf.log file in your Jupyter container to see the results\")" + ] + }, + { + "cell_type": "markdown", + "id": "46c04a61-eebd-4189-9848-1c787995919e", + "metadata": {}, + "source": [ + "### Reserve the Load Balancer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "531e7e6b-dfd3-44b8-9d92-1df7b9887cae", + "metadata": {}, + "outputs": [], + "source": [ + "# Set the admin URI\n", + "ejfat_admin_uri = 'ejfats://replace_me' # cut and paste here\n", + "\n", + "lb_path = './E2SAR/build/bin'\n", + "ld_library_path = \"LD_LIBRARY_PATH=/usr/local/lib\"\n", + "bin_path = \"PATH=./E2SAR/build/bin:$PATH\"\n", + "# note we are forcing IPv4 here with -4 option - from FABRIC this is necessary\n", + "lbadm = f\"{ld_library_path} {bin_path} lbadm -4\"\n", + "e2sar_perf = f\"{ld_library_path} {bin_path} e2sar_perf\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95678748-722e-4e95-8b2c-c2363dc523d1", + "metadata": {}, + "outputs": [], + "source": [ + "# run an overview command to see what is reserved\n", + "# we use sender node but any node can be used for admin commands\n", + "\n", + "command = f\"{lbadm} --overview -u {ejfat_admin_uri}\"\n", + "\n", + "execute_commands(sender, [command])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e96478f5-ca90-4ad7-9544-56330f59e8a4", + "metadata": {}, + "outputs": [], + "source": [ + "# Reserve the load balancer\n", + "lbname = 'e2sar-testlb'\n", + "duration = '02:00:00' # 2 hours\n", + "\n", + "command = f\"{lbadm} --reserve -l {lbname} -a '{sender_addr}' -d {duration} -u 
'{ejfat_admin_uri}' -e\"\n", + "execute_commands(sender, [command])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acd72bab-8255-486d-a9cd-f753a8a707dd", + "metadata": {}, + "outputs": [], + "source": [ + "# copy the 'Updated URI after reserve with instance token' from the above result here:\n", + "instance_uri = 'ejfats://replace_me'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7ed79af-9715-4fe5-b416-b769bb69503d", + "metadata": {}, + "outputs": [], + "source": [ + "# get the status of the reserved LB (as a check)\n", + "command = f\"{lbadm} --status -u '{instance_uri}'\"\n", + "execute_commands(sender, [command])" + ] + }, + { + "cell_type": "markdown", + "id": "95e0c89c-73e0-4d2c-a963-9ba405ba255b", + "metadata": {}, + "source": [ + "### Run a test with real load balancer and single sender node and single receiver node" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a0e762e-3945-4b74-8a88-91d49b16eaee", + "metadata": {}, + "outputs": [], + "source": [ + "# select sender and receiver\n", + "sender = list(filter(lambda n: n.get_name()[0:6] == \"Sender\", slice.get_nodes()))[0]\n", + "recver = list(filter(lambda n: n.get_name()[0:7] == \"Worker1\", slice.get_nodes()))[0]\n", + "\n", + "sender_addr = sender.get_interface(network_name=make_net_name(sender.get_site())).get_ip_addr()\n", + "recver_addr = recver.get_interface(network_name=make_net_name(recver.get_site())).get_ip_addr()\n", + "print(f\"Sender sending from {sender_addr}, receiver receiving on {recver_addr}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a206ef22-bb58-48ed-a1e8-9673e0a7d3b5", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# run sender and receiver through the reserved LB\n", + "recverDuration = 40\n", + "mtu = 9000\n", + "rate = 15 # Gbps\n", + "length = 1000000 # event length in bytes\n", + "numEvents = 20000 # number of events to send\n", + "bufSize = 
300 * 1024 * 1024 # 300MB send and receive buffers\n", + "recvThreads = 4\n", + "\n", + "# Given that in FABRIC ejfat-lb.es.net resolves to IP6 first and gRPC C++ library doesn't\n", + "# offer granular control over which resolved address is used, we use -4 option to tell the\n", + "# code to use the IPv4 address, but this also disables cert validation.\n", + "recv_command = f\"{e2sar_perf} -r -u '{instance_uri}' -d {recverDuration} -b {bufSize} --ip {recver_addr} --port 10000 --withcp -4 --threads 4\"\n", + "send_command = f\"{e2sar_perf} -s -u '{instance_uri}' --mtu {mtu} --rate {rate} --length {length} -n {numEvents} -b {bufSize} --ip {sender_addr} --withcp -4\"\n", + "\n", + "# start the receiver for n seconds and log its output\n", + "print(f'Executing command {recv_command} on receiver')\n", + "recver.execute_thread(recv_command, output_file=f\"{recver.get_name()}.perf.log\")\n", + "\n", + "# sleep 5 seconds to let receiver get going\n", + "time.sleep(5)\n", + "\n", + "# start the sender in the foreground\n", + "print(f'Executing command {send_command} on sender')\n", + "stdout_send, stderr_send = sender.execute(send_command, output_file=f\"{sender.get_name()}.perf.log\")\n", + "\n", + "print(f\"Inspect {recver.get_name()}.perf.log file to see the results. 
It should indicate how many events were received and how many lost.\")" + ] + }, + { + "cell_type": "markdown", + "id": "2842c38c-baac-4ad9-8caa-9fb994a92b6d", + "metadata": {}, + "source": [ + "### Run the test with a single sender node and multiple receiver nodes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7477f90a-864b-43ef-ab66-16c379fb26de", + "metadata": {}, + "outputs": [], + "source": [ + "# select sender and receivers\n", + "sender = list(filter(lambda n: n.get_name()[0:6] == \"Sender\", slice.get_nodes()))[0]\n", + "\n", + "worker_index = 1\n", + "recvers = list()\n", + "recver_addrs = list()\n", + "while True:\n", + "    matches = list(filter(lambda n: n.get_name()[0:7] == f\"Worker{worker_index}\", slice.get_nodes()))\n", + "    if len(matches) == 0:\n", + "        break\n", + "    recver = matches[0]\n", + "    recvers.append(recver)\n", + "    recver_addr = recver.get_interface(network_name=make_net_name(recver.get_site())).get_ip_addr()\n", + "    recver_addrs.append(recver_addr)\n", + "    worker_index += 1\n", + "    \n", + "sender_addr = sender.get_interface(network_name=make_net_name(sender.get_site())).get_ip_addr()\n", + "print(f\"Sender sending from {sender_addr}, receivers receiving on:\")\n", + "for recver, recver_addr in zip(recvers, recver_addrs):\n", + "    print(\"\\t\" + recver.get_name() + \": \" + recver_addr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8266a685-437f-4e13-8603-1bd550e900c4", + "metadata": {}, + "outputs": [], + "source": [ + "# run sender and receivers through the reserved LB\n", + "recverDuration = 60\n", + "mtu = 9000\n", + "rate = 15 # Gbps\n", + "length = 1000000 # event length in bytes\n", + "numEvents = 20000 # number of events to send\n", + "bufSize = 300 * 1024 * 1024 # 300MB send and receive buffers\n", + "recvThreads = 4 # on each receiver\n", + "numSocks = 4 # number send sockets in sender\n", + "startPort = 10000 # receiver starting port\n", + "\n", + "# Given that in FABRIC 
ejfat-lb.es.net resolves to IP6 first and gRPC C++ library doesn't\n", + "# offer granular control over which resolved address is used, we use -4 option to tell the\n", + "# code to use the IPv4 address, but this also disables cert validation.\n", + "send_command = f\"{e2sar_perf} -s -u '{instance_uri}' --mtu {mtu} --rate {rate} --length {length} -n {numEvents} -b {bufSize} --ip {sender_addr} --sockets {numSocks} --withcp -4\"\n", + "\n", + "for recver, recver_addr in zip(recvers, recver_addrs):\n", + " recv_command = f\"{e2sar_perf} -r -u '{instance_uri}' -d {recverDuration} -b {bufSize} --ip {recver_addr} --port {startPort} --withcp -4 --threads {recvThreads}\"\n", + " print(f'Executing command {recv_command} on receiver {recver.get_name()}')\n", + " recver.execute_thread(recv_command, output_file=f\"{recver.get_name()}.perf.log\")\n", + "\n", + "# sleep 5 seconds to let receivers get going\n", + "time.sleep(5)\n", + "\n", + "# start the sender in the foreground\n", + "print(f'Executing command {send_command} on sender')\n", + "stdout_send, stderr_send = sender.execute(send_command, output_file=f\"{sender.get_name()}.perf.log\")\n", + "\n", + "print(f\"Inspect WorkerX.perf.log files to see the results. 
They should indicate how many events were received and how many lost.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41716952-8d5c-4eae-a843-ef5ac0c3ff8a", + "metadata": {}, + "outputs": [], + "source": [ + "# free the load balancer\n", + "command = f\"{lbadm} --free -u '{instance_uri}'\"\n", + "\n", + "execute_commands(sender, [command])" + ] + }, + { + "cell_type": "markdown", + "id": "56c8d476-689a-4c4c-b5dc-ed0625410abc", + "metadata": {}, + "source": [ + "## Manage the slice\n", + "\n", + "### Extend by two weeks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbf9fe4f-b067-4d2c-94a5-9839d5a53b73", + "metadata": {}, + "outputs": [], + "source": [ + "# Set end host to now plus 14 days\n", + "end_date = (datetime.now(timezone.utc) + timedelta(days=14)).strftime(\"%Y-%m-%d %H:%M:%S %z\")\n", + "\n", + "try:\n", + " slice = fablib.get_slice(name=slice_name)\n", + "\n", + " slice.renew(end_date)\n", + "except Exception as e:\n", + " print(f\"Exception: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "38eb476a-2b17-4064-9a0e-a7a98d865804", + "metadata": {}, + "source": [ + "### Delete" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de3546bf-a013-4eaa-bab1-fb52f7d36c55", + "metadata": {}, + "outputs": [], + "source": [ + "slice = fablib.get_slice(slice_name)\n", + "slice.delete()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1737f112-95d3-4a78-9b15-f16ae98e7f12", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/scripts/notebooks/EJFAT/E2SAR-release-tester.ipynb b/scripts/notebooks/EJFAT/E2SAR-release-tester.ipynb index 130cc076..885184f3 100644 --- a/scripts/notebooks/EJFAT/E2SAR-release-tester.ipynb +++ b/scripts/notebooks/EJFAT/E2SAR-release-tester.ipynb @@ -567,6 +567,32 @@ "stdout, stderr = sender.execute(f\"sudo ping -f -s 8972 -c 10 -M do {recver_addr}\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2cb3e7e-902d-4971-8f37-16e74924a6c0", + "metadata": {}, + "outputs": [], + "source": [ + "# We need to setup the firewall to allow traffic to pass to the receiver\n", + "\n", + "mgmt_iface_name = get_management_os_interface(recver)\n", + "data_iface = recver.get_interface(network_name=net_name)\n", + "data_iface_name = data_iface.get_os_interface()\n", + "\n", + "print(f'Adding {mgmt_iface_name} and lo and data interface to trusted zone')\n", + "commands = [\n", + " f'sudo firewall-cmd --permanent --zone=trusted --add-interface={data_iface_name}',\n", + " f'sudo firewall-cmd --permanent --zone=trusted --add-interface=lo',\n", + " f'sudo firewall-cmd --permanent --zone=trusted --add-interface={mgmt_iface_name}',\n", + " f'for i in $(sudo firewall-cmd --zone=public --list-services); do sudo firewall-cmd --zone=public --permanent --remove-service=$i; done',\n", + "]\n", + "commands.append(f'sudo firewall-cmd --reload')\n", + "commands.append(f'sudo firewall-cmd --list-all --zone=public')\n", + "\n", + "execute_commands([recver], commands)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -586,7 +612,7 @@ "numEvents = 10000 # number of events to send\n", "bufSize = 300 * 1024 * 1024 # 100MB send and receive buffers\n", "\n", - "recv_command = f\"LD_LIBRARY_PATH=/usr/local/lib e2sar_perf -r -u '{e2sarPerfURI}' -d {recverDuration} -b {bufSize}\"\n", + "recv_command = f\"LD_LIBRARY_PATH=/usr/local/lib e2sar_perf -r -u '{e2sarPerfURI}' -d {recverDuration} -b {bufSize} --ip {recver_addr} --port 19522\"\n", "send_command = 
f\"LD_LIBRARY_PATH=/usr/local/lib e2sar_perf -s -u '{e2sarPerfURI}' --mtu {mtu} --rate {rate} --length {length} -n {numEvents} -b {bufSize}\"\n", "\n", "# start the receiver for 10 seconds and log its output\n", @@ -683,7 +709,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.8" + "version": "3.12.3" } }, "nbformat": 4, diff --git a/scripts/notebooks/EJFAT/figs/live-lb.png b/scripts/notebooks/EJFAT/figs/live-lb.png new file mode 100644 index 00000000..2d51b735 Binary files /dev/null and b/scripts/notebooks/EJFAT/figs/live-lb.png differ diff --git a/scripts/notebooks/EJFAT/post-boot/recver.sh b/scripts/notebooks/EJFAT/post-boot/recver.sh index 0eff9e9c..00c91387 100644 --- a/scripts/notebooks/EJFAT/post-boot/recver.sh +++ b/scripts/notebooks/EJFAT/post-boot/recver.sh @@ -22,7 +22,8 @@ if [[ ${distro} == 'ubuntu' ]]; then # install missing software sudo apt-get -yq update - sudo apt-get -yq install python3-pip build-essential autoconf cmake libtool pkg-config libglib2.0-dev ninja-build openssl libssl-dev libsystemd-dev protobuf-compiler libre2-dev gdb docker.io + sudo dpkg -r ufw + sudo apt-get -yq install python3-pip build-essential autoconf cmake libtool pkg-config libglib2.0-dev ninja-build openssl libssl-dev libsystemd-dev protobuf-compiler libre2-dev gdb docker.io firewalld # install meson pip3 install --user meson pybind11 diff --git a/scripts/notebooks/EJFAT/post-boot/sender.sh b/scripts/notebooks/EJFAT/post-boot/sender.sh index e60b8a80..f2e88898 100644 --- a/scripts/notebooks/EJFAT/post-boot/sender.sh +++ b/scripts/notebooks/EJFAT/post-boot/sender.sh @@ -22,7 +22,8 @@ if [[ ${distro} == 'ubuntu' ]]; then # install missing software sudo apt-get -yq update - sudo apt-get -yq install python3-pip build-essential autoconf cmake libtool pkg-config libglib2.0-dev ninja-build openssl libssl-dev libsystemd-dev protobuf-compiler libre2-dev gdb docker.io + sudo dpkg -r ufw + sudo apt-get -yq install python3-pip 
build-essential autoconf cmake libtool pkg-config libglib2.0-dev ninja-build openssl libssl-dev libsystemd-dev protobuf-compiler libre2-dev gdb docker.io firewalld # install meson pip3 install --user meson pybind11 diff --git a/scripts/notebooks/JIRIAF/FABRIC_JIRIAF.ipynb b/scripts/notebooks/JIRIAF/FABRIC_JIRIAF.ipynb index 5abd4fd7..17b8c2d4 100644 --- a/scripts/notebooks/JIRIAF/FABRIC_JIRIAF.ipynb +++ b/scripts/notebooks/JIRIAF/FABRIC_JIRIAF.ipynb @@ -578,7 +578,7 @@ " # make sure the interface is UP (in rare cases comes up in DOWN state)\n", " node_iface = node.get_interface(network_name = site_net.get_name())\n", " execute_single_node(node, [f'sudo ip link set {node_iface.get_os_interface()} up'])\n", - " #node_iface.ip_addr_add(addr=addr, subnet=site_net.get_subnet())\n" + " node_iface.ip_addr_add(addr=addr, subnet=site_net.get_subnet())\n" ] }, { @@ -658,6 +658,9 @@ " for subnet, portlist in open_ports.items():\n", " for port in portlist:\n", " commands.append(f'sudo firewall-cmd --permanent --zone=public --add-rich-rule=\\'rule family=\\\"ipv4\\\" source address=\\\"{subnet}\\\" port protocol=\\\"tcp\\\" port=\\\"{port}\\\" accept\\'')\n", + " \n", + " for subnet in external_subnets:\n", + " commands.append(f'sudo firewall-cmd --permanent --zone=public --add-rich-rule=\\'rule family=\\\"ipv4\\\" source address=\\\"{subnet}\\\" protocol value=\\\"udp\\\" accept\\'')\n", " commands.append(f'sudo firewall-cmd --reload')\n", " commands.append(f'sudo firewall-cmd --list-all --zone=public')\n", " execute_single_node(node, commands)\n", diff --git a/scripts/notebooks/pybind11_examples/example_ControlPlane.ipynb b/scripts/notebooks/pybind11_examples/example_ControlPlane.ipynb index 6eb45bab..c123da96 100644 --- a/scripts/notebooks/pybind11_examples/example_ControlPlane.ipynb +++ b/scripts/notebooks/pybind11_examples/example_ControlPlane.ipynb @@ -15,24 +15,16 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "0.1.1\n" - ] - } - ], + "outputs": [], "source": [ "import sys\n", "\n", "## IMPORTANT: Update the path to your built Python module\n", "sys.path.append(\n", - " '/home/ubuntu/E2SAR/build/src/pybind')\n", + " '/home/ubuntu/dev-e2sar/build/src/pybind')\n", "\n", "import e2sar_py\n", - "print(e2sar_py.get_version())" + "# print(e2sar_py.get_version())" ] }, { @@ -57,7 +49,7 @@ "source": [ "#### \"Timestamp\" class\n", "\n", - "The Python `e2sar_py.Timestamp` class acts as a bridge to the C++ `google::protobuf::Timestamp` class. The following code demonstrates how to use it." + "The Python `e2sar_py.ControlPlane.Timestamp` class serves as a bridge to the C++ `google::protobuf::Timestamp` class. The following code illustrates how to use it. Note that the object is printed as a string." ] }, { @@ -72,7 +64,7 @@ "Defaulting to user installation because normal site-packages is not writeable\n", "Requirement already satisfied: protobuf in /home/ubuntu/.local/lib/python3.10/site-packages (5.28.1)\n", "Note: you may need to restart the kernel to use updated packages.\n", - "Timestamp: seconds = 1726512856, nanos = 809628000\n" + "Timestamp: 2024-10-02T04:14:25.698545Z, seconds = 1727842465, nanos = 698545000\n" ] } ], @@ -85,7 +77,7 @@ "from e2sar_py.ControlPlane import Timestamp\n", "\n", "\n", - "def get_timestamp_from_gts() -> e2sar_py.ControlPlane.Timestamp:\n", + "def get_currtimestamp_from_gts() -> e2sar_py.ControlPlane.Timestamp:\n", " g_ts = gts()\n", " g_ts.GetCurrentTime()\n", " curr_ts = Timestamp()\n", @@ -93,8 +85,8 @@ " curr_ts.set_nanos(g_ts.nanos)\n", " return curr_ts\n", "\n", - "ts = get_timestamp_from_gts()\n", - "print(f\"Timestamp: seconds = {ts.get_seconds()}, nanos = {ts.get_nanos()}\")" + "ts = get_currtimestamp_from_gts()\n", + "print(f\"Timestamp: {ts}, seconds = {ts.get_seconds()}, nanos = {ts.get_nanos()}\")" ] }, { @@ -103,7 +95,7 @@ "source": [ "#### \"LBWorkerStatus\" class\n", "\n", - "The following code 
block demonstrates how to manipulate the `ControlPlane.LBWorkerStatus` class." + "The following code block demonstrates how to manipulate the `ControlPlane.LBWorkerStatus` class. The `last_updated` attribute is a string." ] }, { @@ -119,10 +111,7 @@ "Worker fill percent: 75.5\n", "Worker control signal: 0.8999999761581421\n", "Worker slots assigned: 10\n", - "Worker last_updated (following 2 lines):\n", - "seconds: 1726512856\n", - "nanos: 809628000\n", - "\n" + "Worker last updated: 2024-10-02T04:14:25.698545Z\n" ] } ], @@ -133,14 +122,17 @@ "print(f\"Worker fill percent: {worker.fill_percent}\")\n", "print(f\"Worker control signal: {worker.control_signal}\")\n", "print(f\"Worker slots assigned: {worker.slots_assigned}\")\n", - "print(f\"Worker last_updated (following 2 lines):\\n{worker.last_updated}\")" + "print(f\"Worker last updated: {worker.last_updated}\")\n", + "\n", + "assert(worker.last_updated == str(ts))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### \"LBStatus\" class" + "#### \"LBStatus\" class\n", + "The attributes `timestamp` and `expiresAt` are bound as Python strings." 
] }, { @@ -152,22 +144,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "1234\n", - "['192.168.100.1', '192.168.100.2']\n", - "[]\n" + "Timestamp: 2024-10-02T04:14:25.698545Z\n", + "Expires at: 2024-10-02T05:14:25.698545Z\n" ] } ], "source": [ "ip_list = [\"192.168.100.1\", \"192.168.100.2\"]\n", + "\n", + "# Set the expire timestamp\n", + "expire_ts = Timestamp() # DO NOT declare as \"expire_ts = ts\"\n", + "expire_ts.set_seconds(ts.get_seconds() + 3600) # 1 hr\n", + "expire_ts.set_nanos(ts.get_nanos()) \n", + "\n", "# Create an LBStatus object with empty WorkerStatus list\n", - "status = e2sar_py.ControlPlane.LBStatus(ts, 1234, 5678, [], ip_list, ts)\n", + "status = e2sar_py.ControlPlane.LBStatus(ts, 1234, 5678, [], ip_list, expire_ts)\n", "\n", "# Access members\n", - "print(status.currentEpoch)\n", - "print(status.senderAddresses)\n", - "print(status.workers)" + "assert(status.timestamp == str(ts))\n", + "assert(status.currentEpoch == 1234)\n", + "assert(status.currentPredictedEventNumber == 5678)\n", + "assert(status.senderAddressList == ip_list)\n", + "assert(status.workerStatusList == [])\n", + "assert(status.expiresAt == str(expire_ts))\n", + "\n", + "print(\"Timestamp: \", status.timestamp)\n", + "print(\"Expires at: \", status.expiresAt)" ] }, { @@ -176,15 +178,14 @@ "source": [ "#### \"OverviewEntry\" class\n", "\n", - "The members of this class are read-only. Its usage will be demonstrated in `LBMLiveTest6`." + "The attributes of this class are read-only. Its usage will be demonstrated in `LBMLiveTest6`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## \"LBManager\" class\n", - "Intialize `LBManager`." + "## \"LBManager\" class" ] }, { @@ -200,11 +201,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Tests with a mock EJFAT Load Balancer\n", + "### Examples with a mock EJFAT Load Balancer\n", "\n", - "This section requires a functional (mock) load balancer. 
Ensure the `udplbd` container is running on the FABRIC `cpnode` before executing the code blocks.\n", + "This section requires a functional (mock) load balancer. If you are launching with a FABRIC slice, ensure the `udplbd` container is running on the FABRIC `cpnode` before executing the code blocks.\n", "\n", - "The Python code blocks below replicate the tests conducted in the C++ `e2sar_lbcp_live_test`." + "The Python code blocks below replicate the tests performed in the C++ `e2sar_lbcp_live_test`." ] }, { @@ -327,7 +328,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Instance token: 9b0a986e06d5983e79e71904a6dba3fdb3aef7065d4b81e639c7ea66fedc2b45\n" + "Instance token: 955d427b68d84c56e0f197dbcb80ff5217b21c7ff13e5d4334ec33dd31fe177f\n" ] } ], @@ -398,7 +399,7 @@ "output_type": "stream", "text": [ "Sync IPv4 addr is: 192.168.0.3\n", - "LB id is: 16\n" + "LB id is: 2\n" ] } ], @@ -554,7 +555,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "LB ID is: 17\n" + "LB ID is: 3\n" ] } ], @@ -677,8 +678,8 @@ "\n", "Send state succeeded\n", "\n", - "Session id is: 9\n", - "Session token is: 53d47321eecfc42cdb7999649c9bc9bd197d39e3b32830e5a3028bc83cfba473\n", + "Session id is: 1\n", + "Session token is: 9f5dbfd36e86f520eea8a808faf79129a709f896df3a1123f8bed664c400c1d4\n", "\n", "Deregister worker succeeded\n" ] @@ -795,7 +796,7 @@ "\n", "Send state for 25 times\n", "\n", - "Get LB status succeeded: \n" + "Get LB status succeeded: \n" ] } ], @@ -840,48 +841,96 @@ "cell_type": "code", "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Worker status:\n", + " name: my_node\n", + " fill_percent: 0.800000011920929\n", + " control_signal: 1.0\n", + " slots_assigned: 512\n", + " last_updated: 2024-10-02T04:18:05.888130752Z\n" + ] + } + ], "source": [ "# Get worker status\n", "workers = lbm.get_worker_statuses(status_res)\n", "\n", - "assert len(workers) == 1" + "assert 
len(workers) == 1\n", + "\n", + "def print_workerstatus(w : e2sar_py.ControlPlane.WorkerStatus):\n", + " print(f\"Worker status:\")\n", + " print(f\" name: {w.get_name()}\")\n", + " print(f\" fill_percent: {w.get_fill_percent()}\")\n", + " print(f\" control_signal: {w.get_control_signal()}\")\n", + " print(f\" slots_assigned: {w.get_slots_assigned()}\")\n", + " print(f\" last_updated: {w.get_last_updated()}\")\n", + "\n", + "# print a worker's status\n", + "print_workerstatus(workers[0])" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, + "outputs": [], + "source": [ + "assert(workers[0].get_name() == \"my_node\")\n", + "\n", + "DELTA = 0.000001\n", + "assert(abs(workers[0].get_fill_percent() - 0.8) < DELTA)\n", + "assert(abs(workers[0].get_control_signal() - 1) < DELTA)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Timestamp: seconds: 1726512860\n", - "nanos: 102211657\n", - "\n", - "expiresAt: seconds: 1726516457\n", - "\n", + "Timestamp: 2024-10-02T04:18:05.990769793Z\n", + "expiresAt: 2024-10-02T05:18:03Z\n", "currentEpoch: 3\n", "currentPredictedEventNumber: 9223372036854775808\n", - "workers: []\n" + "workerStatusList: []\n" ] } ], "source": [ "lb_status = lbm.as_lb_status(status_res)\n", - "assert(lb_status.senderAddresses == ip_list)\n", - "assert(len(lb_status.workers) == 1)\n", + "dir(lb_status)\n", + "assert(lb_status.senderAddressList == ip_list)\n", + "assert(len(lb_status.workerStatusList) == 1)\n", "print(\"Timestamp: \", lb_status.timestamp)\n", "print(\"expiresAt: \", lb_status.expiresAt)\n", "print(\"currentEpoch: \", lb_status.currentEpoch)\n", "print(\"currentPredictedEventNumber: \", lb_status.currentPredictedEventNumber)\n", - "print(\"workers: \", lb_status.workers)\n" + "print(\"workerStatusList: \", lb_status.workerStatusList)\n" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, + "metadata": 
{}, + "outputs": [], + "source": [ + "DELTA = 0.000001\n", + "\n", + "assert(lb_status.workerStatusList[0].get_name() == \"my_node\")\n", + "assert(abs(lb_status.workerStatusList[0].get_fill_percent() - 0.8) < DELTA)\n", + "assert(abs(lb_status.workerStatusList[0].get_control_signal() - 1) < DELTA)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -915,7 +964,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -925,16 +974,16 @@ "Create LBManager obj succeeded: 0\n", "Register worker succeeded\n", "\n", - "Session id is: 11\n", - "Session token is: 07589d480e99cdb41bf26b263e236eae381f0ba33feec5054c62a38a3c41f811\n", + "Session id is: 3\n", + "Session token is: 68cb8d9db7a4b367d4d57572a785c1d68aca22c3331fa52d4fdd1e2304c5c9ce\n", "\n", "Send state for 25 times\n", "Add senders succeeded.\n", "\n", - "Get LB status succeeded: \n", + "Get LB status succeeded: \n", "Remove senders succeeded.\n", "\n", - "Get LB status succeeded: \n", + "Get LB status succeeded: \n", "\n", "Deregister worker succeeded\n", "Free LB succeeded!\n" @@ -1005,6 +1054,9 @@ "assert(res == ip_list)\n", "workers = lbm.get_worker_statuses(status_res)\n", "assert len(workers) == 1\n", + "assert(workers[0].get_name() == \"my_node\")\n", + "assert(abs(workers[0].get_fill_percent() - 0.8) < DELTA)\n", + "assert(abs(workers[0].get_control_signal() - 1) < DELTA)\n", "\n", "# Remove senders\n", "res = lbm.remove_senders(ip_list)\n", @@ -1042,7 +1094,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -1076,7 +1128,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -1086,25 +1138,22 @@ "Create LBManager obj succeeded: 0\n", "Register worker succeeded\n", "\n", - "Session id is: 14\n", - "Session token is: 24409feb705a6ae6a6ef7dce008a7c726fa6134bba00e19e47ca6d342d81d162\n", + 
"Session id is: 4\n", + "Session token is: 8800e7562f0a9a4b78bff20d8a20a9047201faaee9658c65e41e57b8e3fe41af\n", "\n", "Send state for 25 times\n", "\n", - "Get LB status succeeded: \n", + "Get LB status succeeded: \n", "Sender addresses: ['192.168.20.1', '192.168.20.2']\n", "Current Epoch: 2\n", "Current predicted Event No: 9223372036854775808\n", - "Workers: []\n", - "Timestamp: seconds: 1726512975\n", - "nanos: 658369981\n", - "\n", - "expiresAt: seconds: 1726516573\n", - "\n", + "Workers: []\n", + "Timestamp: 2024-10-02T04:18:11.402769466Z\n", + "expiresAt: 2024-10-02T05:18:08Z\n", "\n", "Send state for 25 times\n", "\n", - "LB id: 23\n", + "LB id: 7\n", "1\n" ] } @@ -1171,15 +1220,17 @@ "\n", "# as_lb_status() usage\n", "res = lbm.as_lb_status(status_res)\n", - "print(f\"Sender addresses: {res.senderAddresses}\")\n", + "print(f\"Sender addresses: {res.senderAddressList}\")\n", "print(f\"Current Epoch: {res.currentEpoch}\")\n", "print(f\"Current predicted Event No: {res.currentPredictedEventNumber}\")\n", - "print(f\"Workers: {res.workers}\")\n", + "print(f\"Workers: {res.workerStatusList}\")\n", "print(f\"Timestamp: {res.timestamp}\")\n", "print(f\"expiresAt: {res.expiresAt}\")\n", - "assert(res.senderAddresses == ip_list)\n", - "# print(res.workers, dir(res.workers))\n", - "assert(len(res.workers) == 1)\n", + "assert(res.senderAddressList == ip_list)\n", + "assert(len(res.workerStatusList) == 1)\n", + "assert(res.workerStatusList[0].get_name() == \"my_node\")\n", + "assert(abs(res.workerStatusList[0].get_fill_percent() - 0.8) < DELTA)\n", + "assert(abs(res.workerStatusList[0].get_control_signal() - 1) < DELTA)\n", "\n", "print(\"\\nSend state for 25 times\")\n", "for i in range(25):\n", @@ -1203,13 +1254,13 @@ " free_lbmanager(lbm)\n", "print(\"LB id: \", res[0].lb_id)\n", "assert(res[0].name == \"mylb\")\n", - "assert(res[0].lb_status.senderAddresses == ip_list)\n", - "print(len(res[0].lb_status.workers))" + "assert(res[0].lb_status.senderAddressList == 
ip_list)\n", + "print(len(res[0].lb_status.workerStatusList))" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 38, "metadata": {}, "outputs": [ { diff --git a/scripts/notebooks/pybind11_examples/example_DataPlane.ipynb b/scripts/notebooks/pybind11_examples/example_DataPlane.ipynb index 79b135a2..ab1ce69c 100644 --- a/scripts/notebooks/pybind11_examples/example_DataPlane.ipynb +++ b/scripts/notebooks/pybind11_examples/example_DataPlane.ipynb @@ -20,7 +20,7 @@ "\n", "## IMPORTANT: Update the path to your built Python module. Use the absolute path to make life easier.\n", "sys.path.append(\n", - " '/home/ubuntu/E2SAR/build/src/pybind')\n", + " '/home/ubuntu/dev-e2sar/build/src/pybind')\n", "\n", "import e2sar_py" ] @@ -43,9 +43,10 @@ "metadata": {}, "outputs": [], "source": [ - "DP_IPV4 = \"127.0.0.1\"\n", - "data_id = 0x0505 # decimal value: 1085\n", - "eventSrc_id = 0x11223344 # decimal value: 287454020" + "DP_IPV4_ADDR = \"127.0.0.1\"\n", + "DP_IPV4_PORT = 10000\n", + "DATA_ID = 0x0505 # decimal value: 1085\n", + "EVENTSRC_ID = 0x11223344 # decimal value: 287454020" ] }, { @@ -79,7 +80,7 @@ ], "source": [ "# Set up URI for segmenter\n", - "SEG_URI = f\"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data={DP_IPV4}\"\n", + "SEG_URI = f\"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data={DP_IPV4_ADDR}:{DP_IPV4_PORT}\"\n", "seg_uri = e2sar_py.EjfatURI(uri=SEG_URI, tt=e2sar_py.EjfatURI.TokenType.instance)\n", "\n", "# Set up sflags\n", @@ -106,7 +107,7 @@ "outputs": [], "source": [ "# Init segmenter object\n", - "seg = e2sar_py.DataPlane.Segmenter(seg_uri, data_id, eventSrc_id, sflags)" + "seg = e2sar_py.DataPlane.Segmenter(seg_uri, DATA_ID, EVENTSRC_ID, sflags)" ] }, { @@ -163,8 +164,7 @@ " epoch_ms = 1000\n", " setPoint = 0.0\n", " [Kp, Ki, Kd] = [0.0, 0.0, 0.0]\n", - " dpV6 = False\n", - " cpV6 = False\n", + " [weight, min_factor, max_factor] = [1.0, 0.5, 2.0]\n", " portRange = -1\n", " withLBHeader = 
True\n" ] @@ -172,7 +172,7 @@ ], "source": [ "# Set the reassembler URI\n", - "REAS_URI_ = f\"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data={DP_IPV4}\"\n", + "REAS_URI_ = f\"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data={DP_IPV4_ADDR}\"\n", "reas_uri = e2sar_py.EjfatURI(uri=REAS_URI_, tt=e2sar_py.EjfatURI.TokenType.instance)\n", "\n", "# Make sure the token matches the one in the string\n", @@ -204,14 +204,9 @@ "print(f\" epoch_ms = {rflags.epoch_ms}\")\n", "print(f\" setPoint = {rflags.setPoint}\")\n", "print(f\" [Kp, Ki, Kd] = [{rflags.Kp}, {rflags.Ki}, {rflags.Kd}]\")\n", - "print(f\" dpV6 = {rflags.dpV6}\")\n", - "print(f\" cpV6 = {rflags.cpV6}\")\n", + "print(f\" [weight, min_factor, max_factor] = [{rflags.weight}, {rflags.min_factor}, {rflags.max_factor}]\")\n", "print(f\" portRange = {rflags.portRange}\")\n", - "print(f\" withLBHeader = {rflags.withLBHeader}\")\n", - "\n", - "# Init the reassembler object\n", - "# print(type(reas_uri), type(1), type(rflags))\n", - "reas = e2sar_py.DataPlane.Reassembler(reas_uri, 1, rflags)\n" + "print(f\" withLBHeader = {rflags.withLBHeader}\")" ] }, { @@ -219,6 +214,17 @@ "execution_count": 8, "metadata": {}, "outputs": [], + "source": [ + "# Init the reassembler object\n", + "reas = e2sar_py.DataPlane.Reassembler(\n", + " reas_uri, e2sar_py.IPAddress.from_string(DP_IPV4_ADDR), DP_IPV4_PORT, 1, rflags)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], "source": [ "\n", "res = reas.OpenAndStart() # the DP address must be available\n", @@ -227,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -252,7 +258,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -268,7 +274,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -308,7 +314,7 
@@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -357,12 +363,56 @@ " continue\n", " recv = recv_bytes_list[0].decode('utf-8')\n", " print(f\" recv_buf:\\t {recv}\")\n", - " assert(recv_data_id == data_id)\n", + " assert(recv_data_id == DATA_ID)\n", " print(f\" bufLen:\\t {recv_event_len}\")\n", " print(f\" eventNum:\\t {recv_event_num}\")\n", " print(f\" dataId:\\t {recv_data_id}\")\n" ] }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "reas_status = reas.getStats()\n", + "assert(reas_status[0] == 0) # no losses\n", + "assert(reas_status[1] == 5) # all succeeded\n", + "assert(reas_status[2] == 0) # no errno\n", + "assert(reas_status[3] == 0) # no grpc errors\n", + "assert(reas_status[4] == 0) # no data errors\n", + "assert(reas_status[5] == e2sar_py.E2SARErrorc.NoError) # no error" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check if there is any lost event." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NO EVENT LOSS\n" + ] + } + ], + "source": [ + "if reas.get_LostEvent().has_error():\n", + " print(\"NO EVENT LOSS\")\n", + "else:\n", + " print(f\"LOST EVENT: ({reas.get_LostEvent().value()[0]}, {reas.get_LostEvent().value()[1]})\")\n", + "\n", + "assert(reas.get_LostEvent().error().code == e2sar_py.E2SARErrorc.NotFound)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -374,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -411,22 +461,22 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "SEG_URI2 = f\"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data={DP_IPV4}\"\n", + "SEG_URI2 = f\"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data={DP_IPV4_ADDR}:{DP_IPV4_PORT}\"\n", "seg_uri2 = e2sar_py.EjfatURI(uri=SEG_URI2, tt=e2sar_py.EjfatURI.TokenType.instance)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# Initialize a new Segmenter objects\n", - "seg2 = e2sar_py.DataPlane.Segmenter(seg_uri2, data_id, eventSrc_id, sflags2)\n", + "seg2 = e2sar_py.DataPlane.Segmenter(seg_uri2, DATA_ID, EVENTSRC_ID, sflags2)\n", "\n", "res = seg2.OpenAndStart()\n", "assert(res.value() == 0)\n", @@ -439,7 +489,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -480,7 +530,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -528,12 +578,47 @@ " continue\n", " recv = recv_bytes_list[0].decode('utf-8')\n", " print(f\" recv_buf:\\t {recv}\")\n", - " assert(recv_data_id == data_id)\n", + " assert(recv_data_id == DATA_ID)\n", " print(f\" bufLen:\\t 
{recv_event_len}\")\n", " print(f\" eventNum:\\t {recv_event_num}\")\n", " print(f\" dataId:\\t {recv_data_id}\")" ] }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, 10, 0, 0, 0, )\n", + "NO EVENT LOSS\n" + ] + } + ], + "source": [ + "# Validation\n", + "reas_status = reas.getStats()\n", + "print(reas_status)\n", + "\n", + "assert(reas_status[0] == 0) # no losses\n", + "# assert(reas_status[1] == 5) # NOTE: this number is accumulated. So it's not always 5. \n", + "assert(reas_status[2] == 0) # no errno\n", + "assert(reas_status[3] == 0) # no grpc errors\n", + "assert(reas_status[4] == 0) # no data errors\n", + "assert(reas_status[5] == e2sar_py.E2SARErrorc.NoError) # no error\n", + "\n", + "\n", + "if reas.get_LostEvent().has_error():\n", + " print(\"NO EVENT LOSS\")\n", + "else:\n", + " print(f\"LOST EVENT: ({reas.get_LostEvent().value()[0]}, {reas.get_LostEvent().value()[1]})\")\n", + "\n", + "assert(reas.get_LostEvent().error().code == e2sar_py.E2SARErrorc.NotFound)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -545,7 +630,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -563,7 +648,8 @@ "# Create reassembler with 1 recv thread\n", "\n", "rflags_m = e2sar_py.DataPlane.ReassemblerFlags()\n", - "reas_a = e2sar_py.DataPlane.Reassembler(reas_uri, 1, rflags_m)\n", + "reas_a = e2sar_py.DataPlane.Reassembler(\n", + " reas_uri, e2sar_py.IPAddress.from_string(DP_IPV4_ADDR), 19522, 1, rflags_m)\n", "\n", "print(\"This reassembler has\")\n", "print(f\" {reas_a.get_numRecvThreads()} threads;\")\n", @@ -577,7 +663,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -593,7 +679,8 @@ ], "source": [ "# Create reassembler with 4 recv threads\n", - "reas_b = e2sar_py.DataPlane.Reassembler(reas_uri, 4, rflags_m)\n", + "reas_b = 
e2sar_py.DataPlane.Reassembler(\n", + " reas_uri, e2sar_py.IPAddress.from_string(DP_IPV4_ADDR), 19522, 4, rflags_m)\n", "\n", "print(\"This reassembler has\")\n", "print(f\" {reas_b.get_numRecvThreads()} threads;\")\n", @@ -607,7 +694,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -623,7 +710,8 @@ ], "source": [ "# Create reassembler with 7 recv threads\n", - "reas_c = e2sar_py.DataPlane.Reassembler(reas_uri, 7, rflags_m)\n", + "reas_c = e2sar_py.DataPlane.Reassembler(\n", + " reas_uri, e2sar_py.IPAddress.from_string(DP_IPV4_ADDR), 19522, 7, rflags_m)\n", "\n", "print(\"This reassembler has\")\n", "print(f\" {reas_c.get_numRecvThreads()} threads;\")\n", @@ -637,7 +725,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -655,7 +743,8 @@ "# 4 threads with portRange override\n", "rflags_m.portRange = 10\n", "\n", - "reas_d = e2sar_py.DataPlane.Reassembler(reas_uri, 4, rflags_m)\n", + "reas_d = e2sar_py.DataPlane.Reassembler(\n", + " reas_uri, e2sar_py.IPAddress.from_string(DP_IPV4_ADDR), 19522, 4, rflags_m)\n", "\n", "print(\"This reassembler has\")\n", "print(f\" {reas_d.get_numRecvThreads()} threads;\")\n", @@ -669,7 +758,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -687,7 +776,8 @@ "# 4 threads with low portRange override\n", "rflags_m.portRange = 1\n", "\n", - "reas_e = e2sar_py.DataPlane.Reassembler(reas_uri, 4, rflags_m)\n", + "reas_e = e2sar_py.DataPlane.Reassembler(\n", + " reas_uri, e2sar_py.IPAddress.from_string(DP_IPV4_ADDR), 19522, 4, rflags_m)\n", "\n", "print(\"This reassembler has\")\n", "print(f\" {reas_e.get_numRecvThreads()} threads;\")\n", @@ -710,7 +800,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -731,7 +821,7 @@ "source": [ "# Create 4 segmenter objects\n", "\n", - "SEG_URI_BASE = 
SEG_URI\n", + "SEG_URI_BASE = f\"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data={DP_IPV4_ADDR}\"\n", "SEG_URI_PORT_BASE = 19522\n", "\n", "sflags = e2sar_py.DataPlane.SegmenterFlags()\n", @@ -755,14 +845,14 @@ "seg_obj_list = []\n", "for i in range(4):\n", " seg_obj_list.append(create_seg_obj(\n", - " i, data_id, eventSrc_id, sflags))\n", + " i, DATA_ID, EVENTSRC_ID, sflags))\n", "assert len(seg_obj_list) == 4\n", "\n" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -784,7 +874,8 @@ "rflags.withLBHeader = True # LB header will be attached since there is no LB\n", "rflags.portRange = 2 # use 2^portRange=4 ports\n", "\n", - "reas = e2sar_py.DataPlane.Reassembler(reas_uri, 1, rflags)\n", + "reas = e2sar_py.DataPlane.Reassembler(\n", + " reas_uri, e2sar_py.IPAddress.from_string(DP_IPV4_ADDR), 19522, 1, rflags)\n", "print(\"This reassembler has\")\n", "print(f\" {reas.get_numRecvThreads()} threads;\")\n", "print(f\" listening on ports {reas.get_recvPorts()[0]}:{reas.get_recvPorts()[1]};\")\n", @@ -793,7 +884,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -819,7 +910,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -832,7 +923,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -845,7 +936,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -893,7 +984,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -1027,6 +1118,135 @@ "assert(res[0] == 0) # no losses\n", "# assert(res[1] == 20) # hold for the 1st try" ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", 
+ "text": [ + "NO EVENT LOSS\n" + ] + } + ], + "source": [ + "# Validation\n", + "if reas.get_LostEvent().has_error():\n", + " print(\"NO EVENT LOSS\")\n", + "else:\n", + " print(f\"LOST EVENT: ({reas.get_LostEvent().value()[0]}, {reas.get_LostEvent().value()[1]})\")\n", + "\n", + "assert(reas.get_LostEvent().error().code == e2sar_py.E2SARErrorc.NotFound)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DPReasTest5\n", + "\n", + "Examples of reading SegmenterFlags from INI files." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Init ReassemblerFlag successfully!\n" + ] + } + ], + "source": [ + "# Reassembler\n", + "res_flag = e2sar_py.DataPlane.ReassemblerFlags.getFromINI(\n", + " '/home/ubuntu/dev-e2sar/reassembler_config.ini'\n", + ")\n", + "\n", + "if res_flag.has_error():\n", + " print(f\"Read from ini file failed: {res_flag.error()}\")\n", + "else:\n", + " print(\"Init ReassemblerFlag successfully!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reassembler flags:\n", + " period_ms=100\n", + " useCP=True\n", + " validateCert = True\n", + " epoch_ms = 1000\n", + " setPoint = 0.0\n", + " [Kp, Ki, Kd] = [0.0, 0.0, 2.0]\n", + " [weight, min_factor, max_factor] = [1.0, 0.5, 2.0]\n", + " portRange = -1\n", + " withLBHeader = False\n" + ] + } + ], + "source": [ + "# Validation\n", + "rflags = res_flag.value()\n", + "\n", + "assert(rflags.useCP == True)\n", + "assert(rflags.validateCert == True)\n", + "assert(rflags.portRange == -1)\n", + "\n", + "print(\"Reassembler flags:\")\n", + "print(f\" period_ms={rflags.period_ms}\") # should be 100 according to the C++ constructor\n", + "print(f\" useCP={rflags.useCP}\")\n", + "print(f\" validateCert = {rflags.validateCert}\")\n", + "print(f\" epoch_ms = 
{rflags.epoch_ms}\")\n", + "print(f\" setPoint = {rflags.setPoint}\")\n", + "print(f\" [Kp, Ki, Kd] = [{rflags.Kp}, {rflags.Ki}, {rflags.Kd}]\")\n", + "print(f\" [weight, min_factor, max_factor] = [{rflags.weight}, {rflags.min_factor}, {rflags.max_factor}]\")\n", + "print(f\" portRange = {rflags.portRange}\")\n", + "print(f\" withLBHeader = {rflags.withLBHeader}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Init SegmenterFlag successfully!\n" + ] + } + ], + "source": [ + "# Segmenter\n", + "res_flag = e2sar_py.DataPlane.SegmenterFlags.getFromINI(\n", + " '/home/ubuntu/dev-e2sar/segmenter_config.ini'\n", + ")\n", + "\n", + "if res_flag.has_error():\n", + " print(f\"Read from ini file failed: {res_flag.error()}\")\n", + "else:\n", + " print(\"Init SegmenterFlag successfully!\")\n", + "\n", + "sflags = res_flag.value()\n", + "assert(sflags.useCP == True)\n", + "assert(sflags.syncPeriodMs == 1000)\n", + "assert(sflags.syncPeriods == 2)" + ] } ], "metadata": { diff --git a/scripts/notebooks/pybind11_examples/example_EjfatURI.ipynb b/scripts/notebooks/pybind11_examples/example_EjfatURI.ipynb index 2e51157c..745fb4b6 100644 --- a/scripts/notebooks/pybind11_examples/example_EjfatURI.ipynb +++ b/scripts/notebooks/pybind11_examples/example_EjfatURI.ipynb @@ -13,25 +13,17 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.1.1\n" - ] - } - ], + "outputs": [], "source": [ "import sys\n", "\n", "# IMPORTANT: Adjust the path to your built Python module as necessary\n", "sys.path.append(\n", - " '/home/ubuntu/E2SAR/build/src/pybind')\n", + " '/home/ubuntu/dev-e2sar/build/src/pybind')\n", "\n", "import e2sar_py\n", "\n", - "print(e2sar_py.get_version())" + "# print(e2sar_py.get_version())" ] }, { @@ -279,7 +271,7 @@ "name": "stdout", "output_type": "stream", "text": [ - 
"\n", + "\n", "Instance Token: token123\n" ] } diff --git a/scripts/notebooks/pybind11_examples/example_Headers.ipynb b/scripts/notebooks/pybind11_examples/example_Headers.ipynb index f91a9015..7105219a 100644 --- a/scripts/notebooks/pybind11_examples/example_Headers.ipynb +++ b/scripts/notebooks/pybind11_examples/example_Headers.ipynb @@ -11,47 +11,27 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.1.1\n" - ] - } - ], + "outputs": [], "source": [ "import sys\n", "\n", "## IMPORTANT: Update the path to your built Python module. Prefer absolute path.\n", "sys.path.append(\n", - " '/home/ubuntu/E2SAR/build/src/pybind')\n", + " '/home/ubuntu/dev-e2sar/build/src/pybind')\n", "\n", "import e2sar_py\n", "\n", "# Get the version\n", - "print(e2sar_py.get_version())\n" + "# print(e2sar_py.get_version())\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Default data plane port: 19522\n", - "Default Reassembler Header version: 1\n", - "Default Reassembler Header nibble: 16\n", - "Default Load Balancer Header version: 2\n", - "Default Sync Header version: 1\n" - ] - } - ], + "outputs": [], "source": [ "# Print the constant atrributes\n", "print(f\"Default data plane port: {e2sar_py._dp_port}\")\n", @@ -63,20 +43,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "IP header length: 20\n", - "UDP header length: 8\n", - "Total header length: 64\n", - "LB + RE header length: 36\n" - ] - } - ], + "outputs": [], "source": [ "\n", "# The Hdr lengths\n", @@ -97,22 +66,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "Before setting fields: (0, 0, 0, 0)\n", - "After setting fields: (1, 2, 4, 8)\n", - " data_id=1\n", - " buff_off=2\n", - " buff_len=4\n", - " event_num=8\n" - ] - } - ], + "outputs": [], "source": [ "# Setting the Reaasembler header fields\n", "rehdr = e2sar_py.REHdr()\n", @@ -140,22 +96,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Before (2, 1, 0, 0)\n", - "2\n", - "1\n", - "0\n", - "0\n", - "After (2, 1, 200, 50)\n" - ] - } - ], + "outputs": [], "source": [ "# Load balancer header\n", "lbhdr = e2sar_py.LBHdr()\n", diff --git a/scripts/scapy/snifgen.py b/scripts/scapy/snifgen.py index f0053a85..a61e7fde 100755 --- a/scripts/scapy/snifgen.py +++ b/scripts/scapy/snifgen.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# this is a script for sniffing and generating various UDP packet formats withing EJ-FAT: +# this is a script for sniffing, parsing PCAP and generating various UDP packet formats within EJ-FAT: # - sync packets # - data packets with LB+RE header (pre-load-balancer) # - data packets with RE header only (post-load-balancer) @@ -12,16 +12,19 @@ from typing import List +from datetime import datetime + from scapy.all import * from scapy.packet import Packet, bind_layers from scapy.fields import ShortField, StrLenField + # Sync header class SyncPacket(Packet): name = "SyncPacket" fields_desc = [ StrFixedLenField('preamble', 'LC', 2), - XByteField('version', 1), + XByteField('version', 2), XByteField('reserved', 0), IntField('eventSrcId', 0), LongField('eventNumber', 0), @@ -43,6 +46,20 @@ class LBPacket(Packet): ] LBPacketLength = 2 + 1 + 1 + 2 + 2 + 8 +class TruncatedStrLenField(StrLenField): + __slots__ = ["trunc"] + + def __init__(self, name, default, length_from, truncate_to=10): + # Add a truncate_to parameter to control the number of characters to show + super().__init__(name, default, 
length_from=length_from) + self.trunc = truncate_to + + def i2repr(self, pkt, x): + if x is None: + return "" + # Truncate the string to `self.truncate_to` characters for display + return repr(x[:self.trunc].decode("utf-8") + ("..." if len(x) > self.trunc else "")) + # RE header itself class REPacket(Packet): name = "REPacket" @@ -54,7 +71,7 @@ class REPacket(Packet): IntField('bufferOffset', 0), IntField('bufferLength', 0), # note this is Event Length LongField('eventNumber', 0), - StrLenField('pld', '', length_from=lambda p: p.bufferLength) + TruncatedStrLenField('pld', '', length_from=lambda p: p.bufferLength, truncate_to=20) ] REPacketLength = 1 + 1 + 2 + 4 + 4 + 8 @@ -68,9 +85,13 @@ def bind_lb_hdr(dport): bind_layers(UDP, LBPacket, dport=dport) bind_layers(LBPacket, REPacket) -# bind RE header to specified port +# bind RE header to specified ports def bind_re_hdr(dport): - bind_layers(UDP, REPacket, dport=dport) + if (isinstance(dport, list)): + for p in dport: + bind_layers(UDP, REPacket, dport=p) + else: + bind_layers(UDP, REPacket, dport=dport) # generate a packet with specific field values def genSyncPkt(ip_addr: str, udp_dport: int, eventSrcId: int, eventNumber: int, avgEventRateHz: int) -> Packet: @@ -85,7 +106,7 @@ def genLBREPkt(ip_addr: str, udp_port: int, entropy: int, dataId: int, eventNumb max_segment_len = mtu - LBPacketLength - REPacketLength if max_segment_len < 0: print(f'MTU of {mtu} too short to accommodate LB header {LBPacketLength} and RE header {REPacketLength}') - sys.exit() + sys.exit(-1) segment_offset = 0 segment_end = len(payload) if len(payload) < max_segment_len else max_segment_len while segment_offset < len(payload): @@ -108,7 +129,7 @@ def genREPkt(ip_addr: str, udp_port: int, dataId: int, eventNumber: int, payload max_segment_len = mtu - REPacketLength if max_segment_len < 0: print(f'MTU of {mtu} too short to accommodate RE header {REPacketLength}') - sys.exit() + sys.exit(-1) segment_offset = 0 segment_end = len(payload) if 
len(payload) < max_segment_len else max_segment_len while segment_offset < len(payload): @@ -127,8 +148,8 @@ def genREPkt(ip_addr: str, udp_port: int, dataId: int, eventNumber: int, payload def validate_sync_packet(packet): if not (packet.preamble == b'LC'): return False, f"Preamble must be 'LC' instead of {packet.preamble}" - if not (packet.version == 1): - return False, f"Expected version number 1, not {packet.version}" + if not (packet.version == 2): + return False, f"Expected version number 2, not {packet.version}" return True, "" # validate LB packet @@ -149,6 +170,9 @@ def validate_re_packet(packet): # generic callback for all headers def packet_callback(packet): + if Raw in packet: + del packet[Raw] + if SyncPacket in packet: valid, error_msg = validate_sync_packet(packet[SyncPacket]) if valid: @@ -158,14 +182,13 @@ def packet_callback(packet): elif LBPacket in packet: valid, error_msg = validate_lb_packet(packet[LBPacket]) if valid: - packet[IP].show() - else: - print(f"LB packet validation error: {error_msg}") - valid, error_msg = validate_re_packet(packet[REPacket]) - if valid: - packet[IP].show() + valid, error_msg = validate_re_packet(packet[REPacket]) + if valid: + packet[IP].show() + else: + print(f"RE packet validation error: {error_msg}") else: - print(f"RE packet validation error: {error_msg}") + print(f"LB packet validation error: {error_msg}") elif REPacket in packet: valid, error_msg = validate_re_packet(packet[REPacket]) if valid: @@ -181,43 +204,66 @@ def packet_callback(packet): operations = parser.add_mutually_exclusive_group(required=True) operations.add_argument("-l", "--listen", action="store_true", help="listen for incoming packets and try to parse and validate them") operations.add_argument("-g", "--generate", action="store_true", help="generate new packets of specific types") - parser.add_argument("-p", "--port", action="store", help="UDP port (for -l and -g)", default=19522, type=int) - parser.add_argument("-c", "--count", 
action="store", help="number of packet streams (if pld larger than mtu, otherwise packets) to generate or expect", default=10, type=int) + operations.add_argument("-a", "--parse", action="store_true", help="parse a pcap file. The recommended way to capture is something like this 'sudo tcpdump -s 200 -tttt -i enp7s0 udp \\( dst port 19522 or dst port 19523 \\) -w e2sar.pcap'") + parser.add_argument("-p", "--port", action="store", default=19522, type=int, help="UDP data port (only port for --lbre, starting port for --re)") + parser.add_argument("-y", "--syncport", action="store", help="UDP sync port", default=19010, type=int) + parser.add_argument("-n", "--nports", action="store", type=int, default=1, help="number of ports starting with -p to listen on for --re") + parser.add_argument("-c", "--count", action="store", default=10, type=int, help="number of events (if pld larger than mtu, otherwise packets) to generate or expect or parse") parser.add_argument("--ip", action="store", help="IP address to which to send the packet(s) or listen from") parser.add_argument("--show", action="store_true", default=False, help="only show the packet without sending it (with -g)") - parser.add_argument("--entropy", action="store", default=0, help="entropy value for LB+RE packet", type=int) - parser.add_argument("--event", action="store", default=0, help="event number for sync, LB+RE and RE packet", type=int) + parser.add_argument("--entropy", action="store", default=0, type=int, help="entropy value for LB+RE packet") + parser.add_argument("--event", action="store", default=0, type=int, help="event number for sync, LB+RE and RE packet") parser.add_argument("--rate", action="store", default=100, type=int, help="event rate in Hz for Sync packet") parser.add_argument("--dataid", action="store", default=1, type=int, help="data id for RE packet") parser.add_argument("--srcid", action="store", default=1, type=int, help="source id for Sync packet") parser.add_argument("--mtu", 
action="store", type=int, default=1500, help="set the MTU length, so LB+RE and RE packets can be fragmented.") - parser.add_argument("--pld", action="store", help="payload for LB+RE or RE packets. May be broken up if MTU size insufficient") + parser.add_argument("--pld", action="store", help="payload for LB+RE or RE packets. May be broken up if MTU size insufficient", default="This is a default payload.") parser.add_argument("--iface", action="store", default="all", help="which interface should we listen on (defaults to all)") + parser.add_argument("-f", "--file", action="store", help="pcap file name to parse", default="./e2sar.pcap") packet_types = parser.add_mutually_exclusive_group(required=True) - packet_types.add_argument("--sync", action="store_true", help="listen for or generate sync packets") - packet_types.add_argument("--lbre", action="store_true", help="listen for or generate packets with LB+RE header") - packet_types.add_argument("--re", action="store_true", help="listen for or generate packets with just the RE header") + packet_types.add_argument("--sync", action="store_true", help="listen for, parse or generate Sync packets") + packet_types.add_argument("--lbre", action="store_true", help="listen for, parse or generate packets with LB+RE header") + packet_types.add_argument("--re", action="store_true", help="listen for, parse or generate packets with just the RE header") + packet_types.add_argument("--lbresync", action="store_true", help="listen for or parse LB+RE and Sync packets") args = parser.parse_args() + # equivalent to -n - not to resolve port numbers to service names + conf.noenum.add(UDP.sport) + conf.noenum.add(UDP.dport) + if args.generate: if not args.ip: print(f'--ip option is required (use dotted notation to specify IPv4 address)') - sys.exit() + sys.exit(-1) # generate and show a packet + if args.lbresync: + print('Invalid option combination -g/--generate and --lbresync - multiple layers are bound, pick one') + sys.exit(-1) + + if 
args.sync: + # sync packets + bind_sync_hdr(args.syncport) + print(f'Generating Sync(port {args.syncport}) packets for mtu {args.mtu}') + elif args.lbre: + # lb+re packets + bind_lb_hdr(args.port) + print(f'Generating LBRE(port {args.port}) packets for mtu {args.mtu}') + elif args.re: + # re packets (only one port) + bind_re_hdr(args.port) + print(f'Generating RE(port {args.port}) packets for mtu {args.mtu}') + packets = list() for i in range(args.count): if args.sync: # sync packets - bind_sync_hdr(args.port) packets.append(genSyncPkt(args.ip, args.port, args.srcid, args.event + i, args.rate)) elif args.lbre: # lb+re packets - bind_lb_hdr(args.port) packets.extend(genLBREPkt(args.ip, args.port, args.entropy, args.dataid, args.event, args.pld, args.mtu)) elif args.re: - # re packets - bind_re_hdr(args.port) + # re packets (only one port) packets.extend(genREPkt(args.ip, args.port, args.dataid, args.event, args.pld, args.mtu)) if args.show: @@ -229,22 +275,32 @@ def packet_callback(packet): send(p) elif args.listen: # craft a filter + listeningPorts = [x + args.port for x in range(0, args.nports)] + portFilter = "or".join([f" dst port {port} " for port in listeningPorts]) if args.ip: - filter = f'ip dst host {args.ip} and udp dst port {args.port}' + filter = f'udp and dst host {args.ip} and \\( {portFilter} \\)' else: - filter=f'udp dst port {args.port}' + filter=f'udp {portFilter}' if args.sync: # sync packets - bind_sync_hdr(args.port) - packet_type = 'Sync' + bind_sync_hdr(args.syncport) + packet_type = f'Sync(port {args.syncport})' elif args.lbre: # lb+re packets bind_lb_hdr(args.port) - packet_type = 'LB+RE' + packet_type = f'LB+RE(port {args.port})' elif args.re: - bind_re_hdr(args.port) - packet_type = 'RE' + # multiple ports possible + portList = [x + args.port for x in range(0, args.nports)] + bind_re_hdr(portList) + packet_type = f'RE(ports {portList})' + elif args.lbresync: + # sync packets + bind_sync_hdr(args.syncport) + # lb+re packets on a single port 
+ bind_lb_hdr(args.port) + packet_type = f'LB+RE(port {args.port}) + Sync(port {args.syncport})' # Linux supports "any" shortcut interface specification, but other OSs may not, so we just list # all the interfaces first @@ -256,5 +312,43 @@ def packet_callback(packet): # Start sniffing for packets sniff(iface=interfaces, filter=filter, prn=packet_callback, count=args.count) + + elif args.parse: + + if args.sync: + # sync packets + bind_sync_hdr(args.syncport) + packet_type = f'Sync(port {args.syncport})' + elif args.lbre: + # lb+re packets + bind_lb_hdr(args.port) + packet_type = f'LB+RE(port {args.port})' + elif args.re: + # multiple ports possible + portList = [x + args.port for x in range(0, args.nports)] + bind_re_hdr(portList) + packet_type = f'RE(ports {portList})' + elif args.lbresync: + # sync packets + bind_sync_hdr(args.syncport) + # lb+re packets on a single port + bind_lb_hdr(args.port) + packet_type = f'LB+RE(port {args.port}) + Sync(port {args.syncport})' + + print(f'Looking for {packet_type} packets in PCAP file {args.file}') + try: + if (args.count != 0): + packets = rdpcap(args.file, count=args.count) + else: + packets = rdpcap(args.file) + + for packet in packets: + packet_time = packet.time + if packet_time is not None: + human_readable_time = datetime.fromtimestamp(float(packet.time)) + print(f"Timestamp: {human_readable_time}") + packet_callback(packet) + except Exception as e: + print(f'Unable to parse file {args.file} due to exception: {e}') print('Finished') diff --git a/segmenter_config.ini b/segmenter_config.ini new file mode 100644 index 00000000..cf5b297d --- /dev/null +++ b/segmenter_config.ini @@ -0,0 +1,26 @@ +[general] +; enable control plane to send Sync packets +useCP = true + +[control-plane] +; sync thread period in milliseconds +syncPeriodMS = 1000 +; number of sync periods to use for averaging reported send rate +syncPeriods = 2 + +[data-plane] +; prefer V6 dataplane if the URI specifies both data=&data= addresses +dpV6 = false 
+; use zeroCopy send optimization +zeroCopy = false +; use connected sockets +connectedSocket = true +; size of the MTU to attempt to fit the segmented data in (must accommodate IP, UDP +; and LBRE headers) +mtu = 1500 +; number of sockets/source ports we will be sending data from. +; The more, the more randomness the LAG will see in delivering to different FPGA ports +numSendSockets = 4 +; socket buffer size for sending set via SO_SNDBUF setsockopt. +; Note that this requires systemwide max set via sysctl (net.core.wmem_max) to be higher +sndSocketBufSize = 3145728 \ No newline at end of file diff --git a/src/e2sarCP.cpp b/src/e2sarCP.cpp index 8f430f0e..4ae576cb 100644 --- a/src/e2sarCP.cpp +++ b/src/e2sarCP.cpp @@ -709,7 +709,7 @@ namespace e2sar result LBManager::makeSslOptionsFromFiles(std::string_view pem_root_certs, std::string_view pem_private_key, - std::string_view pem_cert_chain) + std::string_view pem_cert_chain) noexcept { auto root = read_file(pem_root_certs); auto priv = read_file(pem_private_key); @@ -726,7 +726,7 @@ namespace e2sar // just the server root cert (useful for self-signed) result LBManager::makeSslOptionsFromFiles( - std::string_view pem_root_certs) + std::string_view pem_root_certs) noexcept { auto root = read_file(pem_root_certs); diff --git a/src/e2sarDPReassembler.cpp b/src/e2sarDPReassembler.cpp index 5abe6538..39537b05 100644 --- a/src/e2sarDPReassembler.cpp +++ b/src/e2sarDPReassembler.cpp @@ -1,5 +1,8 @@ #include #include +#include +#include +#include #include "portable_endian.h" @@ -31,17 +34,18 @@ namespace e2sar error, integral); // control output } - Reassembler::Reassembler(const EjfatURI &uri, std::vector cpuCoreList, + Reassembler::Reassembler(const EjfatURI &uri, ip::address data_ip, u_int16_t starting_port, + std::vector cpuCoreList, const ReassemblerFlags &rflags): dpuri(uri), - lbman(dpuri, rflags.validateCert), + lbman(dpuri, rflags.validateCert, rflags.useHostAddress), epochMs{rflags.epoch_ms}, 
setPoint{rflags.setPoint}, Kp{rflags.Kp}, Ki{rflags.Ki}, Kd{rflags.Kd}, + weight{rflags.weight}, min_factor{rflags.min_factor}, max_factor{rflags.max_factor}, pidSampleBuffer(rflags.epoch_ms/rflags.period_ms), // ring buffer size (usually 10 = 1sec/100ms) cpuCoreList{cpuCoreList}, - dataIP{(rflags.dpV6 ? uri.get_dataAddrv6().value().first : uri.get_dataAddrv4().value().first)}, - dataPort{(rflags.dpV6 ? uri.get_dataAddrv6().value().second : uri.get_dataAddrv4().value().second)}, - dpV6{rflags.dpV6}, + dataIP{data_ip}, + dataPort{starting_port}, portRange{rflags.portRange != -1 ? rflags.portRange : get_PortRange(cpuCoreList.size())}, numRecvThreads{cpuCoreList.size()}, // as many as there are cores numRecvPorts{static_cast(portRange > 0 ? 2 << (portRange - 1): 1)}, @@ -50,7 +54,7 @@ namespace e2sar eventTimeout_ms{rflags.eventTimeout_ms}, rcvSocketBufSize{rflags.rcvSocketBufSize}, condLock{recvThreadMtx}, - sendStateThreadState(*this, rflags.cpV6, rflags.period_ms), + sendStateThreadState(*this, rflags.period_ms), useCP{rflags.useCP} { sanityChecks(); @@ -64,17 +68,17 @@ namespace e2sar assignPortsToThreads(); } - Reassembler::Reassembler(const EjfatURI &uri, size_t numRecvThreads, - const ReassemblerFlags &rflags): + Reassembler::Reassembler(const EjfatURI &uri, ip::address data_ip, u_int16_t starting_port, + size_t numRecvThreads, const ReassemblerFlags &rflags): dpuri(uri), - lbman(dpuri, rflags.validateCert), + lbman(dpuri, rflags.validateCert, rflags.useHostAddress), epochMs{rflags.epoch_ms}, setPoint{rflags.setPoint}, Kp{rflags.Kp}, Ki{rflags.Ki}, Kd{rflags.Kd}, + weight{rflags.weight}, min_factor{rflags.min_factor}, max_factor{rflags.max_factor}, pidSampleBuffer(rflags.epoch_ms/rflags.period_ms), // ring buffer size (usually 10 = 1sec/100ms) cpuCoreList{std::vector()}, // no core list given - dataIP{(rflags.dpV6 ? uri.get_dataAddrv6().value().first : uri.get_dataAddrv4().value().first)}, - dataPort{(rflags.dpV6 ? 
uri.get_dataAddrv6().value().second : uri.get_dataAddrv4().value().second)}, - dpV6{rflags.dpV6}, + dataIP{data_ip}, + dataPort{starting_port}, portRange{rflags.portRange != -1 ? rflags.portRange : get_PortRange(numRecvThreads)}, numRecvThreads{numRecvThreads}, numRecvPorts{static_cast(portRange > 0 ? 2 << (portRange - 1): 1)}, @@ -83,7 +87,7 @@ namespace e2sar eventTimeout_ms{rflags.eventTimeout_ms}, rcvSocketBufSize{rflags.rcvSocketBufSize}, condLock{recvThreadMtx}, - sendStateThreadState(*this, rflags.cpV6, rflags.period_ms), + sendStateThreadState(*this, rflags.period_ms), useCP{rflags.useCP} { sanityChecks(); @@ -132,8 +136,6 @@ namespace e2sar void Reassembler::RecvThreadState::_threadBody() { - std::set activeEventNumbers{}; - while(!reas.threadsStop) { fd_set curSet{fdSet}; @@ -149,13 +151,14 @@ namespace e2sar auto inWaiting = nowT - it->second->firstSegment; auto inWaiting_ms = boost::chrono::duration_cast(inWaiting); if (inWaiting_ms > boost::chrono::milliseconds(reas.eventTimeout_ms)) { + // check if this event number has been seen as lost + logLostEvent(it->first); // deallocate event (ood queue and event buffer) it->second->cleanup(recvBufferPool); delete it->second->event; // deallocate queue item delete it->second; it = eventsInProgress.erase(it); // erase returns the next element (or end()) - reas.recvStats.enqueueLoss++; } else { ++it; // Just advance the iterator if no deletion } @@ -294,7 +297,14 @@ namespace e2sar eventsInProgress.erase(std::make_pair(item->eventNum, item->dataId)); // queue it up for the user to receive - reas.enqueue(item); + auto ret = reas.enqueue(item); + // event lost on enqueuing + if (ret == 1) + { + logLostEvent(std::make_pair(item->eventNum, item->dataId)); + // free up the item + delete item; + } // update statistics reas.recvStats.eventSuccess++; @@ -391,13 +401,24 @@ namespace e2sar void Reassembler::SendStateThreadState::_threadBody() { + // get the time + auto nowT = boost::chrono::high_resolution_clock::now(); + 
auto nowUsec = boost::chrono::duration_cast(nowT.time_since_epoch()).count(); + UnixTimeMicro_t currentTimeMicros = static_cast(nowUsec); + + // create first PID sample with 0 error and integral values + PIDSample newSample{currentTimeMicros, 0.0, 0.0}; + // push a new entry onto the circular buffer ejecting the oldest + reas.pidSampleBuffer.push_back(newSample); + + // wait before entering the loop + auto until = nowT + boost::chrono::milliseconds(period_ms); + boost::this_thread::sleep_until(until); + while(!reas.threadsStop) { // periodically send state to control plane sampling the queue state - // Get the current time point - auto nowT = boost::chrono::high_resolution_clock::now(); - // principle of operation: // CP requires PID signal and queue fill state in order to come up // with a schedule for a new epoch. The CP and receiver nodes running @@ -414,47 +435,45 @@ namespace e2sar // // fillPercent is always reported as sampled in the current moment - // if there are no samples in the buffer just skip - if (reas.pidSampleBuffer.end() != reas.pidSampleBuffer.begin()) + // Get the current time point + auto nowT = boost::chrono::high_resolution_clock::now(); + auto nowUsec = boost::chrono::duration_cast(nowT.time_since_epoch()).count(); + UnixTimeMicro_t currentTimeMicros = static_cast(nowUsec); + + // at 100msec period and depth of 10 this should normally be about 1 sec + auto deltaTfloat = static_cast(currentTimeMicros - + reas.pidSampleBuffer.front().sampleTime)/1000000.; + + // sample queue state + auto fillPercent = static_cast(static_cast(reas.eventQueueDepth)/static_cast(reas.QSIZE)); + // get PID terms (PID value, error, integral accumulator) + auto PIDTuple = pid(reas.setPoint, fillPercent, + deltaTfloat, reas.Kp, reas.Ki, reas.Kd, + reas.pidSampleBuffer.front().error, + reas.pidSampleBuffer.back().integral); + + // create new PID sample using last error and integral accumulated value + PIDSample newSample{currentTimeMicros, PIDTuple.get<1>(), 
PIDTuple.get<2>()}; + // push a new entry onto the circular buffer ejecting the oldest + reas.pidSampleBuffer.push_back(newSample); + + // send update to CP + auto res = reas.lbman.sendState(fillPercent, PIDTuple.get<0>(), true); + if (res.has_error()) { - auto nowUsec = boost::chrono::duration_cast(nowT.time_since_epoch()).count(); - UnixTimeMicro_t currentTimeMicros = static_cast(nowUsec); - - // at 100msec period and depth of 10 this should normally be about 1 sec - auto deltaTfloat = static_cast(currentTimeMicros - - reas.pidSampleBuffer.begin()->sampleTime)/1000000.; - - // sample queue state - auto fillPercent = static_cast(static_cast(reas.eventQueueDepth)/static_cast(reas.QSIZE)); - // get PID terms (PID value, error, integral accumulator) - auto PIDTuple = pid(reas.setPoint, fillPercent, - deltaTfloat, reas.Kp, reas.Ki, reas.Kd, - reas.pidSampleBuffer.begin()->error, - reas.pidSampleBuffer.end()->integral); - - // create new PID sample using last error and integral accumulated value - PIDSample newSample{currentTimeMicros, PIDTuple.get<1>(), PIDTuple.get<2>()}; - // push a new entry onto the circular buffer ejecting the oldest - reas.pidSampleBuffer.push_back(newSample); - - // send update to CP - auto res = reas.lbman.sendState(fillPercent, PIDTuple.get<0>(), true); - if (res.has_error()) - { - // update error counts - reas.recvStats.grpcErrCnt++; - reas.recvStats.lastE2SARError = res.error().code(); - } + // update error counts + reas.recvStats.grpcErrCnt++; + reas.recvStats.lastE2SARError = res.error().code(); } + // sleep approximately so we wake up every ~100ms auto until = nowT + boost::chrono::milliseconds(period_ms); boost::this_thread::sleep_until(until); } } - result Reassembler::registerWorker(const std::string &node_name, float weight, - float min_factor, float max_factor) noexcept + result Reassembler::registerWorker(const std::string &node_name) noexcept { if (useCP) { @@ -523,4 +542,43 @@ namespace e2sar delete eventItem; return 0; } + + 
result Reassembler::ReassemblerFlags::getFromINI(const std::string &iniFile) noexcept + { + boost::property_tree::ptree paramTree; + Reassembler::ReassemblerFlags rFlags; + + try { + boost::property_tree::ini_parser::read_ini(iniFile, paramTree); + } catch(boost::property_tree::ini_parser_error &ie) { + return E2SARErrorInfo{E2SARErrorc::ParameterNotAvailable, + "Unable to parse the reassembler flags configuration file "s + iniFile}; + } + + // general + rFlags.useCP = paramTree.get("general.useCP", rFlags.useCP); + + // control plane + rFlags.useHostAddress = paramTree.get("control-plane.useHostAddress", rFlags.useHostAddress); + rFlags.validateCert = paramTree.get("control-plane.validateCert", rFlags.validateCert); + + // data plane + rFlags.portRange = paramTree.get("data-plane.portRange", rFlags.portRange); + rFlags.withLBHeader = paramTree.get("data-plane.withLBHeader", rFlags.withLBHeader); + rFlags.eventTimeout_ms = paramTree.get("data-plane.eventTimeoutMS", rFlags.eventTimeout_ms); + rFlags.rcvSocketBufSize = paramTree.get("data-plane.rcvSocketBufSize", rFlags.rcvSocketBufSize); + rFlags.epoch_ms = paramTree.get("data-plane.epochMS", rFlags.epoch_ms); + rFlags.period_ms = paramTree.get("data-plane.periodMS", rFlags.period_ms); + + // PID parameters + rFlags.setPoint = paramTree.get("pid.setPoint", rFlags.setPoint); + rFlags.Ki = paramTree.get("pid.Ki", rFlags.Ki); + rFlags.Kp = paramTree.get("pid.Kp", rFlags.Kp); + rFlags.Kd = paramTree.get("pid.Kd", rFlags.Kd); + rFlags.weight = paramTree.get("pid.weight", rFlags.weight); + rFlags.min_factor = paramTree.get("pid.min_factor", rFlags.min_factor); + rFlags.max_factor = paramTree.get("pid.max_factor", rFlags.max_factor); + + return rFlags; + } } diff --git a/src/e2sarDPSegmenter.cpp b/src/e2sarDPSegmenter.cpp index 4ac9f913..5814c2a2 100644 --- a/src/e2sarDPSegmenter.cpp +++ b/src/e2sarDPSegmenter.cpp @@ -1,5 +1,9 @@ #include #include +#include +#include +#include +#include #include "portable_endian.h" @@ -594,4 +598,39 @@ namespace e2sar
sendThreadCond.notify_one(); return 0; } + + result Segmenter::SegmenterFlags::getFromINI(const std::string &iniFile) noexcept + { + boost::property_tree::ptree paramTree; + Segmenter::SegmenterFlags sFlags; + + try { + boost::property_tree::ini_parser::read_ini(iniFile, paramTree); + } catch(boost::property_tree::ini_parser_error &ie) { + return E2SARErrorInfo{E2SARErrorc::ParameterNotAvailable, + "Unable to parse the segmenter flags configuration file "s + iniFile}; + } + + // general + sFlags.useCP = paramTree.get("general.useCP", sFlags.useCP); + + // control plane + sFlags.syncPeriods = paramTree.get("control-plane.syncPeriods", + sFlags.syncPeriods); + sFlags.syncPeriodMs = paramTree.get("control-plane.syncPeriodMS", + sFlags.syncPeriodMs); + + // data plane + sFlags.dpV6 = paramTree.get("data-plane.dpV6", sFlags.dpV6); + sFlags.zeroCopy = paramTree.get("data-plane.zeroCopy", sFlags.zeroCopy); + sFlags.connectedSocket = paramTree.get("data-plane.connectedSocket", + sFlags.connectedSocket); + sFlags.mtu = paramTree.get("data-plane.mtu", sFlags.mtu); + sFlags.numSendSockets = paramTree.get("data-plane.numSendSockets", + sFlags.numSendSockets); + sFlags.sndSocketBufSize = paramTree.get("data-plane.sndSocketBufSize", + sFlags.sndSocketBufSize); + + return sFlags; + } } diff --git a/src/e2sarNetUtil.cpp b/src/e2sarNetUtil.cpp index 4e228d1a..3b489f0f 100644 --- a/src/e2sarNetUtil.cpp +++ b/src/e2sarNetUtil.cpp @@ -3,7 +3,8 @@ namespace e2sar { /** - * Get MTU of a given interface + * Get MTU of a given interface. Used in constructors, so doesn't + * return error. 
*/ u_int16_t NetUtil::getMTU(const std::string &interfaceName) { // Default MTU @@ -14,10 +15,21 @@ namespace e2sar strcpy(ifr.ifr_name, interfaceName.c_str()); if (!ioctl(sock, SIOCGIFMTU, &ifr)) { mtu = ifr.ifr_mtu; - } + } close(sock); return mtu; } + + result NetUtil::getHostName() { + char nameBuf[255]; + + if (!gethostname(nameBuf, 255)) + { + std::string ret{nameBuf}; + return ret; + } else + return E2SARErrorInfo{E2SARErrorc::SystemError, "Unable to retrieve hostname"}; + } #ifdef NETLINK_CAPABLE /** * Get the outgoing interface and its MTU for a given IPv4 or IPv6 diff --git a/src/pybind/py_e2sar.cpp b/src/pybind/py_e2sar.cpp index 928771fe..8a4aa218 100644 --- a/src/pybind/py_e2sar.cpp +++ b/src/pybind/py_e2sar.cpp @@ -129,7 +129,10 @@ void init_e2sarResultTypes(py::module_ &m) bind_result(m, "E2SARResultString"); bind_result(m, "E2SARResultEjfatURI"); bind_result(m, "E2SARResultSslCredentialsOptions"); - bind_result(m, "E2SARResultUInit32"); + bind_result(m, "E2SARResultUInt32"); bind_result>(m, "E2SARResultPairIP"); bind_result>(m, "E2SARResultPairString"); + bind_result>(m, "E2SARResultPairUInt64"); + bind_result(m, "E2SARResultReassemblerFlags"); + bind_result(m, "E2SARResultSegmenterFlags"); } diff --git a/src/pybind/py_e2sarCP.cpp b/src/pybind/py_e2sarCP.cpp index 82221793..5c802d6f 100644 --- a/src/pybind/py_e2sarCP.cpp +++ b/src/pybind/py_e2sarCP.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include "e2sarUtil.hpp" #include "e2sarCP.hpp" @@ -47,7 +48,16 @@ void init_e2sarCP(py::module_ &m) { // Expose the grpc classes py::class_(e2sarCP, "WorkerStatus") - .def(py::init<>()); + .def(py::init<>()) + .def("get_name", &WorkerStatus::name) + .def("get_fill_percent", &WorkerStatus::fillpercent) + .def("get_control_signal", &WorkerStatus::controlsignal) + .def("get_slots_assigned", &WorkerStatus::slotsassigned) + .def("get_last_updated", + [](const WorkerStatus &self) { + return google::protobuf::util::TimeUtil::ToString(self.lastupdated()); 
+ } + ); py::class_(e2sarCP, "LoadBalancerStatusReply") .def(py::init<>()); py::class_(e2sarCP, "OverviewReply") @@ -58,7 +68,11 @@ void init_e2sarCP(py::module_ &m) { .def("get_seconds", &google::protobuf::Timestamp::seconds) .def("get_nanos", &google::protobuf::Timestamp::nanos) .def("set_seconds", &google::protobuf::Timestamp::set_seconds) - .def("set_nanos", &google::protobuf::Timestamp::set_nanos); + .def("set_nanos", &google::protobuf::Timestamp::set_nanos) + // pretty print in Python: use TimeUtil::ToString to get the default string representation + .def("__str__", [](const google::protobuf::Timestamp& self) { + return google::protobuf::util::TimeUtil::ToString(self); + }); /** * Bindings for struct "LBWorkerStatus" @@ -78,12 +92,8 @@ void init_e2sarCP(py::module_ &m) { .def_readonly("slots_assigned", &LBWorkerStatus::slotsAssigned) .def_property_readonly("last_updated", [](const LBWorkerStatus &self) { - return convert_timestamp_to_python(self.lastUpdated); // Access the member directly + return google::protobuf::util::TimeUtil::ToString(self.lastUpdated); // Access the member directly } - // , - // [](LBWorkerStatus &self, py::object py_timestamp) { - // self.lastUpdated = convert_timestamp_to_cpp(py_timestamp); // Assign the converted value - // } ); /** @@ -101,32 +111,25 @@ void init_e2sarCP(py::module_ &m) { std::vector&, google::protobuf::Timestamp>(), py::arg("timestamp"), - py::arg("currentEpoch"), py::arg("currentPredictedEventNumber"), - py::arg("workers"), + py::arg("current_epoch"), + py::arg("current_predicted_event_number"), + py::arg("worker_status_list"), py::arg("sender_addresses"), - py::arg("expiresAt")) + py::arg("expires_at")) // Expose the struct members - .def_property("timestamp", + .def_property_readonly(/* a Python string */"timestamp", [](const LBStatus &self) { - return convert_timestamp_to_python(self.timestamp); // Access the member directly - }, - [](LBStatus &self, py::object py_timestamp) { - self.timestamp = 
convert_timestamp_to_cpp(py_timestamp); // Assign the converted value + return google::protobuf::util::TimeUtil::ToString(self.timestamp); }) .def_readonly("currentEpoch", &LBStatus::currentEpoch) .def_readonly("currentPredictedEventNumber", &LBStatus::currentPredictedEventNumber) - .def_readonly("workers", &LBStatus::workers) - .def_readonly("senderAddresses", &LBStatus::senderAddresses) - .def_property_readonly("expiresAt", + .def_readonly("workerStatusList", &LBStatus::workers) + .def_readonly("senderAddressList", &LBStatus::senderAddresses) + .def_property_readonly(/* a Python string */"expiresAt", [](const LBStatus &self) { - return convert_timestamp_to_python(self.expiresAt); // Access the member directly - } - // , - // [](LBStatus &self, py::object py_timestamp) { - // self.expiresAt= convert_timestamp_to_cpp(py_timestamp); // Assign the converted value - // } - ); + return google::protobuf::util::TimeUtil::ToString(self.expiresAt); // Access the member directly + }); /** * Bindings for struct "OverviewEntry" @@ -149,9 +152,10 @@ void init_e2sarCP(py::module_ &m) { // Constructor lb_manager.def( - py::init(), + py::init(), py::arg("cpuri"), py::arg("validate_server") = true, + py::arg("use_host_address") = false, py::arg("opts") = grpc::SslCredentialsOptions() ); @@ -277,7 +281,7 @@ void init_e2sarCP(py::module_ &m) { /** * Return type containing result> */ - lb_manager.def( + lb_manager.def( "get_lb_overview", [](LBManager& self){ auto result = self.overview(); @@ -290,5 +294,8 @@ void init_e2sarCP(py::module_ &m) { // Return an EjfatURI object. lb_manager.def("get_uri", &LBManager::get_URI, py::return_value_policy::reference); + // Return connect string. 
+ lb_manager.def("get_addr_string", &LBManager::get_AddrString); + /// NOTE: donot need to bind LBManager::makeSslOptionsFromFiles } diff --git a/src/pybind/py_e2sarDP.cpp b/src/pybind/py_e2sarDP.cpp index 3380feb6..ab330282 100644 --- a/src/pybind/py_e2sarDP.cpp +++ b/src/pybind/py_e2sarDP.cpp @@ -31,7 +31,7 @@ void print_type(const T& param) { namespace py = pybind11; using namespace e2sar; -// Has to have a wrapper because of the callback function. +// Must have a wrapper because of the callback function. result addToSendQueueWrapper(Segmenter& seg, uint8_t *event, size_t bytes, int64_t _eventNum, uint16_t _dataId, uint16_t entropy, std::function callback, @@ -77,7 +77,9 @@ void init_e2sarDP_segmenter(py::module_ &m) .def_readwrite("syncPeriodMs", &Segmenter::SegmenterFlags::syncPeriodMs) .def_readwrite("syncPeriods", &Segmenter::SegmenterFlags::syncPeriods) .def_readwrite("mtu", &Segmenter::SegmenterFlags::mtu) - .def_readwrite("numSendSockets", &Segmenter::SegmenterFlags::numSendSockets); + .def_readwrite("numSendSockets", &Segmenter::SegmenterFlags::numSendSockets) + .def_readwrite("sndSocketBufSize", &Segmenter::SegmenterFlags::sndSocketBufSize) + .def("getFromINI", &Segmenter::SegmenterFlags::getFromINI); // Constructor seg.def( @@ -161,29 +163,45 @@ void init_e2sarDP_reassembler(py::module_ &m) // Bind the ReassemblerFlags struct as a nested class of Reassembler py::class_(m, "ReassemblerFlags") .def(py::init<>()) // The default values will be the same in Python after binding. 
- .def_readwrite("dpV6", &Reassembler::ReassemblerFlags::dpV6) - .def_readwrite("cpV6", &Reassembler::ReassemblerFlags::cpV6) .def_readwrite("useCP", &Reassembler::ReassemblerFlags::useCP) + .def_readwrite("useHostAddress", &Reassembler::ReassemblerFlags::useHostAddress) .def_readwrite("period_ms", &Reassembler::ReassemblerFlags::period_ms) .def_readwrite("validateCert", &Reassembler::ReassemblerFlags::validateCert) .def_readwrite("Ki", &Reassembler::ReassemblerFlags::Ki) .def_readwrite("Kp", &Reassembler::ReassemblerFlags::Kp) .def_readwrite("Kd", &Reassembler::ReassemblerFlags::Kd) + .def_readwrite("weight", &Reassembler::ReassemblerFlags::weight) + .def_readwrite("min_factor", &Reassembler::ReassemblerFlags::min_factor) + .def_readwrite("max_factor", &Reassembler::ReassemblerFlags::max_factor) .def_readwrite("setPoint", &Reassembler::ReassemblerFlags::setPoint) .def_readwrite("epoch_ms", &Reassembler::ReassemblerFlags::epoch_ms) .def_readwrite("portRange", &Reassembler::ReassemblerFlags::portRange) .def_readwrite("withLBHeader", &Reassembler::ReassemblerFlags::withLBHeader) - .def_readwrite("eventTimeout_ms", &Reassembler::ReassemblerFlags::eventTimeout_ms); + .def_readwrite("eventTimeout_ms", &Reassembler::ReassemblerFlags::eventTimeout_ms) + .def_readwrite("rcvSocketBufSize", &Reassembler::ReassemblerFlags::rcvSocketBufSize) + .def("getFromINI", &Reassembler::ReassemblerFlags::getFromINI); // Constructor reas.def( - py::init(), + py::init(), "Init the Reassembler object with number of recv threads.", - py::arg("uri"), // must-have arg when init + py::arg("uri"), // must-have args when init + py::arg("data_ip"), + py::arg("starting_port"), py::arg("num_recv_threads") = (size_t)1, py::arg("rflags") = Reassembler::ReassemblerFlags()); - // Recv events part. return py::tuple. + // Constructor with CPU core list. 
+ reas.def( + py::init, const Reassembler::ReassemblerFlags &>(), + "Init the Reassembler object with a list of CPU cores.", + py::arg("uri"), // must-have args when init + py::arg("data_ip"), + py::arg("starting_port"), + py::arg("cpu_core_list"), + py::arg("rflags") = Reassembler::ReassemblerFlags()); + + // Recv events part. Return py::tuple. reas.def("getEvent", [](Reassembler& self, /* py::list is mutable */ py::list& recv_bytes ) -> py::tuple { @@ -253,10 +271,12 @@ void init_e2sarDP_reassembler(py::module_ &m) // Return type of result reas.def("OpenAndStart", &Reassembler::openAndStart); - /// TODO: to be test reas.def("registerWorker", &Reassembler::registerWorker); reas.def("deregisterWorker", &Reassembler::deregisterWorker); + // Return type of resultresult> + reas.def("get_LostEvent", &Reassembler::get_LostEvent); + // Return type of boost::tuple<>: convert to std::tuple reas.def("getStats", [](const Reassembler& reasObj) { auto stats = reasObj.getStats(); diff --git a/test/boost_test.cpp b/test/boost_test.cpp index 26d28d04..3bfe7614 100644 --- a/test/boost_test.cpp +++ b/test/boost_test.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include @@ -312,5 +314,39 @@ int main() std::cout << map[std::make_pair(0x123456, 1)] << std::endl; std::cout << map[std::make_pair(0x123456, 2)] << std::endl; std::cout << map[std::make_pair(0x1234567, 10)] << std::endl; + + + std::cout << "Test circular buffer" << std::endl; + + boost::circular_buffer pidSampleBuffer(5); + + for (int i = 0; i<6; i++) + pidSampleBuffer.push_back(i); + + std::cout << "Head of buffer " << pidSampleBuffer.front() << std::endl; + std::cout << "Tail of buffer " << pidSampleBuffer.back() << std::endl; + + for (int i = 10; i<20; i++) + pidSampleBuffer.push_back(i); + + std::cout << "Head of buffer " << pidSampleBuffer.front() << std::endl; + std::cout << "Tail of buffer " << pidSampleBuffer.back() << std::endl; + + std::cout << "Test allocate deallocate" << std::endl; + + 
boost::lockfree::queue*> lostEventsQueue{20}; + + for(int i=0; i<5; i++) + { + lostEventsQueue.push(new std::pair(i, i*10)); + } + + std::pair *res; + while(lostEventsQueue.pop(res)) + { + auto ret = *res; + std::cout << "Retrieved " << ret.first << ":" << ret.second << std::endl; + delete res; + } } diff --git a/test/e2sar_reas_live_test.cpp b/test/e2sar_reas_live_test.cpp index b8c8ab4e..9513eabf 100644 --- a/test/e2sar_reas_live_test.cpp +++ b/test/e2sar_reas_live_test.cpp @@ -36,8 +36,10 @@ BOOST_AUTO_TEST_CASE(DPReasTest1) Reassembler::ReassemblerFlags rflags; rflags.validateCert = false; + ip::address loopback = ip::make_address("127.0.0.1"); + u_int16_t listen_port = 10000; // create a reassembler and start the threads - Reassembler reas(uri, 1, rflags); + Reassembler reas(uri, loopback, listen_port, 1, rflags); auto oas_r = reas.openAndStart(); @@ -62,6 +64,13 @@ BOOST_AUTO_TEST_CASE(DPReasTest1) // data error count BOOST_CHECK(recvStats.get<4>() == 0); + auto lostEvent = reas.get_LostEvent(); + if (lostEvent.has_error()) + std::cout << "NO EVENT LOSS " << std::endl; + else + std::cout << "LOST EVENT " << lostEvent.value().first << ":" << lostEvent.value().second << std::endl; + BOOST_CHECK(lostEvent.has_error() && lostEvent.error().code() == E2SARErrorc::NotFound); + // stop threads and exit } diff --git a/test/e2sar_reas_test.cpp b/test/e2sar_reas_test.cpp index 714bb4de..32587f2b 100644 --- a/test/e2sar_reas_test.cpp +++ b/test/e2sar_reas_test.cpp @@ -8,6 +8,9 @@ #include #include #include +#include +#include +#include #include "e2sar.hpp" @@ -25,7 +28,7 @@ BOOST_AUTO_TEST_CASE(DPReasTest1) std::cout << "DPReasTest1: Test segmentation and reassembly on local host with no control plane (no segmentation)" << std::endl; // create URI for segmenter - since we will turn off CP only the data part of the query is used - std::string segUriString{"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data=127.0.0.1"}; + std::string 
segUriString{"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data=127.0.0.1:10000"}; // create URI for reassembler - since we turn off CP, none of it is actually used std::string reasUriString{"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data=127.0.0.1"}; @@ -51,7 +54,9 @@ BOOST_AUTO_TEST_CASE(DPReasTest1) rflags.useCP = false; // turn off CP rflags.withLBHeader = true; // LB header will be attached since there is no LB - Reassembler reas(reasUri, 1, rflags); + ip::address loopback = ip::make_address("127.0.0.1"); + u_int16_t listen_port = 10000; + Reassembler reas(reasUri, loopback, listen_port, 1, rflags); std::cout << "This reassembler has " << reas.get_numRecvThreads() << " receive threads and is listening on ports " << reas.get_recvPorts().first << ":" << reas.get_recvPorts().second << " using portRange " << reas.get_portRange() << @@ -131,6 +136,13 @@ BOOST_AUTO_TEST_CASE(DPReasTest1) BOOST_CHECK(recvStats.get<4>() == 0); // no data errors BOOST_CHECK(recvStats.get<5>() == E2SARErrorc::NoError); // no error + auto lostEvent = reas.get_LostEvent(); + if (lostEvent.has_error()) + std::cout << "NO EVENT LOSS " << std::endl; + else + std::cout << "LOST EVENT " << lostEvent.value().first << ":" << lostEvent.value().second << std::endl; + BOOST_CHECK(lostEvent.has_error() && lostEvent.error().code() == E2SARErrorc::NotFound); + // stop threads and exit } catch (E2SARException &ee) { @@ -163,7 +175,7 @@ BOOST_AUTO_TEST_CASE(DPReasTest2) std::cout << "DPReasTest2: Test segmentation and reassembly on local host with no control plane (basic segmentation)" << std::endl; // create URI for segmenter - since we will turn off CP only the data part of the query is used - std::string segUriString{"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data=127.0.0.1"}; + std::string segUriString{"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data=127.0.0.1:10000"}; // create URI for reassembler - since we turn off 
CP, none of it is actually used std::string reasUriString{"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data=127.0.0.1"}; @@ -190,7 +202,9 @@ BOOST_AUTO_TEST_CASE(DPReasTest2) rflags.useCP = false; // turn off CP rflags.withLBHeader = true; // LB header will be attached since there is no LB - Reassembler reas(reasUri, 1, rflags); + ip::address loopback = ip::make_address("127.0.0.1"); + u_int16_t listen_port = 10000; + Reassembler reas(reasUri, loopback, listen_port, 1, rflags); std::cout << "This reassmebler has " << reas.get_numRecvThreads() << " receive threads and is listening on ports " << reas.get_recvPorts().first << ":" << reas.get_recvPorts().second << " using portRange " << reas.get_portRange() << @@ -269,6 +283,13 @@ BOOST_AUTO_TEST_CASE(DPReasTest2) BOOST_CHECK(recvStats.get<4>() == 0); // no data errors BOOST_CHECK(recvStats.get<5>() == E2SARErrorc::NoError); // no error + auto lostEvent = reas.get_LostEvent(); + if (lostEvent.has_error()) + std::cout << "NO EVENT LOSS " << std::endl; + else + std::cout << "LOST EVENT " << lostEvent.value().first << ":" << lostEvent.value().second << std::endl; + BOOST_CHECK(lostEvent.has_error() && lostEvent.error().code() == E2SARErrorc::NotFound); + // stop threads and exit } catch (E2SARException &ee) { @@ -306,9 +327,11 @@ BOOST_AUTO_TEST_CASE(DPReasTest3) // create reassembler with no control plane Reassembler::ReassemblerFlags rflags; + ip::address loopback = ip::make_address("127.0.0.1"); + u_int16_t listen_port = 19522; { // one thread - Reassembler reas(reasUri, 1, rflags); + Reassembler reas(reasUri, loopback, listen_port, 1, rflags); std::cout << "This reassmebler has " << reas.get_numRecvThreads() << " receive threads and is listening on ports " << reas.get_recvPorts().first << ":" << reas.get_recvPorts().second << " using portRange " << reas.get_portRange() << @@ -321,7 +344,7 @@ BOOST_AUTO_TEST_CASE(DPReasTest3) { // 4 threads - Reassembler reas(reasUri, 4, rflags); + Reassembler 
reas(reasUri, loopback, listen_port, 4, rflags); std::cout << "This reassmebler has " << reas.get_numRecvThreads() << " receive threads and is listening on ports " << reas.get_recvPorts().first << ":" << reas.get_recvPorts().second << " using portRange " << reas.get_portRange() << @@ -334,7 +357,7 @@ BOOST_AUTO_TEST_CASE(DPReasTest3) { // 7 threads - Reassembler reas(reasUri, 7, rflags); + Reassembler reas(reasUri, loopback, listen_port, 7, rflags); std::cout << "This reassmebler has " << reas.get_numRecvThreads() << " receive threads and is listening on ports " << reas.get_recvPorts().first << ":" << reas.get_recvPorts().second << " using portRange " << reas.get_portRange() << @@ -348,7 +371,7 @@ BOOST_AUTO_TEST_CASE(DPReasTest3) { // 4 threads with portRange override rflags.portRange = 10; - Reassembler reas(reasUri, 4, rflags); + Reassembler reas(reasUri, loopback, listen_port, 4, rflags); std::cout << "This reassmebler has " << reas.get_numRecvThreads() << " receive threads and is listening on ports " << reas.get_recvPorts().first << ":" << reas.get_recvPorts().second << " using portRange " << reas.get_portRange() << @@ -361,7 +384,7 @@ BOOST_AUTO_TEST_CASE(DPReasTest3) { // 4 threads with low portRange override rflags.portRange = 1; - Reassembler reas(reasUri, 4, rflags); + Reassembler reas(reasUri, loopback, listen_port, 4, rflags); std::cout << "This reassmebler has " << reas.get_numRecvThreads() << " receive threads and is listening on ports " << reas.get_recvPorts().first << ":" << reas.get_recvPorts().second << " using portRange " << reas.get_portRange() << @@ -399,7 +422,7 @@ BOOST_AUTO_TEST_CASE(DPReasTest4) std::cout << "DPReasTest4: Test segmentation and reassembly on local host with no control plane (with segmentation and multiple senders)" << std::endl; // create URIs for segmenters - since we will turn off CP only the data part of the query is used - std::string 
segUriString1{"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data=127.0.0.1"}; + std::string segUriString1{"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data=127.0.0.1:19522"}; std::string segUriString2{"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data=127.0.0.1:19523"}; std::string segUriString3{"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data=127.0.0.1:19524"}; std::string segUriString4{"ejfat://useless@192.168.100.1:9876/lb/1?sync=192.168.0.1:12345&data=127.0.0.1:19525"}; @@ -436,8 +459,10 @@ BOOST_AUTO_TEST_CASE(DPReasTest4) rflags.withLBHeader = true; // LB header will be attached since there is no LB rflags.portRange = 2; + ip::address loopback = ip::make_address("127.0.0.1"); + u_int16_t listen_port = 19522; // 1 thread for 4 ports - Reassembler reas(reasUri, 1, rflags); + Reassembler reas(reasUri, loopback, listen_port, 1, rflags); std::cout << "This reassmebler has " << reas.get_numRecvThreads() << " receive threads and is listening on ports " << reas.get_recvPorts().first << ":" << reas.get_recvPorts().second << " using portRange " << reas.get_portRange() << @@ -602,6 +627,13 @@ BOOST_AUTO_TEST_CASE(DPReasTest4) BOOST_CHECK(recvStats.get<4>() == 0); // no data errors BOOST_CHECK(recvStats.get<5>() == E2SARErrorc::NoError); // no error + auto lostEvent = reas.get_LostEvent(); + if (lostEvent.has_error()) + std::cout << "NO EVENT LOSS " << std::endl; + else + std::cout << "LOST EVENT " << lostEvent.value().first << ":" << lostEvent.value().second << std::endl; + BOOST_CHECK(lostEvent.has_error() && lostEvent.error().code() == E2SARErrorc::NotFound); + // stop threads and exit } catch (E2SARException &ee) { @@ -626,4 +658,38 @@ BOOST_AUTO_TEST_CASE(DPReasTest4) } } +BOOST_AUTO_TEST_CASE(DPReasTest5) +{ + // test reading SegmenterFlags from INI files + // generate a file, read it in and compare expected values + boost::property_tree::ptree paramTree; + Reassembler::ReassemblerFlags 
rFlags; + std::string iniFileName = "/tmp/reassembler.ini"; + + // fill in the parameters + paramTree.put("general.useCP", false); + paramTree.put("control-plane.useHostAddress", true); + paramTree.put("data-plane.rcvSocketBufSize", 10000); + + try { + boost::property_tree::ini_parser::write_ini(iniFileName, paramTree); + } catch(boost::property_tree::ini_parser_error &ie) { + std::cout << "Unable to parse the segmenter flags configuration file "s + iniFileName << std::endl; + BOOST_CHECK(false); + } + + Reassembler::ReassemblerFlags segDefaults; + Reassembler::ReassemblerFlags readFlags; + auto res = Reassembler::ReassemblerFlags::getFromINI(iniFileName); + BOOST_CHECK(!res.has_error()); + readFlags = res.value(); + + BOOST_CHECK(readFlags.useCP == paramTree.get("general.useCP")); + BOOST_CHECK(readFlags.useHostAddress == paramTree.get("control-plane.useHostAddress")); + BOOST_CHECK(readFlags.validateCert == segDefaults.validateCert); + BOOST_CHECK(readFlags.rcvSocketBufSize == paramTree.get("data-plane.rcvSocketBufSize")); + + std::remove(iniFileName.c_str()); +} + BOOST_AUTO_TEST_SUITE_END() \ No newline at end of file diff --git a/test/e2sar_seg_test.cpp b/test/e2sar_seg_test.cpp index 3c1033b5..a5184684 100644 --- a/test/e2sar_seg_test.cpp +++ b/test/e2sar_seg_test.cpp @@ -1,6 +1,9 @@ #define BOOST_TEST_MODULE DPSegTests #include #include +#include +#include +#include #include #include #include @@ -8,6 +11,9 @@ #include #include #include +#include +#include +#include #include "e2sar.hpp" @@ -332,4 +338,37 @@ BOOST_AUTO_TEST_CASE(DPSegTest4) // stop threads and exit } +BOOST_AUTO_TEST_CASE(DPSegTest5) +{ + // test reading SegmenterFlags from INI files + // generate a file, read it in and compare expected values + boost::property_tree::ptree paramTree; + Segmenter::SegmenterFlags sFlags; + std::string iniFileName = "/tmp/segmenter.ini"; + + // fill in the parameters + paramTree.put("general.useCP", false); + paramTree.put("data-plane.zeroCopy", true); + 
paramTree.put("data-plane.sndSocketBufSize", 10000); + + try { + boost::property_tree::ini_parser::write_ini(iniFileName, paramTree); + } catch(boost::property_tree::ini_parser_error &ie) { + std::cout << "Unable to parse the segmenter flags configuration file "s + iniFileName << std::endl; + BOOST_CHECK(false); + } + + Segmenter::SegmenterFlags segDefaults; + Segmenter::SegmenterFlags readFlags; + auto res = Segmenter::SegmenterFlags::getFromINI(iniFileName); + BOOST_CHECK(!res.has_error()); + readFlags = res.value(); + + BOOST_CHECK(readFlags.useCP == paramTree.get("general.useCP")); + BOOST_CHECK(readFlags.zeroCopy == paramTree.get("data-plane.zeroCopy")); + BOOST_CHECK(readFlags.dpV6 == segDefaults.dpV6); + BOOST_CHECK(readFlags.sndSocketBufSize == paramTree.get("data-plane.sndSocketBufSize")); + + std::remove(iniFileName.c_str()); +} BOOST_AUTO_TEST_SUITE_END() diff --git a/wiki b/wiki index 6d09ff0d..e34dc9a5 160000 --- a/wiki +++ b/wiki @@ -1 +1 @@ -Subproject commit 6d09ff0dceb94c58e0af6e71eb45758e888ff04d +Subproject commit e34dc9a5167c301d3ad3efc9d800a4b1b8693c08