diff --git a/_topic_maps/_topic_map.yml b/_topic_maps/_topic_map.yml index d329292425e2..07d10927cbc2 100644 --- a/_topic_maps/_topic_map.yml +++ b/_topic_maps/_topic_map.yml @@ -3279,30 +3279,16 @@ Topics: File: recommended-infrastructure-practices - Name: Recommended etcd practices File: recommended-etcd-practices +- Name: Telco core reference design + Dir: telco_core_ref_design_specs + Topics: + - Name: Telco core reference design specification + File: telco-core-rds - Name: Telco RAN DU reference design Dir: telco_ran_du_ref_design_specs Topics: - - Name: Telco RAN DU RDS + - Name: Telco RAN DU reference design specification File: telco-ran-du-rds -- Name: Reference design specifications - Dir: telco_ref_design_specs - Distros: openshift-origin,openshift-enterprise - Topics: - - Name: Telco reference design specifications - File: telco-ref-design-specs-overview - - Name: Telco core reference design specification - Dir: core - Topics: - - Name: Telco core reference design overview - File: telco-core-rds-overview - - Name: Telco core use model overview - File: telco-core-rds-use-cases - - Name: Core reference design components - File: telco-core-ref-design-components - - Name: Core reference design configuration CRs - File: telco-core-ref-crs - - Name: Telco core software specifications - File: telco-core-ref-software-artifacts - Name: Comparing cluster configurations Dir: cluster-compare Distros: openshift-origin,openshift-enterprise diff --git a/images/openshift-telco-core-rds-metallb-service-separation.png b/images/openshift-telco-core-rds-metallb-service-separation.png new file mode 100644 index 000000000000..cc7aa2bc38a8 Binary files /dev/null and b/images/openshift-telco-core-rds-metallb-service-separation.png differ diff --git a/images/openshift-telco-core-rds-networking.png b/images/openshift-telco-core-rds-networking.png new file mode 100644 index 000000000000..c01e38f34898 Binary files /dev/null and b/images/openshift-telco-core-rds-networking.png 
differ diff --git a/modules/telco-core-about-the-telco-core-cluster-use-model.adoc b/modules/telco-core-about-the-telco-core-cluster-use-model.adoc new file mode 100644 index 000000000000..aeaa5b37b19e --- /dev/null +++ b/modules/telco-core-about-the-telco-core-cluster-use-model.adoc @@ -0,0 +1,23 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-about-the-telco-core-cluster-use-model_{context}"] += About the telco core cluster use model + +The telco core cluster use model is designed for clusters that run on commodity hardware. +Telco core clusters support large scale telco applications including control plane functions such as signaling, aggregation, and session border controller (SBC); and centralized data plane functions such as 5G user plane functions (UPF). +Telco core cluster functions require scalability, complex networking support, resilient software-defined storage, and support performance requirements that are less stringent and constrained than far-edge RAN deployments. + +.Telco core RDS cluster service-based architecture and networking topology +image::openshift-5g-core-cluster-architecture-networking.png[5G core cluster showing a service-based architecture with overlaid networking topology] + +Networking requirements for telco core functions vary widely across a range of networking features and performance points. +IPv6 is a requirement and dual-stack is common. +Some functions need maximum throughput and transaction rate and require support for user-plane DPDK networking. +Other functions use more typical cloud-native patterns and can rely on OVN-Kubernetes, kernel networking, and load balancing. + +Telco core clusters are configured as standard with three control plane and two or more worker nodes configured with the stock (non-RT) kernel. 
+In support of workloads with varying networking and performance requirements, you can segment worker nodes by using `MachineConfigPool` custom resources (CR), for example, for non-user data plane or high-throughput use cases. +In support of required telco operational features, core clusters have a standard set of Day 2 OLM-managed Operators installed. diff --git a/modules/telco-core-additional-storage-solutions.adoc b/modules/telco-core-additional-storage-solutions.adoc new file mode 100644 index 000000000000..f1606c59f515 --- /dev/null +++ b/modules/telco-core-additional-storage-solutions.adoc @@ -0,0 +1,11 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-additional-storage-solutions_{context}"] += Additional storage solutions +You can use other storage solutions to provide persistent storage for telco core clusters. +The configuration and integration of these solutions is outside the scope of the reference design specification (RDS). + +Integration of the storage solution into the telco core cluster must include proper sizing and performance analysis to ensure the storage meets overall performance and resource usage requirements. diff --git a/modules/telco-core-agent-based-installer.adoc b/modules/telco-core-agent-based-installer.adoc new file mode 100644 index 000000000000..3ddf02fe6e33 --- /dev/null +++ b/modules/telco-core-agent-based-installer.adoc @@ -0,0 +1,33 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-agent-based-installer_{context}"] += Agent-based Installer + +New in this release:: +* No reference design updates in this release + +Description:: ++ +-- +Telco core clusters can be installed by using the Agent-based Installer. 
+This method allows you to install OpenShift on bare-metal servers without requiring additional servers or VMs for managing the installation. +The Agent-based Installer can be run on any system (for example, from a laptop) to generate an ISO installation image. +The ISO is used as the installation media for the cluster supervisor nodes. +Installation progress can be monitored using the ABI tool from any system with network connectivity to the supervisor node's API interfaces. + +ABI supports the following: + +* Installation from declarative CRs +* Installation in disconnected environments +* Installation with no additional supporting install or bastion servers required to complete the installation +-- + +Limits and requirements:: +* Disconnected installation requires a registry that is reachable from the installed host, with all required content mirrored in that registry. + +Engineering considerations:: +* Networking configuration should be applied as NMState configuration during installation. +Day 2 networking configuration using the NMState Operator is not supported. diff --git a/modules/telco-core-application-workloads.adoc b/modules/telco-core-application-workloads.adoc new file mode 100644 index 000000000000..143e861f1f48 --- /dev/null +++ b/modules/telco-core-application-workloads.adoc @@ -0,0 +1,37 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-application-workloads_{context}"] += Application workloads + +Application workloads running on telco core clusters can include a mix of high performance cloud-native network functions (CNFs) and traditional best-effort or burstable pod workloads. + +Guaranteed QoS scheduling is available to pods that require exclusive or dedicated use of CPUs due to performance or security requirements. 
+Typically, pods that run high performance or latency sensitive CNFs by using user plane networking (for example, DPDK) require exclusive use of dedicated whole CPUs achieved through node tuning and guaranteed QoS scheduling. +When creating pod configurations that require exclusive CPUs, be aware of the potential implications of hyper-threaded systems. +Pods should request multiples of 2 CPUs when the entire core (2 hyper-threads) must be allocated to the pod. + +Pods running network functions that do not require high throughput or low latency networking should be scheduled with best-effort or burstable QoS pods and do not require dedicated or isolated CPU cores. + +Engineering considerations:: ++ +-- +Use the following information to plan telco core workloads and cluster resources: + +* CNF applications should conform to the latest version of https://redhat-best-practices-for-k8s.github.io/guide/[Red Hat Best Practices for Kubernetes]. +* Use a mix of best-effort and burstable QoS pods as required by your applications. +** Use guaranteed QoS pods with proper configuration of reserved or isolated CPUs in the `PerformanceProfile` CR that configures the node. +** Guaranteed QoS Pods must include annotations for fully isolating CPUs. +** Best effort and burstable pods are not guaranteed exclusive CPU use. +Workloads can be preempted by other workloads, operating system daemons, or kernel tasks. +* Use exec probes sparingly and only when no other suitable option is available. +** Do not use exec probes if a CNF uses CPU pinning. +Use other probe implementations, for example, `httpGet` or `tcpSocket`. +** When you need to use exec probes, limit the exec probe frequency and quantity. +The maximum number of exec probes must be kept below 10, and the frequency must not be set to less than 10 seconds. +** You can use startup probes, because they do not use significant resources at steady-state operation. 
+This limitation on exec probes applies primarily to liveness and readiness probes. +Exec probes cause much higher CPU usage on management cores compared to other probe types because they require process forking. +-- diff --git a/modules/telco-core-cluster-common-use-model-engineering-considerations.adoc b/modules/telco-core-cluster-common-use-model-engineering-considerations.adoc new file mode 100644 index 000000000000..28a4fe93d8c4 --- /dev/null +++ b/modules/telco-core-cluster-common-use-model-engineering-considerations.adoc @@ -0,0 +1,48 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-cluster-common-use-model-engineering-considerations_{context}"] += Telco core cluster common use model engineering considerations + +* Cluster workloads are detailed in "Application workloads". +* Worker nodes should run on either of the following CPUs: +** Intel 3rd Generation Xeon (IceLake) CPUs or better when supported by {product-title}, or CPUs with the silicon security bug (Spectre and similar) mitigations turned off. +Skylake and older CPUs can experience 40% transaction performance drops when Spectre and similar mitigations are enabled. +** AMD EPYC Zen 4 CPUs (Genoa, Bergamo, or newer) or better when supported by {product-title}. ++ +[NOTE] +==== +Currently, per-pod power management is not available for AMD CPUs. +==== +** IRQ balancing is enabled on worker nodes. +The `PerformanceProfile` CR sets `globallyDisableIrqLoadBalancing` to false. +Guaranteed QoS pods are annotated to ensure isolation as described in "CPU partitioning and performance tuning". 
+ +* All cluster nodes should have the following features: +** Have Hyper-Threading enabled +** Have x86_64 CPU architecture +** Have the stock (non-realtime) kernel enabled +** Are not configured for workload partitioning + +* The balance between power management and maximum performance varies between machine config pools in the cluster. +The following configurations should be consistent for all nodes in a machine config pools group. +** Cluster scaling. +See "Scalability" for more information. +** Clusters should be able to scale to at least 120 nodes. + +* CPU partitioning is configured using a `PerformanceProfile` CR and is applied to nodes on a per `MachineConfigPool` basis. +See "CPU partitioning and performance tuning" for additional considerations. +* CPU requirements for {product-title} depend on the configured feature set and application workload characteristics. +For a cluster configured according to the reference configuration running a simulated workload of 3000 pods as created by the kube-burner node-density test, the following CPU requirements are validated: +** The minimum number of reserved CPUs for control plane and worker nodes is 2 CPUs (4 hyper-threads) per NUMA node. +** The NICs used for non-DPDK network traffic should be configured to use at least 16 RX/TX queues. +** Nodes with large numbers of pods or other resources might require additional reserved CPUs. +The remaining CPUs are available for user workloads. + ++ +[NOTE] +==== +Variations in {product-title} configuration, workload size, and workload characteristics require additional analysis to determine the effect on the number of required CPUs for the OpenShift platform. 
+==== diff --git a/modules/telco-core-cluster-network-operator.adoc b/modules/telco-core-cluster-network-operator.adoc index 4e3f1dd9aa33..e03ebc26a5d4 100644 --- a/modules/telco-core-cluster-network-operator.adoc +++ b/modules/telco-core-cluster-network-operator.adoc @@ -1,6 +1,6 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="telco-core-cluster-network-operator_{context}"] @@ -10,27 +10,35 @@ New in this release:: * No reference design updates in this release Description:: -The Cluster Network Operator (CNO) deploys and manages the cluster network components including the default OVN-Kubernetes network plugin during {product-title} cluster installation. It allows configuring primary interface MTU settings, OVN gateway modes to use node routing tables for pod egress, and additional secondary networks such as MACVLAN. ++ +-- +The Cluster Network Operator (CNO) deploys and manages the cluster network components including the default OVN-Kubernetes network plugin during cluster installation. +The CNO allows for configuring primary interface MTU settings, OVN gateway modes to use node routing tables for pod egress, and additional secondary networks such as MACVLAN. + +In support of network traffic separation, multiple network interfaces are configured through the CNO. +Traffic steering to these interfaces is configured through static routes applied by using the NMState Operator. +To ensure that pod traffic is properly routed, OVN-K is configured with the `routingViaHost` option enabled. +This setting uses the kernel routing table and the applied static routes rather than OVN for pod egress traffic. 
+ +The Whereabouts CNI plugin is used to provide dynamic IPv4 and IPv6 addressing for additional pod network interfaces without the use of a DHCP server. +-- Limits and requirements:: * OVN-Kubernetes is required for IPv6 support. - * Large MTU cluster support requires connected network equipment to be set to the same or larger value. - +MTU size up to 8900 is supported. +//https://issues.redhat.com/browse/CNF-10593 * MACVLAN and IPVLAN cannot co-locate on the same main interface due to their reliance on the same underlying kernel mechanism, specifically the `rx_handler`. This handler allows a third-party module to process incoming packets before the host processes them, and only one such handler can be registered per network interface. Since both MACVLAN and IPVLAN need to register their own `rx_handler` to function, they conflict and cannot coexist on the same interface. -See link:https://elixir.bootlin.com/linux/v6.10.2/source/drivers/net/ipvlan/ipvlan_main.c#L82[ipvlan/ipvlan_main.c#L82] and link:https://elixir.bootlin.com/linux/v6.10.2/source/drivers/net/macvlan.c#L1260[net/macvlan.c#L1260] for details. - -* Alternative NIC configurations include splitting the shared NIC into multiple NICs or using a single dual-port NIC. -+ -[IMPORTANT] -==== -Splitting the shared NIC into multiple NICs or using a single dual-port NIC has not been validated with the telco core reference design. -==== - -* Single-stack IP cluster not validated. - +Review the source code for more details: +** https://elixir.bootlin.com/linux/v6.10.2/source/drivers/net/ipvlan/ipvlan_main.c#L82[linux/v6.10.2/source/drivers/net/ipvlan/ipvlan_main.c#L82] +** https://elixir.bootlin.com/linux/v6.10.2/source/drivers/net/macvlan.c#L1260[linux/v6.10.2/source/drivers/net/macvlan.c#L1260] +* Alternative NIC configurations include splitting the shared NIC into multiple NICs or using a single dual-port NIC, though they have not been tested and validated. 
+* Clusters with single-stack IP configuration are not validated. +* The `reachabilityTotalTimeoutSeconds` parameter in the `Network` CR configures the `EgressIP` node reachability check total timeout in seconds. +The recommended value is `1` second. Engineering considerations:: -* Pod egress traffic is handled by kernel routing table with the `routingViaHost` option. Appropriate static routes must be configured in the host. +* Pod egress traffic is handled by kernel routing table using the `routingViaHost` option. +Appropriate static routes must be configured in the host. diff --git a/modules/telco-core-common-baseline-model.adoc b/modules/telco-core-common-baseline-model.adoc new file mode 100644 index 000000000000..d5cc07d39a07 --- /dev/null +++ b/modules/telco-core-common-baseline-model.adoc @@ -0,0 +1,47 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-common-baseline-model_{context}"] += Telco core common baseline model + +The following configurations and use models are applicable to all telco core use cases. +The telco core use cases build on this common baseline of features. + +Cluster topology:: +Telco core clusters conform to the following requirements: + +* High availability control plane (three or more control plane nodes) +* Non-schedulable control plane nodes +* Multiple machine config pools + +Storage:: +Telco core use cases require persistent storage as provided by {rh-storage-first}. + +Networking:: +Telco core cluster networking conforms to the following requirements: + +* Dual stack IPv4/IPv6 (IPv4 primary). +* Fully disconnected – clusters do not have access to public networking at any point in their lifecycle. +* Supports multiple networks. +Segmented networking provides isolation between operations, administration and maintenance (OAM), signaling, and storage traffic. 
+* Cluster network type is OVN-Kubernetes as required for IPv6 support. +* Telco core clusters have multiple layers of networking supported by underlying RHCOS, SR-IOV Network Operator, Load Balancer and other components. +These layers include the following: +** Cluster networking layer. +The cluster network configuration is defined and applied through the installation configuration. +Update the configuration during Day 2 operations with the NMState Operator. +Use the initial configuration to establish the following: +*** Host interface configuration. +*** Active/active bonding (LACP). +** Secondary/additional network layer. +Configure the {product-title} CNI through network `additionalNetwork` or `NetworkAttachmentDefinition` CRs. +Use the initial configuration to configure MACVLAN virtual network interfaces. +** Application workload layer. +User plane networking runs in cloud-native network functions (CNFs). + +Service Mesh:: +Telco CNFs can use Service Mesh. +All telco core clusters require a Service Mesh implementation. +The choice of implementation and configuration is outside the scope of this specification. diff --git a/modules/telco-core-cpu-partitioning-and-performance-tuning.adoc b/modules/telco-core-cpu-partitioning-and-performance-tuning.adoc new file mode 100644 index 000000000000..7ec4980a357a --- /dev/null +++ b/modules/telco-core-cpu-partitioning-and-performance-tuning.adoc @@ -0,0 +1,57 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-cpu-partitioning-and-performance-tuning_{context}"] += CPU partitioning and performance tuning + +New in this release:: +* No reference design updates in this release + +Description:: +CPU partitioning improves performance and reduces latency by separating sensitive workloads from general-purpose tasks, interrupts, and driver work queues. 
+The CPUs allocated to these general-purpose tasks, interrupts, and driver work queues are referred to as _reserved_ in the following sections.
+When you apply the `workloadHint` setting, any isolated or burstable pods that do not have the `cpu-quota.crio.io: "disable"` annotation and a proper `runtimeClassName` value, are subject to CRI-O rate limiting. +When you set the `workloadHint` parameter, be aware of the tradeoff between increased performance and the potential impact of CRI-O rate limiting. +Ensure that required pods are correctly annotated. +* Hardware without IRQ affinity support affects isolated CPUs. +All server hardware must support IRQ affinity to ensure that pods with guaranteed CPU QoS can fully use allocated CPUs. +* OVS dynamically manages its `cpuset` entry to adapt to network traffic needs. +You do not need to reserve an additional CPU for handling high network throughput on the primary CNI. +* If workloads running on the cluster use kernel level networking, the RX/TX queue count for the participating NICs should be set to 16 or 32 queues if the hardware permits it. +Be aware of the default queue count. +With no configuration, the default queue count is one RX/TX queue per online CPU; which can result in too many interrupts being allocated. ++ +[NOTE] +==== +Some drivers do not deallocate the interrupts even after reducing the queue count. +==== + +* If workloads running on the cluster require cgroup v1, you can configure nodes to use cgroup v1 as part of the initial cluster deployment. +See "Enabling Linux control group version 1 (cgroup v1)" and link:https://www.redhat.com/en/blog/rhel-9-changes-context-red-hat-openshift-workloads[Red Hat Enterprise Linux 9 changes in the context of Red Hat OpenShift workloads]. ++ +[NOTE] +==== +Support for cgroup v1 is planned for removal in {product-title} 4.19. +Clusters running cgroup v1 must transition to cgroup v2. 
+==== diff --git a/modules/telco-core-crs-cluster-infrastructure.adoc b/modules/telco-core-crs-cluster-infrastructure.adoc new file mode 100644 index 000000000000..fa46cd75cc73 --- /dev/null +++ b/modules/telco-core-crs-cluster-infrastructure.adoc @@ -0,0 +1,25 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="cluster-infrastructure-crs_{context}"] += Cluster infrastructure reference CRs + +.Cluster infrastructure CRs +[cols="4*", options="header", format=csv] +|==== +Component,Reference CR,Description,Optional +Cluster logging,`ClusterLogForwarder.yaml`,Configures a log forwarding instance with the specified service account and verifies that the configuration is valid.,Yes +Cluster logging,`ClusterLogNS.yaml`,Configures the cluster logging namespace.,Yes +Cluster logging,`ClusterLogOperGroup.yaml`,"Creates the Operator group in the openshift-logging namespace, allowing the Cluster Logging Operator to watch and manage resources.",Yes +Cluster logging,`ClusterLogServiceAccount.yaml`,Configures the cluster logging service account.,Yes +Cluster logging,`ClusterLogServiceAccountAuditBinding.yaml`,Grants the collect-audit-logs cluster role to the logs collector service account.,Yes +Cluster logging,`ClusterLogServiceAccountInfrastructureBinding.yaml`,Allows the collector service account to collect logs from infrastructure resources.,Yes +Cluster logging,`ClusterLogSubscription.yaml`,Creates a subscription resource for the Cluster Logging Operator with manual approval for install plans.,Yes +Disconnected configuration,`catalog-source.yaml`,Defines a disconnected Red Hat Operators catalog.,No +Disconnected configuration,`icsp.yaml`,Defines a list of mirrored repository digests for the disconnected registry.,No +Disconnected configuration,`operator-hub.yaml`,Defines an OperatorHub configuration which disables all default sources.,No 
+Monitoring and observability,`monitoring-config-cm.yaml`,Configuring storage and retention for Prometheus and Alertmanager.,Yes +Power management,`PerformanceProfile.yaml`,"Defines a performance profile resource, specifying CPU isolation, hugepages configuration, and workload hints for performance optimization on selected nodes.",No +|==== diff --git a/modules/telco-core-crs-networking.adoc b/modules/telco-core-crs-networking.adoc index 09f9e6535a57..c6bc9e3e5d4e 100644 --- a/modules/telco-core-crs-networking.adoc +++ b/modules/telco-core-crs-networking.adoc @@ -1,6 +1,6 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-crs.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="networking-crs_{context}"] @@ -9,27 +9,27 @@ .Networking CRs [cols="4*", options="header", format=csv] |==== -Component,Reference CR,Optional,New in this release -Baseline,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-network-yaml[Network.yaml],Yes,No -Baseline,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-networkattachmentdefinition-yaml[networkAttachmentDefinition.yaml],Yes,No -Load balancer,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-addr-pool-yaml[addr-pool.yaml],No,No -Load balancer,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-bfd-profile-yaml[bfd-profile.yaml],No,No -Load balancer,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-bgp-advr-yaml[bgp-advr.yaml],No,No -Load balancer,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-bgp-peer-yaml[bgp-peer.yaml],No,No -Load balancer,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-community-yaml[community.yaml],No,No -Load 
balancer,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-metallb-yaml[metallb.yaml],No,No -Load balancer,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-metallbns-yaml[metallbNS.yaml],No,No -Load balancer,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-metallbopergroup-yaml[metallbOperGroup.yaml],No,No -Load balancer,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-metallbsubscription-yaml[metallbSubscription.yaml],No,No -Multus - Tap CNI for rootless DPDK pods,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-mc_rootless_pods_selinux-yaml[mc_rootless_pods_selinux.yaml],No,No -NMState Operator,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-nmstate-yaml[NMState.yaml],No,No -NMState Operator,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-nmstatens-yaml[NMStateNS.yaml],No,No -NMState Operator,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-nmstateopergroup-yaml[NMStateOperGroup.yaml],No,No -NMState Operator,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-nmstatesubscription-yaml[NMStateSubscription.yaml],No,No -SR-IOV Network Operator,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-sriovnetwork-yaml[sriovNetwork.yaml],No,No -SR-IOV Network Operator,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-sriovnetworknodepolicy-yaml[sriovNetworkNodePolicy.yaml],No,No -SR-IOV Network Operator,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-sriovoperatorconfig-yaml[SriovOperatorConfig.yaml],No,No -SR-IOV Network Operator,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-sriovsubscription-yaml[SriovSubscription.yaml],No,No -SR-IOV Network 
Operator,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-sriovsubscriptionns-yaml[SriovSubscriptionNS.yaml],No,No -SR-IOV Network Operator,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-sriovsubscriptionopergroup-yaml[SriovSubscriptionOperGroup.yaml],No,No +Component,Reference CR,Description,Optional +Baseline,`Network.yaml`,"Configures the default cluster network, specifying OVN Kubernetes settings like routing via the host. It also allows the definition of additional networks, including custom CNI configurations, and enables the use of MultiNetworkPolicy CRs for network policies across multiple networks.",No +Baseline,`networkAttachmentDefinition.yaml`,Optional. Defines a NetworkAttachmentDefinition resource specifying network configuration details such as node selector and CNI configuration.,Yes +Load Balancer,`addr-pool.yaml`,Configures MetalLB to manage a pool of IP addresses with auto-assign enabled for dynamic allocation of IPs from the specified range.,No +Load Balancer,`bfd-profile.yaml`,"Configures bidirectional forwarding detection (BFD) with customized intervals, detection multiplier, and modes for quicker network fault detection and load balancing failover.",No +Load Balancer,`bgp-advr.yaml`,"Defines a BGP advertisement resource for MetalLB, specifying how an IP address pool is advertised to BGP peers. This enables fine-grained control over traffic routing and announcements.",No +Load Balancer,`bgp-peer.yaml`,"Defines a BGP peer in MetalLB, representing a BGP neighbor for dynamic routing.",No +Load Balancer,`community.yaml`,"Defines a MetalLB community, which groups one or more BGP communities under a named resource. 
Communities can be applied to BGP advertisements to control routing policies and change traffic routing.",No +Load Balancer,`metallb.yaml`,Defines the MetalLB resource in the cluster.,No +Load Balancer,`metallbNS.yaml`,Defines the metallb-system namespace in the cluster.,No +Load Balancer,`metallbOperGroup.yaml`,Defines the Operator group for the MetalLB Operator.,No +Load Balancer,`metallbSubscription.yaml`,Creates a subscription resource for the metallb Operator with manual approval for install plans.,No +Multus - Tap CNI for rootless DPDK pods,`mc_rootless_pods_selinux.yaml`,Configures a MachineConfig resource which sets an SELinux boolean for the tap CNI plugin on worker nodes.,Yes +NMState Operator,`NMState.yaml`,Defines an NMState resource that is used by the NMState Operator to manage node network configurations.,No +NMState Operator,`NMStateNS.yaml`,Creates the NMState Operator namespace.,No +NMState Operator,`NMStateOperGroup.yaml`,"Creates the Operator group in the openshift-nmstate namespace, allowing the NMState Operator to watch and manage resources.",No +NMState Operator,`NMStateSubscription.yaml`,"Creates a subscription for the NMState Operator, managed through OLM.",No +SR-IOV Network Operator,`sriovNetwork.yaml`,"Defines an SR-IOV network specifying network capabilities, IP address management (ipam), and the associated network namespace and resource.",No +SR-IOV Network Operator,`sriovNetworkNodePolicy.yaml`,"Configures network policies for SR-IOV devices on specific nodes, including customization of device selection, VF allocation (numVfs), node-specific settings (nodeSelector), and priorities.",No +SR-IOV Network Operator,`SriovOperatorConfig.yaml`,"Configures various settings for the SR-IOV Operator, including enabling the injector and Operator webhook, disabling pod draining, and defining the node selector for the configuration daemon.",No +SR-IOV Network Operator,`SriovSubscription.yaml`,"Creates a subscription for the SR-IOV Network Operator, 
managed through OLM.",No +SR-IOV Network Operator,`SriovSubscriptionNS.yaml`,Creates the SR-IOV Network Operator subscription namespace.,No +SR-IOV Network Operator,`SriovSubscriptionOperGroup.yaml`,"Creates the Operator group for the SR-IOV Network Operator, allowing it to watch and manage resources in the target namespace.",No |==== diff --git a/modules/telco-core-crs-node-configuration.adoc b/modules/telco-core-crs-node-configuration.adoc index 5a44755c0ade..c104a9df98f0 100644 --- a/modules/telco-core-crs-node-configuration.adoc +++ b/modules/telco-core-crs-node-configuration.adoc @@ -1,6 +1,6 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-crs.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="node-configuration-crs_{context}"] @@ -9,12 +9,12 @@ .Node configuration CRs [cols="4*", options="header", format=csv] |==== -Component,Reference CR,Optional,New in this release -Additional kernel modules,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-control-plane-load-kernel-modules-yaml[control-plane-load-kernel-modules.yaml],Yes,No -Additional kernel modules,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-sctp_module_mc-yaml[sctp_module_mc.yaml],Yes,No -Additional kernel modules,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-worker-load-kernel-modules-yaml[worker-load-kernel-modules.yaml],Yes,No -Container mount namespace hiding,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-mount_namespace_config_master-yaml[mount_namespace_config_master.yaml],No,Yes -Container mount namespace hiding,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-mount_namespace_config_worker-yaml[mount_namespace_config_worker.yaml],No,Yes -Kdump 
enable,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-kdump-master-yaml[kdump-master.yaml],No,Yes -Kdump enable,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-kdump-worker-yaml[kdump-worker.yaml],No,Yes +Component,Reference CR,Description,Optional +Additional kernel modules,`control-plane-load-kernel-modules.yaml`,Optional. Configures the kernel modules for control plane nodes.,No +Additional kernel modules,`sctp_module_mc.yaml`,Optional. Loads the SCTP kernel module in worker nodes.,No +Additional kernel modules,`worker-load-kernel-modules.yaml`,Optional. Configures kernel modules for worker nodes.,No +Container mount namespace hiding,`mount_namespace_config_master.yaml`,Configures a mount namespace for sharing container-specific mounts between kubelet and CRI-O on control plane nodes.,No +Container mount namespace hiding,`mount_namespace_config_worker.yaml`,Configures a mount namespace for sharing container-specific mounts between kubelet and CRI-O on worker nodes.,No +Kdump enable,`kdump-master.yaml`,Configures kdump crash reporting on master nodes.,No +Kdump enable,`kdump-worker.yaml`,Configures kdump crash reporting on worker nodes.,No |==== diff --git a/modules/telco-core-crs-resource-tuning.adoc b/modules/telco-core-crs-resource-tuning.adoc index c6aefdb9d3b7..0131759f5c50 100644 --- a/modules/telco-core-crs-resource-tuning.adoc +++ b/modules/telco-core-crs-resource-tuning.adoc @@ -1,6 +1,6 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-crs.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="resource-tuning-crs_{context}"] @@ -9,6 +9,6 @@ .Resource tuning CRs [cols="4*", options="header", format=csv] |==== -Component,Reference CR,Optional,New in this release -System reserved 
capacity,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-control-plane-system-reserved-yaml[control-plane-system-reserved.yaml],Yes,No +Component,Reference CR,Description,Optional +System reserved capacity,`control-plane-system-reserved.yaml`,"Optional. Configures kubelet, enabling auto-sizing reserved resources for the control plane node pool.",No |==== diff --git a/modules/telco-core-crs-scheduling.adoc b/modules/telco-core-crs-scheduling.adoc index d3ae65265f96..2793905bad0f 100644 --- a/modules/telco-core-crs-scheduling.adoc +++ b/modules/telco-core-crs-scheduling.adoc @@ -1,6 +1,6 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-crs.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="scheduling-crs_{context}"] @@ -9,11 +9,11 @@ .Scheduling CRs [cols="4*", options="header", format=csv] |==== -Component,Reference CR,Optional,New in this release -NUMA-aware scheduler,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-nrop-yaml[nrop.yaml],No,No -NUMA-aware scheduler,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-nropsubscription-yaml[NROPSubscription.yaml],No,No -NUMA-aware scheduler,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-nropsubscriptionns-yaml[NROPSubscriptionNS.yaml],No,No -NUMA-aware scheduler,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-nropsubscriptionopergroup-yaml[NROPSubscriptionOperGroup.yaml],No,No -NUMA-aware scheduler,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-sched-yaml[sched.yaml],No,No -NUMA-aware scheduler,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-scheduler-yaml[Scheduler.yaml],No,No +Component,Reference CR,Description,Optional +NUMA-aware scheduler,`nrop.yaml`,"Enables the NUMA Resources 
Operator, aligning workloads with specific NUMA node configurations. Required for clusters with multi-NUMA nodes.",No +NUMA-aware scheduler,`NROPSubscription.yaml`,"Creates a subscription for the NUMA Resources Operator, managed through OLM. Required for clusters with multi-NUMA nodes.",No +NUMA-aware scheduler,`NROPSubscriptionNS.yaml`,Creates the NUMA Resources Operator subscription namespace. Required for clusters with multi-NUMA nodes.,No +NUMA-aware scheduler,`NROPSubscriptionOperGroup.yaml`,"Creates the Operator group in the numaresources-operator namespace, allowing the NUMA Resources Operator to watch and manage resources. Required for clusters with multi-NUMA nodes.",No +NUMA-aware scheduler,`sched.yaml`,Configures a topology-aware scheduler in the cluster that can handle NUMA aware scheduling of pods across nodes.,No +NUMA-aware scheduler,`Scheduler.yaml`,Configures control plane nodes as non-schedulable for workloads.,No |==== diff --git a/modules/telco-core-crs-storage.adoc b/modules/telco-core-crs-storage.adoc index c8c6f5ef16da..7ae3f0531103 100644 --- a/modules/telco-core-crs-storage.adoc +++ b/modules/telco-core-crs-storage.adoc @@ -1,6 +1,6 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-crs.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="storage-crs_{context}"] @@ -9,10 +9,10 @@ .Storage CRs [cols="4*", options="header", format=csv] |==== -Component,Reference CR,Optional,New in this release -External ODF configuration,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-01-rook-ceph-external-cluster-details.secret-yaml[01-rook-ceph-external-cluster-details.secret.yaml],No,No -External ODF configuration,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-02-ocs-external-storagecluster-yaml[02-ocs-external-storagecluster.yaml],No,No -External ODF 
configuration,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-odfns-yaml[odfNS.yaml],No,No -External ODF configuration,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-odfopergroup-yaml[odfOperGroup.yaml],No,No -External ODF configuration,xref:../../telco_ref_design_specs/core/telco-core-ref-crs.adoc#telco-core-odfsubscription-yaml[odfSubscription.yaml],No,No +Component,Reference CR,Description,Optional +External ODF configuration,`01-rook-ceph-external-cluster-details.secret.yaml`,Defines a Secret resource containing base64-encoded configuration data for an external Ceph cluster in the openshift-storage namespace.,No +External ODF configuration,`02-ocs-external-storagecluster.yaml`,Defines an OpenShift Container Storage (OCS) storage resource which configures the cluster to use an external storage back end.,No +External ODF configuration,`odfNS.yaml`,Creates the monitored openshift-storage namespace for the OpenShift Data Foundation Operator.,No +External ODF configuration,`odfOperGroup.yaml`,"Creates the Operator group in the openshift-storage namespace, allowing the OpenShift Data Foundation Operator to watch and manage resources.",No +External ODF configuration,`odfSubscription.yaml`,"Creates the subscription for the OpenShift Data Foundation Operator in the openshift-storage namespace.",No |==== diff --git a/modules/telco-core-disconnected-environment.adoc b/modules/telco-core-disconnected-environment.adoc new file mode 100644 index 000000000000..6f4a7e3c81ca --- /dev/null +++ b/modules/telco-core-disconnected-environment.adoc @@ -0,0 +1,26 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-disconnected-environment_{context}"] += Disconnected environment + +New in this release:: +* No reference design updates in this release + +Description:: +Telco core clusters are
expected to be installed in networks without direct access to the internet. +All container images needed to install, configure, and operate the cluster must be available in a disconnected registry. +This includes {product-title} images, Day 2 OLM Operator images, and application workload images. +The use of a disconnected environment provides multiple benefits, including: + +* Security - limiting access to the cluster +* Curated content – the registry is populated based on curated and approved updates for clusters + +Limits and requirements:: +* A unique name is required for all custom `CatalogSource` resources. +Do not reuse the default catalog names. + +Engineering considerations:: +* A valid time source must be configured as part of cluster installation diff --git a/modules/telco-core-gitops-operator-and-ztp-plugins.adoc b/modules/telco-core-gitops-operator-and-ztp-plugins.adoc new file mode 100644 index 000000000000..2a4e7ff4d952 --- /dev/null +++ b/modules/telco-core-gitops-operator-and-ztp-plugins.adoc @@ -0,0 +1,57 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-gitops-operator-and-ztp-plugins_{context}"] += GitOps Operator and GitOps ZTP plugins + +New in this release:: +* No reference design updates in this release + +Description:: ++ +-- +The GitOps Operator provides a GitOps driven infrastructure for managing cluster deployment and configuration. +Cluster definitions and configuration are maintained in a Git repository. + +ZTP plugins provide support for generating `Installation` CRs from `SiteConfig` CRs and automatically wrapping configuration CRs in policies based on {rh-rhacm} `PolicyGenerator` CRs. + +The SiteConfig Operator provides improved support for generation of `Installation` CRs from `ClusterInstance` CRs. 
+ +[IMPORTANT] +==== +Where possible, use `ClusterInstance` CRs for cluster installation instead of the `SiteConfig` with {ztp} plugin method. +==== + +You should structure the Git repository according to release version, with all necessary artifacts (`SiteConfig`, `ClusterInstance`, `PolicyGenerator`, `PolicyGenTemplate`, and supporting reference CRs) included. +This enables deploying and managing multiple versions of the OpenShift platform and configuration versions to clusters simultaneously and through upgrades. + +The recommended Git structure keeps reference CRs in a directory separate from customer or partner-provided content. +This means that you can import reference updates by simply overwriting existing content. +Customer or partner-supplied CRs can be provided in a parallel directory to the reference CRs for easy inclusion in the generated configuration policies. +-- + +Limits and requirements:: +* Each ArgoCD application supports up to 300 nodes. +Multiple ArgoCD applications can be used to achieve the maximum number of clusters supported by a single hub cluster. +* The `SiteConfig` CR must use the `extraManifests.searchPaths` field to reference the reference manifests. ++ +[NOTE] +==== +Since {product-title} 4.15, the `spec.extraManifestPath` field is deprecated. +==== + +Engineering considerations:: +* Set the `MachineConfigPool` (`mcp`) CR `paused` field to true during a cluster upgrade maintenance window and set the `maxUnavailable` field to the maximum tolerable value. +This prevents multiple cluster node reboots during upgrade, which results in a shorter overall upgrade. +When you unpause the `mcp` CR, all the configuration changes are applied with a single reboot. ++ +[NOTE] +==== +During installation, custom `mcp` CRs can be paused along with setting `maxUnavailable` to 100% to improve installation times.
+==== + +* To avoid confusion or unintentional overwriting when updating content, you should use unique and distinguishable names for custom CRs in the `reference-crs/` directory under core-overlay and extra manifests in Git. +* The `SiteConfig` CR allows multiple extra-manifest paths. +When file names overlap in multiple directory paths, the last file found in the directory order list takes precedence. diff --git a/modules/telco-core-host-firmware-and-boot-loader-configuration.adoc b/modules/telco-core-host-firmware-and-boot-loader-configuration.adoc new file mode 100644 index 000000000000..57d262789d0c --- /dev/null +++ b/modules/telco-core-host-firmware-and-boot-loader-configuration.adoc @@ -0,0 +1,20 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-host-firmware-and-boot-loader-configuration_{context}"] += Host firmware and boot loader configuration + +New in this release:: +* No reference design updates in this release + +Engineering considerations:: +// https://issues.redhat.com/browse/CNF-11806 +* Enabling secure boot is the recommended configuration. ++ +[NOTE] +==== +When secure boot is enabled, only signed kernel modules are loaded by the kernel. +Out-of-tree drivers are not supported. 
+==== diff --git a/modules/telco-core-load-balancer.adoc b/modules/telco-core-load-balancer.adoc index dd25306b0999..50d50c0097f4 100644 --- a/modules/telco-core-load-balancer.adoc +++ b/modules/telco-core-load-balancer.adoc @@ -1,38 +1,40 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="telco-core-load-balancer_{context}"] = Load balancer New in this release:: -//CNF-11914 -* In {product-title} 4.17 or later, `frr-k8s` is now the default and fully supported Border Gateway Protocol (BGP) backend. -The deprecated `frr` BGP mode is still available. -You should upgrade clusters to use the `frr-k8s` backend. - -Description:: -MetalLB is a load-balancer implementation that uses standard routing protocols for bare-metal clusters. It enables a Kubernetes service to get an external IP address which is also added to the host network for the cluster. +// https://issues.redhat.com/browse/CNF-14150 +* FRR-K8s is now available under the Cluster Network Operator. + -[NOTE] +[IMPORTANT] ==== -Some use cases might require features not available in MetalLB, for example stateful load balancing. -Where necessary, use an external third party load balancer. -Selection and configuration of an external load balancer is outside the scope of this document. -When you use an external third party load balancer, ensure that it meets all performance and resource utilization requirements. +If you have custom `FRRConfiguration` CRs in the `metallb-system` namespace, you must move them under the `openshift-network-operator` namespace. ==== -Limits and requirements:: +Description:: +MetalLB is a load-balancer implementation for bare metal Kubernetes clusters that uses standard routing protocols. 
+It enables a Kubernetes service to get an external IP address which is also added to the host network for the cluster. +The MetalLB Operator deploys and manages the lifecycle of a MetalLB instance in a cluster. +Some use cases might require features not available in MetalLB, such as stateful load balancing. +Where necessary, you can use an external third party load balancer. +Selection and configuration of an external load balancer is outside the scope of this specification. +When an external third-party load balancer is used, the integration effort must include enough analysis to ensure all performance and resource utilization requirements are met. -* Stateful load balancing is not supported by MetalLB. An alternate load balancer implementation must be used if this is a requirement for workload CNFs. -* The networking infrastructure must ensure that the external IP address is routable from clients to the host network for the cluster. +Limits and requirements:: +* Stateful load balancing is not supported by MetalLB. +An alternate load balancer implementation must be used if this is a requirement for workload CNFs. +* You must ensure that the external IP address is routable from clients to the host network for the cluster. Engineering considerations:: -* MetalLB is used in BGP mode only for core use case models. -* For core use models, MetalLB is supported with only the OVN-Kubernetes network provider used in local gateway mode. See `routingViaHost` in the "Cluster Network Operator" section. -* BGP configuration in MetalLB varies depending on the requirements of the network and peers. -* Address pools can be configured as needed, allowing variation in addresses, aggregation length, auto assignment, and other relevant parameters. -* MetalLB uses BGP for announcing routes only. +* MetalLB is used in BGP mode only for telco core use models. +* For telco core use models, MetalLB is supported only with the OVN-Kubernetes network provider used in local gateway mode. 
+See `routingViaHost` in "Cluster Network Operator". +* BGP configuration in MetalLB is expected to vary depending on the requirements of the network and peers. +** You can configure address pools with variations in addresses, aggregation length, auto assignment, and so on. +** MetalLB uses BGP for announcing routes only. Only the `transmitInterval` and `minimumTtl` parameters are relevant in this mode. -Other parameters in the BFD profile should remain close to the default settings. Shorter values might lead to errors and impact performance. +Other parameters in the BFD profile should remain close to the defaults as shorter values can lead to false negatives and affect performance. diff --git a/modules/telco-core-logging.adoc b/modules/telco-core-logging.adoc index 445f802305ce..2f4a76306000 100644 --- a/modules/telco-core-logging.adoc +++ b/modules/telco-core-logging.adoc @@ -1,21 +1,22 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="telco-core-logging_{context}"] = Logging New in this release:: -* Cluster Logging Operator 6.0 is new in this release. -Update your existing implementation to adapt to the new version of the API. +* No reference design updates in this release Description:: -The Cluster Logging Operator enables collection and shipping of logs off the node for remote archival and analysis. The reference configuration ships audit and infrastructure logs to a remote archive by using Kafka. +The Cluster Logging Operator enables collection and shipping of logs off the node for remote archival and analysis. +The reference configuration uses Kafka to ship audit and infrastructure logs to a remote archive. 
Limits and requirements:: Not applicable Engineering considerations:: * The impact of cluster CPU use is based on the number or size of logs generated and the amount of log filtering configured. -* The reference configuration does not include shipping of application logs. Inclusion of application logs in the configuration requires evaluation of the application logging rate and sufficient additional CPU resources allocated to the reserved set. +* The reference configuration does not include shipping of application logs. +The inclusion of application logs in the configuration requires you to evaluate the application logging rate and have sufficient additional CPU resources allocated to the reserved set. diff --git a/modules/telco-core-monitoring.adoc b/modules/telco-core-monitoring.adoc index fcfd25a0488a..716965ef4660 100644 --- a/modules/telco-core-monitoring.adoc +++ b/modules/telco-core-monitoring.adoc @@ -1,6 +1,6 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="telco-core-monitoring_{context}"] @@ -10,16 +10,24 @@ New in this release:: * No reference design updates in this release Description:: -The {cmo-first} is included by default in {product-title} and provides monitoring (metrics, dashboards, and alerting) for the platform components and optionally user projects as well. + -[NOTE] -==== -The default handling of pod CPU and memory metrics is based on upstream Kubernetes `cAdvisor` and makes a tradeoff that prefers handling of stale data over metric accuracy. This leads to spiky data that will create false triggers of alerts over user-specified thresholds. {product-title} supports an opt-in dedicated service monitor feature creating an additional set of pod CPU and memory metrics that do not suffer from the spiky behavior. 
-For additional information, see link:https://access.redhat.com/solutions/7012719[Dedicated Service Monitors - Questions and Answers]. -==== +-- +The Cluster Monitoring Operator (CMO) is included by default in {product-title} and provides monitoring (metrics, dashboards, and alerting) for the platform components and optionally user projects. +You can customize the default log retention period, custom alert rules, and so on. +The default handling of pod CPU and memory metrics, based on upstream Kubernetes and cAdvisor, makes a tradeoff favoring stale data over metric accuracy. +This leads to spikes in reporting, which can create false alerts, depending on the user-specified thresholds. +{product-title} supports an opt-in Dedicated Service Monitor feature that creates an additional set of pod CPU and memory metrics that do not suffer from this behavior. +For more information, see link:https://access.redhat.com/solutions/7012719[Dedicated Service Monitors - Questions and Answers (Red Hat Knowledgebase)]. + +In addition to the default configuration, the following metrics are expected to be configured for telco core clusters: + +* Pod CPU and memory metrics and alerts for user workloads +-- Limits and requirements:: -* Monitoring configuration must enable the dedicated service monitor feature for accurate representation of pod metrics +* You must enable the Dedicated Service Monitor feature to represent pod metrics accurately. Engineering considerations:: -* You configure the Prometheus retention period. The value used is a tradeoff between operational requirements for maintaining historical data on the cluster against CPU and storage resources. Longer retention periods increase the need for storage and require additional CPU to manage the indexing of data. +* The Prometheus retention period is specified by the user. +The value used is a tradeoff between operational requirements for maintaining historical data on the cluster against CPU and storage resources. 
+Longer retention periods increase the need for storage and require additional CPU to manage data indexing. diff --git a/modules/telco-core-networking.adoc b/modules/telco-core-networking.adoc new file mode 100644 index 000000000000..f3a7224ef2a4 --- /dev/null +++ b/modules/telco-core-networking.adoc @@ -0,0 +1,61 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-networking_{context}"] += Networking + +The following diagram describes the telco core reference design networking configuration. + +.Telco core reference design networking configuration +image::openshift-telco-core-rds-networking.png[Overview of the telco core reference design networking configuration] + +New in this release:: ++ +-- +// https://issues.redhat.com/browse/CNF-12678 +* Support for disabling vendor plugins in the SR-IOV Operator + +// https://issues.redhat.com/browse/CNF-13768 +* link:https://access.redhat.com/articles/7090422[New knowledge base article on creating custom node firewall rules] + +// https://issues.redhat.com/browse/CNF-13981 +* Extended telco core RDS validation with MetalLB and EgressIP telco QE validation + +// https://issues.redhat.com/browse/CNF-14150 +* FRR-K8s is now available under the Cluster Network Operator. ++ +[NOTE] +==== +If you have custom `FRRConfiguration` CRs in the `metallb-system` namespace, you must move them under the `openshift-network-operator` namespace. +==== +-- + +Description:: ++ +-- +* The cluster is configured for dual-stack IP (IPv4 and IPv6). +* The validated physical network configuration consists of two dual-port NICs. +One NIC is shared among the primary CNI (OVN-Kubernetes) and IPVLAN and MACVLAN traffic, while the second one is dedicated to SR-IOV VF-based pod traffic. +* A Linux bonding interface (`bond0`) is created in active-active IEEE 802.3ad LACP mode with the two NIC ports attached. 
+The top-of-rack networking equipment must support and be configured for multi-chassis link aggregation (mLAG) technology. +* VLAN interfaces are created on top of `bond0`, including for the primary CNI. +* Bond and VLAN interfaces are created at cluster install time during the network configuration stage of the installation. +Except for the `vlan0` VLAN used by the primary CNI, all other VLANs can be created during Day 2 activities with the Kubernetes NMState Operator. +* MACVLAN and IPVLAN interfaces are created with their corresponding CNIs. +They do not share the same base interface. +For more information, see "Cluster Network Operator". +* SR-IOV VFs are managed by the SR-IOV Network Operator. +* To ensure consistent source IP addresses for pods behind a LoadBalancer Service, configure an `EgressIP` CR and specify the `podSelector` parameter. +* You can implement service traffic separation by doing the following: +.. Configure VLAN interfaces and specific kernel IP routes on the nodes using `NodeNetworkConfigurationPolicy` CRs. +.. Create a MetalLB `BGPPeer` CR for each VLAN to establish peering with the remote BGP router. +.. Define a MetalLB `BGPAdvertisement` CR to specify which IP address pools should be advertised to a selected list of `BGPPeer` resources. ++ +The following diagram illustrates how specific service IP addresses are advertised to the outside via specific VLAN interfaces. +Service routes are defined in `BGPAdvertisement` CRs and configured with values for `IPAddressPool1` and `BGPPeer1` fields.
+-- + +.Telco core reference design MetalLB service separation +image::openshift-telco-core-rds-metallb-service-separation.png[Telco core reference design MetalLB service separation] diff --git a/modules/telco-core-nmstate-operator.adoc b/modules/telco-core-nmstate-operator.adoc new file mode 100644 index 000000000000..b9dad0a1a284 --- /dev/null +++ b/modules/telco-core-nmstate-operator.adoc @@ -0,0 +1,23 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-nmstate-operator_{context}"] += NMState Operator + +New in this release:: +* No reference design updates in this release + +Description:: +The Kubernetes NMState Operator provides a Kubernetes API for performing state-driven network configuration across cluster nodes. +It enables network interface configurations, static IPs and DNS, VLANs, trunks, bonding, static routes, MTU, and promiscuous mode on the secondary interfaces. +The cluster nodes periodically report on the state of each node's network interfaces to the API server. + +Limits and requirements:: +Not applicable + +Engineering considerations:: +* Initial networking configuration is applied using `NMStateConfig` content in the installation CRs. +The NMState Operator is used only when required for network updates. +* When SR-IOV virtual functions are used for host networking, the NMState Operator (via `NodeNetworkConfigurationPolicy` CRs) is used to configure VF interfaces, such as VLANs and MTU.
diff --git a/modules/telco-core-node-configuration.adoc b/modules/telco-core-node-configuration.adoc index dc3a4ee31448..fa365f24337d 100644 --- a/modules/telco-core-node-configuration.adoc +++ b/modules/telco-core-node-configuration.adoc @@ -1,40 +1,42 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="telco-core-node-configuration_{context}"] -= Node configuration += Node configuration New in this release:: -//CNF-12344 -//CNF-12345 -* Container mount namespace encapsulation and kdump are now available in the {rds} RDS. - -Description:: -* Container mount namespace encapsulation creates a container mount namespace that reduces system mount scanning and is visible to kubelet and CRI-O. -* kdump is an optional configuration that is enabled by default that captures debug information when a kernel panic occurs. -The reference CRs which enable kdump include an increased memory reservation based on the set of drivers and kernel modules included in the reference configuration. +* No reference design updates in this release Limits and requirements:: -* Use of kdump and container mount namespace encapsulation is made available through additional kernel modules. -You should analyze these modules to determine impact on CPU load, system performance, and ability to meet required KPIs. +* Analyze additional kernel modules to determine impact on CPU load, system performance, and ability to meet KPIs. ++ +-- +.Additional kernel modules +|==== +|Feature|Description + +|Additional kernel modules +a|Install the following kernel modules by using `MachineConfig` CRs to provide extended kernel functionality to CNFs. -Engineering considerations:: -* Install the following kernel modules with `MachineConfig` CRs.
-These modules provide extended kernel functionality to cloud-native functions (CNFs). +* sctp +* ip_gre +* ip6_tables +* ip6t_REJECT +* ip6table_filter +* ip6table_mangle +* iptable_filter +* iptable_mangle +* iptable_nat +* xt_multiport +* xt_owner +* xt_REDIRECT +* xt_statistic +* xt_TCPMSS -** sctp -** ip_gre -** ip6_tables -** ip6t_REJECT -** ip6table_filter -** ip6table_mangle -** iptable_filter -** iptable_mangle -** iptable_nat -** xt_multiport -** xt_owner -** xt_REDIRECT -** xt_statistic -** xt_TCPMSS +|Container mount namespace hiding|Reduce the frequency of kubelet housekeeping and eviction monitoring to reduce CPU usage. +Creates a container mount namespace, visible to kubelet/CRI-O, to reduce system mount scanning overhead. +|Kdump enable|Optional configuration (enabled by default) +|==== +-- diff --git a/modules/telco-core-openshift-data-foundation.adoc b/modules/telco-core-openshift-data-foundation.adoc new file mode 100644 index 000000000000..3424c3612c95 --- /dev/null +++ b/modules/telco-core-openshift-data-foundation.adoc @@ -0,0 +1,22 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-openshift-data-foundation_{context}"] += Red Hat OpenShift Data Foundation + +New in this release:: +* No reference design updates in this release + +Description:: +{rh-storage-first} is a software-defined storage service for containers. +For telco core clusters, storage support is provided by {rh-storage} storage services running externally to the application workload cluster. +{rh-storage} supports separation of storage traffic using secondary CNI networks. + +Limits and requirements:: +* In an IPv4/IPv6 dual-stack networking environment, {rh-storage} uses IPv4 addressing. 
+For more information, see link:https://docs.redhat.com/en/documentation/red_hat_openshift_data_foundation/4.17/html/planning_your_deployment/network-requirements_rhodf#network-requirements_rhodf[Network requirements]. + +Engineering considerations:: +* {rh-storage} network traffic should be isolated from other traffic on a dedicated network, for example, by using VLAN isolation. diff --git a/modules/telco-core-power-management.adoc b/modules/telco-core-power-management.adoc index 8ba4c636b235..ca1a5f0bd456 100644 --- a/modules/telco-core-power-management.adoc +++ b/modules/telco-core-power-management.adoc @@ -1,6 +1,6 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="telco-core-power-management_{context}"] @@ -12,9 +12,12 @@ New in this release:: Description:: Use the Performance Profile to configure clusters with high power mode, low power mode, or mixed mode. The choice of power mode depends on the characteristics of the workloads running on the cluster, particularly how sensitive they are to latency. +Configure the maximum latency for a low-latency pod by using the per-pod power management C-states feature. Limits and requirements:: -* Power configuration relies on appropriate BIOS configuration, for example, enabling C-states and P-states. Configuration varies between hardware vendors. +* Power configuration relies on appropriate BIOS configuration, for example, enabling C-states and P-states. +Configuration varies between hardware vendors. Engineering considerations:: -* Latency: To ensure that latency-sensitive workloads meet their requirements, you will need either a high-power configuration or a per-pod power management configuration. Per-pod power management is only available for `Guaranteed` QoS Pods with dedicated pinned CPUs. 
+* Latency: To ensure that latency-sensitive workloads meet requirements, you require a high-power or a per-pod power management configuration. +Per-pod power management is only available for Guaranteed QoS pods with dedicated pinned CPUs. diff --git a/modules/telco-core-rds-product-version-use-model-overview.adoc b/modules/telco-core-rds-product-version-use-model-overview.adoc new file mode 100644 index 000000000000..b2b814bed932 --- /dev/null +++ b/modules/telco-core-rds-product-version-use-model-overview.adoc @@ -0,0 +1,11 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-rds-product-version-use-model-overview_{context}"] += Telco core RDS {product-version} use model overview + +The telco core reference design specification (RDS) describes a platform that supports large-scale telco applications, including control plane functions such as signaling and aggregation. +It also includes some centralized data plane functions, such as user plane functions (UPF). +These functions generally require scalability, complex networking support, resilient software-defined storage, and support performance requirements that are less stringent and constrained than far-edge deployments such as RAN. 
diff --git a/modules/telco-core-red-hat-advanced-cluster-management.adoc b/modules/telco-core-red-hat-advanced-cluster-management.adoc new file mode 100644 index 000000000000..e304a1a5de54 --- /dev/null +++ b/modules/telco-core-red-hat-advanced-cluster-management.adoc @@ -0,0 +1,33 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-red-hat-advanced-cluster-management_{context}"] += Red Hat Advanced Cluster Management + +New in this release:: +* No reference design updates in this release + +Description:: ++ +-- +{rh-rhacm-first} provides Multi Cluster Engine (MCE) installation and ongoing {ztp} lifecycle management for deployed clusters. +You manage cluster configuration and upgrades declaratively by applying `Policy` custom resources (CRs) to clusters during maintenance windows. + +You apply policies with the {rh-rhacm} policy controller as managed by {cgu-operator-full}. +Configuration, upgrades, and cluster status are managed through the policy controller. + +When installing managed clusters, {rh-rhacm} applies labels and initial ignition configuration to individual nodes in support of custom disk partitioning, allocation of roles, and allocation to machine config pools. +You define these configurations with `SiteConfig` or `ClusterInstance` CRs. +-- + +Limits and requirements:: + +* Hub cluster sizing is discussed in link:https://docs.redhat.com/en/documentation/red_hat_advanced_cluster_management_for_kubernetes/2.11/html-single/install/index#sizing-your-cluster[Sizing your cluster]. + +* {rh-rhacm} scaling limits are described in link:https://docs.redhat.com/en/documentation/red_hat_advanced_cluster_management_for_kubernetes/2.11/html-single/install/index#performance-and-scalability[Performance and Scalability]. 
+ +Engineering considerations:: +* When managing multiple clusters with unique content per installation, site, or deployment, using {rh-rhacm} hub templating is strongly recommended. +{rh-rhacm} hub templating allows you to apply a consistent set of policies to clusters while providing for unique values per installation. diff --git a/modules/telco-core-scalability.adoc b/modules/telco-core-scalability.adoc index 00a4ef9034bd..0d9bf84ea0a4 100644 --- a/modules/telco-core-scalability.adoc +++ b/modules/telco-core-scalability.adoc @@ -1,6 +1,6 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="telco-core-scalability_{context}"] @@ -9,5 +9,8 @@ New in this release:: * No reference design updates in this release +Description:: +Scaling of workloads is described in "Application workloads". + Limits and requirements:: -* Cluster should scale to at least 120 nodes. +* Cluster can scale to at least 120 nodes. diff --git a/modules/telco-core-scheduling.adoc b/modules/telco-core-scheduling.adoc index caf1e1cdec4c..fe8bfe7c05fd 100644 --- a/modules/telco-core-scheduling.adoc +++ b/modules/telco-core-scheduling.adoc @@ -1,6 +1,6 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="telco-core-scheduling_{context}"] @@ -10,17 +10,31 @@ New in this release:: * No reference design updates in this release Description:: -* The scheduler is a cluster-wide component responsible for selecting the right node for a given workload. It is a core part of the platform and does not require any specific configuration in the common deployment scenarios. 
However, there are few specific use cases described in the following section. ++ +-- +The scheduler is a cluster-wide component responsible for selecting the correct node for a given workload. +It is a core part of the platform and does not require any specific configuration in the common deployment scenarios. +However, a few specific use cases are described in the following section. + NUMA-aware scheduling can be enabled through the NUMA Resources Operator. For more information, see "Scheduling NUMA-aware workloads". +-- Limits and requirements:: -* The default scheduler does not understand the NUMA locality of workloads. It only knows about the sum of all free resources on a worker node. This might cause workloads to be rejected when scheduled to a node with the topology manager policy set to `single-numa-node` or `restricted`. -** For example, consider a pod requesting 6 CPUs and being scheduled to an empty node that has 4 CPUs per NUMA node. The total allocatable capacity of the node is 8 CPUs and the scheduler will place the pod there. The node local admission will fail, however, as there are only 4 CPUs available in each of the NUMA nodes. -** All clusters with multi-NUMA nodes are required to use the NUMA Resources Operator. Use the `machineConfigPoolSelector` field in the `KubeletConfig` CR to select all nodes where NUMA aligned scheduling is needed. -* All machine config pools must have consistent hardware configuration for example all nodes are expected to have the same NUMA zone count. +* The default scheduler does not understand the NUMA locality of workloads. +It only knows about the sum of all free resources on a worker node. +This might cause workloads to be rejected when scheduled to a node with the topology manager policy set to `single-numa-node` or `restricted`. +For more information, see "Topology Manager policies". +** For example, consider a pod requesting 6 CPUs that is scheduled to an empty node that has 4 CPUs per NUMA node. 
+The total allocatable capacity of the node is 8 CPUs. The scheduler places the pod on the empty node. +The node local admission fails, as there are only 4 CPUs available in each of the NUMA nodes. +* All clusters with multi-NUMA nodes are required to use the NUMA Resources Operator. +See "Installing the NUMA Resources Operator" for more information. +Use the `machineConfigPoolSelector` field in the `KubeletConfig` CR to select all nodes where NUMA aligned scheduling is required. +* All machine config pools must have consistent hardware configuration. +For example, all nodes are expected to have the same NUMA zone count. Engineering considerations:: -* Pods might require annotations for correct scheduling and isolation. For more information on annotations, see "CPU partitioning and performance tuning". - +* Pods might require annotations for correct scheduling and isolation. +For more information about annotations, see "CPU partitioning and performance tuning". * You can configure SR-IOV virtual function NUMA affinity to be ignored during scheduling by using the `excludeTopology` field in `SriovNetworkNodePolicy` CR. diff --git a/modules/telco-core-security.adoc b/modules/telco-core-security.adoc index fb30ace2f7b5..548d7773bf2c 100644 --- a/modules/telco-core-security.adoc +++ b/modules/telco-core-security.adoc @@ -1,31 +1,69 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="telco-core-security_{context}"] = Security New in this release:: -//CNF-11806 -* Secure boot host firmware setting is now recommended for telco core clusters. -For more information, see "Host firmware and boot loader configuration". 
+// https://issues.redhat.com/browse/CNF-13768 +* link:https://access.redhat.com/articles/7090422[New knowledgebase article on creating custom node firewall rules] Description:: -You should harden clusters against multiple attack vectors. ++ +-- +Telco customers are security conscious and require clusters to be hardened against multiple attack vectors. In {product-title}, there is no single component or feature responsible for securing a cluster. Use the following security-oriented features and configurations to secure your clusters: * **SecurityContextConstraints (SCC)**: All workload pods should be run with `restricted-v2` or `restricted` SCC. -* **Seccomp**: All pods should be run with the `RuntimeDefault` (or stronger) seccomp profile. -* **Rootless DPDK pods**: Many user-plane networking (DPDK) CNFs require pods to run with root privileges. With this feature, a conformant DPDK pod can be run without requiring root privileges. +* **Seccomp**: All pods should run with the `RuntimeDefault` (or stronger) seccomp profile. +* **Rootless DPDK pods**: Many user-plane networking (DPDK) CNFs require pods to run with root privileges. +With this feature, a conformant DPDK pod can run without requiring root privileges. Rootless DPDK pods create a tap device in a rootless pod that injects traffic from a DPDK application to the kernel. -* **Storage**: The storage network should be isolated and non-routable to other cluster networks. See the "Storage" section for additional details. +* **Storage**: The storage network should be isolated and non-routable to other cluster networks. +See the "Storage" section for additional details. + +Refer to link:https://access.redhat.com/articles/7090422[Custom nftable firewall rules in OpenShift] for a supported method of implementing custom nftables firewall rules in OpenShift cluster nodes. +This article is intended for cluster administrators who are responsible for managing network security policies in OpenShift environments. 
+It is crucial to carefully consider the operational implications before deploying this method, including: + +* **Early application**: The rules are applied at boot time, before the network is fully operational. +Ensure the rules don't inadvertently block essential services required during the boot process. + +* **Risk of misconfiguration**: Errors in your custom rules can lead to unintended consequences, potentially leading to performance impact or blocking legitimate traffic or isolating nodes. +Thoroughly test your rules in a non-production environment before deploying them to your main cluster. + +* **External endpoints**: OpenShift requires access to external endpoints to function. +For more information about the firewall allowlist, see "Configuring your firewall for {product-title}". +Ensure that cluster nodes are permitted access to those endpoints. + +* **Node reboot**: Unless node disruption policies are configured, applying the `MachineConfig` CR with the required firewall settings causes a node reboot. +Be aware of this impact and schedule a maintenance window accordingly. +For more information, see "Using node disruption policies to minimize disruption from machine config changes". ++ +[NOTE] +==== +Node disruption policies are available in {product-title} 4.17 and later. +==== + +* **Network flow matrix**: For more information about managing ingress traffic, see "{product-title} network flow matrix". +You can restrict ingress traffic to essential flows to improve network security. +The matrix provides insights into base cluster services but excludes traffic generated by Day-2 Operators. + +* **Cluster version updates and upgrades**: Exercise caution when updating or upgrading OpenShift clusters. +Recent changes to the platform's firewall requirements might require adjustments to network port permissions. +Although the documentation provides guidelines, note that these requirements can evolve over time. 
+To minimize disruptions, you should test any updates or upgrades in a staging environment before applying them in production. +This helps you to identify and address potential compatibility issues related to firewall configuration changes. +-- Limits and requirements:: -* Rootless DPDK pods requires the following additional configuration steps: -** Configure the TAP plugin with the `container_t` SELinux context. -** Enable the `container_use_devices` SELinux boolean on the hosts. +* Rootless DPDK pods require the following additional configuration: +** Configure the `container_t` SELinux context for the tap plugin. +** Enable the `container_use_devices` SELinux boolean for the cluster host. Engineering considerations:: -* For rootless DPDK pod support, the SELinux boolean `container_use_devices` must be enabled on the host for the TAP device to be created. This introduces a security risk that is acceptable for short to mid-term use. Other solutions will be explored. +* For rootless DPDK pod support, enable the SELinux `container_use_devices` boolean on the host to allow the tap device to be created. +This introduces an acceptable security risk. diff --git a/modules/telco-core-service-mesh.adoc b/modules/telco-core-service-mesh.adoc index ac37352d107f..315bbb3a3650 100644 --- a/modules/telco-core-service-mesh.adoc +++ b/modules/telco-core-service-mesh.adoc @@ -1,17 +1,13 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="telco-core-service-mesh_{context}"] -= Service Mesh += Service mesh Description:: -{rds-caps} cloud-native functions (CNFs) typically require a service mesh implementation. -+ -[NOTE] -==== +Telco core cloud-native functions (CNFs) typically require a service mesh implementation. 
Specific service mesh features and performance requirements are dependent on the application. The selection of service mesh implementation and configuration is outside the scope of this documentation. You must account for the impact of service mesh on cluster resource usage and performance, including additional latency introduced in pod networking, in your implementation. -==== diff --git a/modules/telco-core-signaling-workloads.adoc b/modules/telco-core-signaling-workloads.adoc new file mode 100644 index 000000000000..c9042e6b5a9a --- /dev/null +++ b/modules/telco-core-signaling-workloads.adoc @@ -0,0 +1,11 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-signaling-workloads_{context}"] += Signaling workloads + +Signaling workloads typically use SCTP, REST, gRPC, or similar TCP or UDP protocols. +Signaling workloads support hundreds of thousands of transactions per second (TPS) by using a secondary Multus CNI configured as a MACVLAN or SR-IOV interface. +These workloads can run in pods with either guaranteed or burstable QoS. 
diff --git a/modules/telco-core-software-stack.adoc b/modules/telco-core-software-stack.adoc index 1d2cd1a7a801..528fad6feb17 100644 --- a/modules/telco-core-software-stack.adoc +++ b/modules/telco-core-software-stack.adoc @@ -1,6 +1,6 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-software-artifacts.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="telco-core-software-stack_{context}"] @@ -13,21 +13,27 @@ The Red{nbsp}Hat telco core {product-version} solution has been validated using |==== |Component |Software version +|{rh-rhacm-first} +|2.12^1^ + |Cluster Logging Operator -|6.0 +|6.1^2^ |{rh-storage} -|4.17 +|4.18 -|SR-IOV Operator -|4.17 +|SR-IOV Network Operator +|4.18 |MetalLB -|4.17 +|4.18 |NMState Operator -|4.17 +|4.18 |NUMA-aware scheduler -|4.17 +|4.18 |==== +[1] This table will be updated when the aligned {rh-rhacm} version 2.13 is released. + +[2] This table will be updated when the aligned Cluster Logging Operator 6.2 is released. diff --git a/modules/telco-core-sr-iov.adoc b/modules/telco-core-sr-iov.adoc new file mode 100644 index 000000000000..f26bc44961dc --- /dev/null +++ b/modules/telco-core-sr-iov.adoc @@ -0,0 +1,39 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-sr-iov_{context}"] += SR-IOV + +New in this release:: +// https://issues.redhat.com/browse/CNF-12678 +* You can now create virtual functions for Mellanox NICs with the SR-IOV Network Operator when secure boot is enabled in the cluster host. +Before you can create the virtual functions, you must first skip the firmware configuration for the Mellanox NIC and manually allocate the number of virtual functions in the firmware before switching the system to secure boot. 
+ +Description:: +SR-IOV enables physical functions (PFs) to be divided into multiple virtual functions (VFs). +VFs can then be assigned to multiple pods to achieve higher throughput performance while keeping the pods isolated. +The SR-IOV Network Operator provisions and manages SR-IOV CNI, network device plugin, and other components of the SR-IOV stack. + +Limits and requirements:: +* Only certain network interfaces are supported. +See "Supported devices" for more information. + +* Enabling SR-IOV and IOMMU: the SR-IOV Network Operator automatically enables IOMMU on the kernel command line. + +* SR-IOV VFs do not receive link state updates from the PF. +If a link down detection is required, it must be done at the protocol level. + +* `MultiNetworkPolicy` CRs can be applied to `netdevice` networks only. +This is because the implementation uses iptables, which cannot manage vfio interfaces. + +Engineering considerations:: +* SR-IOV interfaces in `vfio` mode are typically used to enable additional secondary networks for applications that require high throughput or low latency. +* The `SriovOperatorConfig` CR must be explicitly created. +This CR is included in the reference configuration policies, which causes it to be created during initial deployment. +* NICs that do not support firmware updates with UEFI secure boot or kernel lockdown must be preconfigured with sufficient virtual functions (VFs) enabled to support the number of VFs required by the application workload. +For Mellanox NICs, you must disable the Mellanox vendor plugin in the SR-IOV Network Operator. +See "Configuring an SR-IOV network device" for more information. +* To change the MTU value of a VF after the pod has started, do not configure the `SriovNetworkNodePolicy` MTU field. +Instead, use the Kubernetes NMState Operator to set the MTU of the related PF. 
diff --git a/modules/telco-core-storage.adoc b/modules/telco-core-storage.adoc index 9e5a75d7ecc3..6f72cb429afb 100644 --- a/modules/telco-core-storage.adoc +++ b/modules/telco-core-storage.adoc @@ -1,32 +1,28 @@ // Module included in the following assemblies: // -// * scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc :_mod-docs-content-type: REFERENCE [id="telco-core-storage_{context}"] = Storage -Cloud native storage services can be provided by multiple solutions including {rh-storage} from Red Hat or third parties. - -[id="telco-core-rh-storage_{context}"] -== {rh-storage} - New in this release:: * No reference design updates in this release Description:: -{rh-storage-first} is a software-defined storage service for containers. -For {rds-caps} clusters, storage support is provided by {rh-storage} storage services running externally to the application workload cluster. - -Limits and requirements:: -* In an IPv4/IPv6 dual-stack networking environment, {rh-storage} uses IPv4 addressing. For more information, see link:https://access.redhat.com/documentation/en-us/red_hat_openshift_data_foundation/4.13/html-single/4.13_release_notes/index#support_openshift_dual_stack_with_odf_using_ipv4[Support OpenShift dual stack with {rh-storage} using IPv4]. ++ +-- +Cloud native storage services can be provided by {rh-storage-first} or other third-party solutions. -Engineering considerations:: -* {rh-storage} network traffic should be isolated from other traffic on a dedicated network, for example, by using VLAN isolation. +{rh-storage} is a Ceph-based software-defined storage solution for containers. +It provides block storage, file system storage, and on-premise object storage, which can be dynamically provisioned for both persistent and non-persistent data requirements. +Telco core applications require persistent storage. 
-* Other storage solutions can be used to provide persistent storage for core clusters. -+ [NOTE] ==== -The configuration and integration of these solutions is outside the scope of the {rds} RDS. Integration of the storage solution into the core cluster must include correct sizing and performance analysis to ensure the storage meets overall performance and resource utilization requirements. +Storage data might not be encrypted in flight. +To reduce risk, isolate the storage network from other cluster networks. +The storage network must not be reachable, or routable, from other cluster networks. +Only nodes directly attached to the storage network should be allowed to gain access to it. ==== +-- diff --git a/modules/telco-core-topology-aware-lifecycle-manager.adoc b/modules/telco-core-topology-aware-lifecycle-manager.adoc new file mode 100644 index 000000000000..8361c5debf34 --- /dev/null +++ b/modules/telco-core-topology-aware-lifecycle-manager.adoc @@ -0,0 +1,26 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc + +:_mod-docs-content-type: REFERENCE +[id="telco-core-topology-aware-lifecycle-manager_{context}"] += Topology Aware Lifecycle Manager + +New in this release:: +No reference design updates in this release. + +Description:: +{cgu-operator-full} is an Operator which runs only on the hub cluster. +{cgu-operator} manages how changes including cluster and Operator upgrades, configurations, and so on, are rolled out to managed clusters in the network. +{cgu-operator} has the following core features: +* Provides sequenced updates of cluster configurations and upgrades ({product-title} and Operators) as defined by cluster policies. +* Provides for deferred application of cluster updates. +* Supports progressive rollout of policy updates to sets of clusters in user configurable batches. 
+* Allows for per-cluster actions by adding `ztp-done` or similar user-defined labels to clusters. + +Limits and requirements:: +* Supports concurrent cluster deployments in batches of 400. + +Engineering considerations:: +* Only policies with the `ran.openshift.io/ztp-deploy-wave` annotation are applied by {cgu-operator} during initial cluster installation. +* Any policy can be remediated by {cgu-operator} under control of a user created `ClusterGroupUpgrade` CR. diff --git a/scalability_and_performance/cluster-compare/understanding-the-cluster-compare-plugin.adoc b/scalability_and_performance/cluster-compare/understanding-the-cluster-compare-plugin.adoc index 8148f850af31..2e163a36ba71 100644 --- a/scalability_and_performance/cluster-compare/understanding-the-cluster-compare-plugin.adoc +++ b/scalability_and_performance/cluster-compare/understanding-the-cluster-compare-plugin.adoc @@ -17,7 +17,7 @@ include::modules/understanding-a-reference-config.adoc[leveloffset=+1] [role="_additional-resources"] == Additional resources -* xref:../../scalability_and_performance/telco_ref_design_specs/telco-ref-design-specs-overview.adoc#telco-ref-design-overview_telco_ref_design_specs[Reference design specifications for telco 5G deployments] +* xref:../../scalability_and_performance/telco_ran_du_ref_design_specs/telco-ran-du-rds.adoc#telco-ran-du-ref-design-specs[Telco RAN DU reference design specification for {product-title}] diff --git a/scalability_and_performance/cluster-compare/using-the-cluster-compare-plugin.adoc b/scalability_and_performance/cluster-compare/using-the-cluster-compare-plugin.adoc index 825ba3184d0a..09e9251d6b79 100644 --- a/scalability_and_performance/cluster-compare/using-the-cluster-compare-plugin.adoc +++ b/scalability_and_performance/cluster-compare/using-the-cluster-compare-plugin.adoc @@ -25,4 +25,4 @@ include::modules/using-cluster-compare-telco-ref.adoc[leveloffset=+1] [id="additional-resources_{context}"] == Additional resources * 
xref:../../scalability_and_performance/telco_ran_du_ref_design_specs/telco-ran-du-rds.adoc#using-cluster-compare-telco_ref_ran-ref-design-crs[Comparing a cluster with the telco RAN DU reference configuration] -* xref:../../scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-crs.adoc#using-cluster-compare-telco_ref_ran-core-ref-design-crs[Comparing a cluster with the telco core reference configuration] +* xref:../../scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc#using-cluster-compare-telco_ref_telco-core[Comparing a cluster with the telco core reference configuration] diff --git a/scalability_and_performance/index.adoc b/scalability_and_performance/index.adoc index 6760db1a0454..18e0f2dd7b24 100644 --- a/scalability_and_performance/index.adoc +++ b/scalability_and_performance/index.adoc @@ -30,7 +30,7 @@ xref:../scalability_and_performance/recommended-performance-scale-practices/reco xref:../scalability_and_performance/telco_ran_du_ref_design_specs/telco-ran-du-rds.adoc#telco-ran-du-ref-design-specs[Telco RAN DU reference design specification for {product-title} {product-version}] -xref:../scalability_and_performance/telco_ref_design_specs/core/telco-core-rds-overview.adoc#telco-core-cluster-service-based-architecture-and-networking-topology_core-ref-design-overview[Telco core reference design specification] +xref:../scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc#telco-core-ref-design-specs[Telco core reference design specification] [discrete] == Planning, optimization, and measurement diff --git a/scalability_and_performance/telco_ref_design_specs/core/_attributes b/scalability_and_performance/telco_core_ref_design_specs/_attributes similarity index 100% rename from scalability_and_performance/telco_ref_design_specs/core/_attributes rename to scalability_and_performance/telco_core_ref_design_specs/_attributes diff --git a/scalability_and_performance/telco_ref_design_specs/core/images 
b/scalability_and_performance/telco_core_ref_design_specs/images similarity index 100% rename from scalability_and_performance/telco_ref_design_specs/core/images rename to scalability_and_performance/telco_core_ref_design_specs/images diff --git a/scalability_and_performance/telco_ref_design_specs/core/modules b/scalability_and_performance/telco_core_ref_design_specs/modules similarity index 100% rename from scalability_and_performance/telco_ref_design_specs/core/modules rename to scalability_and_performance/telco_core_ref_design_specs/modules diff --git a/scalability_and_performance/telco_ref_design_specs/core/snippets b/scalability_and_performance/telco_core_ref_design_specs/snippets similarity index 100% rename from scalability_and_performance/telco_ref_design_specs/core/snippets rename to scalability_and_performance/telco_core_ref_design_specs/snippets diff --git a/scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc b/scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc new file mode 100644 index 000000000000..7325368043c0 --- /dev/null +++ b/scalability_and_performance/telco_core_ref_design_specs/telco-core-rds.adoc @@ -0,0 +1,235 @@ +:_mod-docs-content-type: ASSEMBLY +:telco-core: +[id="telco-core-ref-design-specs"] += Telco core reference design specifications +include::_attributes/common-attributes.adoc[] +:context: telco-core + +toc::[] + +The telco core reference design specification (RDS) configures an {product-title} cluster running on commodity hardware to host telco core workloads. 
+ +include::modules/telco-core-rds-product-version-use-model-overview.adoc[leveloffset=+1] + +include::modules/telco-core-about-the-telco-core-cluster-use-model.adoc[leveloffset=+1] + +include::modules/telco-ran-core-ref-design-spec.adoc[leveloffset=+2] + +include::modules/telco-deviations-from-the-ref-design.adoc[leveloffset=+2] + +include::modules/telco-core-common-baseline-model.adoc[leveloffset=+1] + +include::modules/telco-core-cluster-common-use-model-engineering-considerations.adoc[leveloffset=+1] + +include::modules/telco-core-application-workloads.adoc[leveloffset=+2] + +include::modules/telco-core-signaling-workloads.adoc[leveloffset=+2] + +[id="telco-core-rds-components"] +== Telco core RDS components + +The following sections describe the various {product-title} components and configurations that you use to configure and deploy clusters to run telco core workloads. + +include::modules/telco-core-cpu-partitioning-and-performance-tuning.adoc[leveloffset=+2] + + +[role="_additional-resources"] +.Additional resources + +* xref:../../scalability_and_performance/cnf-tuning-low-latency-nodes-with-perf-profile.adoc#cnf-create-performance-profiles[Creating a performance profile] + +* xref:../../edge_computing/ztp-reference-cluster-configuration-for-vdu.adoc#ztp-du-configuring-host-firmware-requirements_sno-configure-for-vdu[Configuring host firmware for low latency and high performance] + +* xref:../../installing/install_config/enabling-cgroup-v1.adoc#nodes-clusters-cgroups-2-install_nodes-cluster-cgroups-1[Enabling Linux cgroup v1 during installation] + +include::modules/telco-core-service-mesh.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources + +* xref:../../service_mesh/v2x/ossm-about.adoc#ossm-about[About OpenShift Service Mesh] + +include::modules/telco-core-networking.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources + +* 
xref:../../networking/understanding-networking.adoc#understanding-networking[Understanding networking] + +include::modules/telco-core-cluster-network-operator.adoc[leveloffset=+3] + +[role="_additional-resources"] +.Additional resources + +* xref:../../networking/networking_operators/cluster-network-operator.adoc#nw-cluster-network-operator_cluster-network-operator[Cluster Network Operator] + +include::modules/telco-core-load-balancer.adoc[leveloffset=+3] + +[role="_additional-resources"] +.Additional resources + +* xref:../../networking/networking_operators/metallb-operator/about-metallb.adoc#nw-metallb-when-metallb_about-metallb-and-metallb-operator[When to use MetalLB] + +include::modules/telco-core-sr-iov.adoc[leveloffset=+3] + +[role="_additional-resources"] +.Additional resources + +* xref:../../networking/hardware_networks/about-sriov.adoc#about-sriov[About Single Root I/O Virtualization (SR-IOV) hardware networks] + +* xref:../../networking/hardware_networks/about-sriov.adoc#supported-devices_about-sriov[Supported devices] + +* xref:../../networking/hardware_networks/configuring-sriov-device.adoc#nw-sriov-nic-mlx-secure-boot_configuring-sriov-device[Configuring the SR-IOV Network Operator on Mellanox cards when Secure Boot is enabled] + +include::modules/telco-core-nmstate-operator.adoc[leveloffset=+3] + +[role="_additional-resources"] +.Additional resources + +* xref:../../networking/networking_operators/k8s-nmstate-about-the-k8s-nmstate-operator.adoc#k8s-nmstate-about-the-k8s-nmstate-operator[Kubernetes NMState Operator] + +include::modules/telco-core-logging.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources + +* link:https://docs.openshift.com/container-platform/4.17/observability/logging/logging-6.0/log6x-about.html[Logging 6.0] + +include::modules/telco-core-power-management.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources + +*
xref:../../rest_api/node_apis/performanceprofile-performance-openshift-io-v2.adoc#spec-workloadhints[performance.openshift.io/v2 API reference] + +* xref:../../scalability_and_performance/cnf-tuning-low-latency-nodes-with-perf-profile.adoc#cnf-configuring-power-saving-for-nodes_cnf-low-latency-perf-profile[Configuring power saving for nodes] + +* xref:../../scalability_and_performance/cnf-tuning-low-latency-nodes-with-perf-profile.adoc#cnf-configuring-power-saving-for-nodes_cnf-low-latency-perf-profile[Configuring power saving for nodes that run colocated high and low priority workloads] + +include::modules/telco-core-storage.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources + +* xref:../../storage/persistent_storage/persistent-storage-ocs.adoc#red-hat-openshift-data-foundation[{rh-storage-first}] + +include::modules/telco-core-openshift-data-foundation.adoc[leveloffset=+3] + +include::modules/telco-core-additional-storage-solutions.adoc[leveloffset=+3] + +[id="telco-reference-core-deployment-components_{context}"] +=== Telco core deployment components + +The following sections describe the various {product-title} components and configurations that you use to configure the hub cluster with {rh-rhacm-first}. 
+ +include::modules/telco-core-red-hat-advanced-cluster-management.adoc[leveloffset=+3] + +[role="_additional-resources"] +.Additional resources + +* xref:../../edge_computing/ztp-deploying-far-edge-clusters-at-scale.adoc#about-ztp_ztp-deploying-far-edge-clusters-at-scale[Using {ztp} to provision clusters at the network far edge] + +* link:https://docs.redhat.com/en/documentation/red_hat_advanced_cluster_management_for_kubernetes[Red Hat Advanced Cluster Management for Kubernetes] + +include::modules/telco-core-topology-aware-lifecycle-manager.adoc[leveloffset=+3] + +[role="_additional-resources"] +.Additional resources + +* xref:../../edge_computing/cnf-talm-for-cluster-upgrades.adoc#cnf-talm-for-cluster-updates[Updating managed clusters with the {cgu-operator-full}] + +include::modules/telco-core-gitops-operator-and-ztp-plugins.adoc[leveloffset=+3] + +[role="_additional-resources"] +.Additional resources + +* xref:../../edge_computing/ztp-preparing-the-hub-cluster.adoc#ztp-preparing-the-ztp-git-repository-ver-ind_ztp-preparing-the-hub-cluster[Preparing the {ztp} site configuration repository for version independence] + +* xref:../../edge_computing/policygentemplate_for_ztp/ztp-advanced-policy-config.adoc#ztp-adding-new-content-to-gitops-ztp_ztp-advanced-policy-config[Adding custom content to the {ztp} pipeline] + +include::modules/telco-core-monitoring.adoc[leveloffset=+3] + +[role="_additional-resources"] +.Additional resources + +* xref:../../observability/monitoring/about-ocp-monitoring/about-ocp-monitoring.adoc#about-ocp-monitoring[About {product-title} monitoring] + +include::modules/telco-core-scheduling.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources + +* xref:../../scalability_and_performance/cnf-numa-aware-scheduling.adoc#installing-the-numa-resources-operator_numa-aware[Installing the NUMA Resources Operator] + +* xref:../../scalability_and_performance/cnf-numa-aware-scheduling.adoc#cnf-numa-aware-scheduling[Scheduling 
NUMA-aware workloads] + +* xref:../../scalability_and_performance/using-cpu-manager.adoc#topology_manager_policies_using-cpu-manager-and-topology_manager[Topology Manager policies] + +include::modules/telco-core-node-configuration.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources + +* xref:../../edge_computing/ztp-reference-cluster-configuration-for-vdu.adoc#ztp-sno-du-enabling-kdump_sno-configure-for-vdu[Automatic kernel crash dumps with kdump] + +* xref:../../scalability_and_performance/optimization/optimizing-cpu-usage.adoc#optimizing-cpu-usage[Optimizing CPU usage with mount namespace encapsulation] + +include::modules/telco-core-host-firmware-and-boot-loader-configuration.adoc[leveloffset=+2] + +include::modules/telco-core-disconnected-environment.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources + +* xref:../../disconnected/updating/index.adoc#about-disconnected-updates[About cluster updates in a disconnected environment] + + +include::modules/telco-core-agent-based-installer.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources + +* xref:../../installing/installing_with_agent_based_installer/installing-with-agent-based-installer.adoc#installing-with-agent-based-installer[Installing an {product-title} cluster with the Agent-based Installer] + +include::modules/telco-core-security.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources + +* xref:../../installing/install_config/configuring-firewall.adoc#configuring-firewall_configuring-firewall[Configuring your firewall for {product-title}] + +* xref:../../installing/install_config/configuring-firewall.adoc#network-flow-matrix_configuring-firewall[{product-title} network flow matrix] + +* xref:../../authentication/managing-security-context-constraints.adoc#managing-pod-security-policies[Managing security context constraints] + +*
xref:../../machine_configuration/machine-config-node-disruption.adoc#machine-config-node-disruption_machine-configs-configure[Using node disruption policies to minimize disruption from machine config changes] + +include::modules/telco-core-scalability.adoc[leveloffset=+2] + +[id="telco-core-reference-configuration-crs"] +== Telco core reference configuration CRs + +Use the following custom resources (CRs) to configure and deploy {product-title} clusters with the telco core profile. +Use the CRs to form the common baseline used in all the specific use models unless otherwise indicated. + +include::modules/telco-core-rds-container.adoc[leveloffset=+2] + +include::modules/using-cluster-compare-telco-ref.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources +* xref:../../scalability_and_performance/cluster-compare/understanding-the-cluster-compare-plugin.adoc#understanding-the-cluster-compare-plugin[Understanding the cluster-compare plugin] + +include::modules/telco-core-crs-node-configuration.adoc[leveloffset=+2] + +include::modules/telco-core-crs-resource-tuning.adoc[leveloffset=+2] + +include::modules/telco-core-crs-networking.adoc[leveloffset=+2] + +include::modules/telco-core-crs-scheduling.adoc[leveloffset=+2] + +include::modules/telco-core-crs-storage.adoc[leveloffset=+2] + +include::modules/telco-core-software-stack.adoc[leveloffset=+1] + +:!telco-core: diff --git a/scalability_and_performance/telco_ref_design_specs/core/telco-core-rds-overview.adoc b/scalability_and_performance/telco_ref_design_specs/core/telco-core-rds-overview.adoc deleted file mode 100644 index e72c31dfa2de..000000000000 --- a/scalability_and_performance/telco_ref_design_specs/core/telco-core-rds-overview.adoc +++ /dev/null @@ -1,14 +0,0 @@ -:_mod-docs-content-type: ASSEMBLY -:telco-core: -:context: core-ref-design-overview -include::_attributes/common-attributes.adoc[] -[id="telco-core-ref-design-overview"] -= {rds-caps} {product-version} reference design overview - 
-toc::[] - -The {rds} reference design specification (RDS) configures an {product-title} cluster running on commodity hardware to host {rds} workloads. - -include::modules/telco-core-cluster-service-based-architecture-and-networking-topology.adoc[leveloffset=+1] - -:!telco-core: diff --git a/scalability_and_performance/telco_ref_design_specs/core/telco-core-rds-use-cases.adoc b/scalability_and_performance/telco_ref_design_specs/core/telco-core-rds-use-cases.adoc deleted file mode 100644 index 98c7e8d073c4..000000000000 --- a/scalability_and_performance/telco_ref_design_specs/core/telco-core-rds-use-cases.adoc +++ /dev/null @@ -1,33 +0,0 @@ -:_mod-docs-content-type: ASSEMBLY -:telco-core: -include::_attributes/common-attributes.adoc[] -[id="telco-ran-rds-overview"] -= {rds-caps} {product-version} use model overview -:context: ran-core-design-overview - -toc::[] - -{rds-caps} clusters are configured as standard three control plane clusters with worker nodes configured with the stock non real-time (RT) kernel. - -To support workloads with varying networking and performance requirements, worker nodes are segmented using `MachineConfigPool` CRs. For example, this is done to separate non-user data plane nodes from high-throughput nodes. To support the required telco operational features, the clusters have a standard set of Operator Lifecycle Manager (OLM) Day 2 Operators installed. - -The networking prerequisites for {rds} functions are diverse and encompass an array of networking attributes and performance benchmarks. -IPv6 is mandatory, with dual-stack configurations being prevalent. Certain functions demand maximum throughput and transaction rates, necessitating user plane networking support such as DPDK. Other functions adhere to conventional cloud-native patterns and can use solutions such as OVN-K, kernel networking, and load balancing. 
- - - -.Telco core use model architecture -image:473_OpenShift_Telco_Core_Reference_arch_1123.png[Use model architecture] - -include::modules/telco-core-ref-design-baseline-model.adoc[leveloffset=+1] - -include::modules/telco-core-ref-eng-usecase-model.adoc[leveloffset=+2] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc#telco-core-cpu-partitioning-performance-tune_core-ref-design-components[CPU partitioning and performance tuning] - -include::modules/telco-core-ref-application-workloads.adoc[leveloffset=+2] - -:!telco-core: diff --git a/scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-crs.adoc b/scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-crs.adoc deleted file mode 100644 index d0c7b7750145..000000000000 --- a/scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-crs.adoc +++ /dev/null @@ -1,48 +0,0 @@ -:_mod-docs-content-type: ASSEMBLY -:telco-core: -include::_attributes/common-attributes.adoc[] -[id="telco-core-ref-du-crs"] -= {rds-caps} {product-version} reference configuration CRs -:context: ran-core-ref-design-crs - -toc::[] - -Use the following custom resources (CRs) to configure and deploy {product-title} clusters with the {rds} profile. -Use the CRs to form the common baseline used in all the specific use models unless otherwise indicated. 
- -include::modules/telco-core-rds-container.adoc[leveloffset=+1] - -include::modules/using-cluster-compare-telco-ref.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources -* xref:../../../scalability_and_performance/cluster-compare/understanding-the-cluster-compare-plugin.adoc#understanding-the-cluster-compare-plugin[Understanding the cluster-compare plugin] - -include::modules/telco-core-crs-networking.adoc[leveloffset=+1] - -include::modules/telco-core-crs-node-configuration.adoc[leveloffset=+1] - -include::modules/telco-core-crs-other.adoc[leveloffset=+1] - -include::modules/telco-core-crs-resource-tuning.adoc[leveloffset=+1] - -include::modules/telco-core-crs-scheduling.adoc[leveloffset=+1] - -include::modules/telco-core-crs-storage.adoc[leveloffset=+1] - -[id="telco-reference-core-use-case-yaml_{context}"] -== YAML reference - -include::modules/telco-core-yaml-ref-networking.adoc[leveloffset=+2] - -include::modules/telco-core-yaml-ref-node-configuration.adoc[leveloffset=+2] - -include::modules/telco-core-yaml-ref-other.adoc[leveloffset=+2] - -include::modules/telco-core-yaml-ref-resource-tuning.adoc[leveloffset=+2] - -include::modules/telco-core-yaml-ref-scheduling.adoc[leveloffset=+2] - -include::modules/telco-core-yaml-ref-storage.adoc[leveloffset=+2] - -:!telco-core: diff --git a/scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc b/scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc deleted file mode 100644 index 00e79e8752a8..000000000000 --- a/scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc +++ /dev/null @@ -1,182 +0,0 @@ -:_mod-docs-content-type: ASSEMBLY -:telco-core: -include::_attributes/common-attributes.adoc[] -[id="telco-core-ref-components"] -= {rds-caps} reference design components -:context: core-ref-design-components - -toc::[] - -The following sections describe the various {product-title} 
components and configurations that you use to configure and deploy clusters to run {rds} workloads. - -include::modules/telco-core-cpu-partitioning-performance-tune.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../scalability_and_performance/cnf-tuning-low-latency-nodes-with-perf-profile.adoc#cnf-create-performance-profiles[Creating a performance profile] - -* xref:../../../edge_computing/ztp-reference-cluster-configuration-for-vdu.adoc#ztp-du-configuring-host-firmware-requirements_sno-configure-for-vdu[Configuring host firmware for low latency and high performance] - -* xref:../../../installing/install_config/enabling-cgroup-v1.adoc#nodes-clusters-cgroups-2-install_nodes-cluster-cgroups-1[Enabling Linux cgroup v1 during installation] - -include::modules/telco-core-service-mesh.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../service_mesh/v2x/ossm-about.adoc#ossm-about[About OpenShift Service Mesh] - -include::modules/telco-core-rds-networking.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../networking/understanding-networking.adoc#understanding-networking[Understanding networking] - -include::modules/telco-core-cluster-network-operator.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../networking/networking_operators/cluster-network-operator.adoc#nw-cluster-network-operator_cluster-network-operator[Cluster Network Operator] - -include::modules/telco-core-load-balancer.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../networking/networking_operators/metallb-operator/about-metallb.adoc#nw-metallb-when-metallb_about-metallb-and-metallb-operator[When to use MetalLB] - -include::modules/telco-core-sriov.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -* 
xref:../../../networking/hardware_networks/about-sriov.adoc#about-sriov[About Single Root I/O Virtualization (SR-IOV) hardware networks] - -* xref:../../../networking/hardware_networks/about-sriov.adoc#supported-devices_about-sriov[Supported devices] - -include::modules/telco-nmstate-operator.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../networking/networking_operators/k8s-nmstate-about-the-k8s-nmstate-operator.adoc#k8s-nmstate-about-the-k8s-nmstate-operator[Kubernetes NMState Operator] - -include::modules/telco-core-logging.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -//* xref:../../../observability/logging/logging-6.0/log6x-about.adoc#log6x-about[About logging] -* link:https://docs.openshift.com/container-platform/4.17/observability/logging/logging-6.0/log6x-about.html[About logging] - -include::modules/telco-core-power-management.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../rest_api/node_apis/performanceprofile-performance-openshift-io-v2.adoc#spec-workloadhints[Performance Profile] - -* xref:../../../scalability_and_performance/cnf-tuning-low-latency-nodes-with-perf-profile.adoc#cnf-configuring-power-saving-for-nodes_cnf-low-latency-perf-profile[Configuring power saving for nodes] - -* xref:../../../scalability_and_performance/cnf-tuning-low-latency-nodes-with-perf-profile.adoc#cnf-configuring-power-saving-for-nodes_cnf-low-latency-perf-profile[Configuring power saving for nodes that run colocated high and low priority workloads] - -include::modules/telco-core-storage.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../storage/persistent_storage/persistent-storage-ocs.adoc#red-hat-openshift-data-foundation[{rh-storage-first}] - -[id="telco-reference-core-deployment-components_{context}"] -== {rds-caps} deployment components - -The following sections describe the various 
{product-title} components and configurations that you use to configure the hub cluster with {rh-rhacm-first}. - -include::modules/telco-core-red-hat-advanced-cluster-management-rhacm.adoc[leveloffset=+2] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../edge_computing/ztp-deploying-far-edge-clusters-at-scale.adoc#about-ztp_ztp-deploying-far-edge-clusters-at-scale[Using {ztp} to provision clusters at the network far edge] - -* link:https://docs.redhat.com/en/documentation/red_hat_advanced_cluster_management_for_kubernetes[Red Hat Advanced Cluster Management for Kubernetes] - -include::modules/telco-ran-topology-aware-lifecycle-manager-talm.adoc[leveloffset=+2] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../edge_computing/cnf-talm-for-cluster-upgrades.adoc#cnf-talm-for-cluster-updates[Updating managed clusters with the {cgu-operator-full}] - -include::modules/telco-ran-gitops-operator-and-ztp-plugins.adoc[leveloffset=+2] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../edge_computing/ztp-preparing-the-hub-cluster.adoc#ztp-preparing-the-ztp-git-repository-ver-ind_ztp-preparing-the-hub-cluster[Preparing the {ztp} site configuration repository for version independence] - -* xref:../../../edge_computing/policygentemplate_for_ztp/ztp-advanced-policy-config.adoc#ztp-adding-new-content-to-gitops-ztp_ztp-advanced-policy-config[Adding custom content to the {ztp} pipeline] - -include::modules/telco-core-agent-based-installer-abi.adoc[leveloffset=+2] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../installing/installing_with_agent_based_installer/installing-with-agent-based-installer.adoc#installing-with-agent-based-installer[Installing an {product-title} cluster with the Agent-based Installer] - -include::modules/telco-core-monitoring.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -* 
xref:../../../observability/monitoring/about-ocp-monitoring/about-ocp-monitoring.adoc#about-ocp-monitoring[About {product-title} monitoring] - -include::modules/telco-core-scheduling.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../nodes/scheduling/nodes-scheduler-about.adoc#nodes-scheduler-about[Controlling pod placement using the scheduler] - -* xref:../../../scalability_and_performance/cnf-numa-aware-scheduling.adoc#cnf-numa-aware-scheduling[Scheduling NUMA-aware workloads] - -* xref:../../../scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc#telco-core-cpu-partitioning-performance-tune_core-ref-design-components[CPU partitioning and performance tuning] - -include::modules/telco-core-node-configuration.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../edge_computing/ztp-reference-cluster-configuration-for-vdu.adoc#ztp-sno-du-enabling-kdump_sno-configure-for-vdu[Automatic kernel crash dumps with kdump] - -* xref:../../../scalability_and_performance/optimization/optimizing-cpu-usage.adoc#optimizing-cpu-usage[Optimizing CPU usage with mount namespace encapsulation] - -include::modules/telco-core-host-firmware-bootloader.adoc[leveloffset=+1] - -include::modules/telco-core-rds-disconnected.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../disconnected/updating/index.adoc#about-disconnected-updates[About cluster updates in a disconnected environment] - -include::modules/telco-core-security.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../authentication/managing-security-context-constraints.adoc#managing-pod-security-policies[Managing security context constraints] - -* 
xref:../../../scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-design-components.adoc#telco-core-host-firmware-and-bootloader-configuration_core-ref-design-components[Host firmware and boot loader configuration] - -include::modules/telco-core-scalability.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources - -* xref:../../../scalability_and_performance/telco_ref_design_specs/core/telco-core-rds-use-cases.adoc#telco-core-ref-eng-usecase-model_ran-core-design-overview[{rds-caps} RDS engineering considerations] - -:!telco-core: diff --git a/scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-software-artifacts.adoc b/scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-software-artifacts.adoc deleted file mode 100644 index 071d6d453fe7..000000000000 --- a/scalability_and_performance/telco_ref_design_specs/core/telco-core-ref-software-artifacts.adoc +++ /dev/null @@ -1,11 +0,0 @@ -:_mod-docs-content-type: ASSEMBLY -[id="telco-core-ref-software-artifacts"] -= Telco core reference configuration software specifications -:context: core-ref-design-validation -include::_attributes/common-attributes.adoc[] - -toc::[] - -The following information describes the telco core reference design specification (RDS) validated software versions. - -include::modules/telco-core-software-stack.adoc[leveloffset=+1]