From b0b8e85bce33642de86f2f13a81b3544b3f8e778 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Wed, 8 Nov 2023 11:04:14 +0100 Subject: [PATCH 01/12] Support flattened data_stream.* fields (#3465) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An input configuration supports flattened fields, however the 'data_stream' field was not being correctly decoded when flattened. This commit fixes this issue. Some small additions and refactoring are also implemented in the integration test framework as well as some more detailed documentation. --------- Co-authored-by: Paolo Chilà --- .buildkite/pipeline.yml | 2 +- ...Support-flattened-data_stream.-fields.yaml | 35 ++ pkg/component/component_test.go | 71 +++ pkg/component/config.go | 80 ++++ pkg/component/config_test.go | 9 +- pkg/testing/fixture.go | 9 +- pkg/testing/tools/estools/elasticsearch.go | 64 ++- testing/integration/logs_ingestion_test.go | 430 ++++++++++++++++++ testing/integration/monitoring_logs_test.go | 213 --------- 9 files changed, 692 insertions(+), 221 deletions(-) create mode 100644 changelog/fragments/1695389490-Support-flattened-data_stream.-fields.yaml create mode 100644 testing/integration/logs_ingestion_test.go delete mode 100644 testing/integration/monitoring_logs_test.go diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 0f114dcaa4a..cbd0ae35884 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -140,7 +140,7 @@ steps: key: "serverless-integration-tests" env: TEST_INTEG_AUTH_ESS_REGION: us-east-1 - command: ".buildkite/scripts/steps/integration_tests.sh serverless integration:single TestMonitoringLogsShipped" #right now, run a single test in serverless mode as a sort of smoke test, instead of re-running the entire suite + command: ".buildkite/scripts/steps/integration_tests.sh serverless integration:single TestLogIngestionFleetManaged" #right now, run a single test in serverless mode as a sort of smoke test, instead 
of re-running the entire suite artifact_paths: - "build/TEST-**" - "build/diagnostics/*" diff --git a/changelog/fragments/1695389490-Support-flattened-data_stream.-fields.yaml b/changelog/fragments/1695389490-Support-flattened-data_stream.-fields.yaml new file mode 100644 index 00000000000..1ce991c2c38 --- /dev/null +++ b/changelog/fragments/1695389490-Support-flattened-data_stream.-fields.yaml @@ -0,0 +1,35 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: feature + +# Change summary; a 80ish characters long description of the change. +summary: Support flattened data_stream.* fields in input configuration + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +description: >- + An input configuration supports flattened fields, however the + 'data_stream' field was not being correctly decoded when + flattened. This commit fixes this issue. + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. 
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/3465 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +issue: https://github.com/elastic/elastic-agent/issues/3191 diff --git a/pkg/component/component_test.go b/pkg/component/component_test.go index 424f3a93147..00c4d1c63cb 100644 --- a/pkg/component/component_test.go +++ b/pkg/component/component_test.go @@ -2361,3 +2361,74 @@ func gatherDurationFieldPaths(s interface{}, pathSoFar string) []string { return gatheredPaths } + +func TestFlattenedDataStream(t *testing.T) { + expectedNamespace := "test-namespace" + expectedType := "test-type" + expectedDataset := "test-dataset" + + policy := map[string]any{ + "outputs": map[string]any{ + "default": map[string]any{ + "type": "elasticsearch", + "enabled": true, + }, + }, + "inputs": []any{ + map[string]any{ + "type": "filestream", + "id": "filestream-0", + "enabled": true, + "data_stream.type": expectedType, + "data_stream.dataset": expectedDataset, + "data_stream": map[string]any{ + "namespace": expectedNamespace, + }, + }, + }, + } + runtime, err := LoadRuntimeSpecs(filepath.Join("..", "..", "specs"), PlatformDetail{}, SkipBinaryCheck()) + if err != nil { + t.Fatalf("cannot load runtime specs: %s", err) + } + + result, err := runtime.ToComponents(policy, nil, logp.DebugLevel, nil) + if err != nil { + t.Fatalf("cannot convert policy to component: %s", err) + } + + if len(result) != 1 { + t.Fatalf("expecting result to have one element, got %d", len(result)) + } + + if len(result[0].Units) != 2 { + t.Fatalf("expecting result[0].Units to have two elements, got %d", len(result)) + } + + // We do not make assumptions about ordering. 
+ // Get the input Unit + var dataStream *proto.DataStream + for _, unit := range result[0].Units { + if unit.Err != nil { + t.Fatalf("unit.Err: %s", unit.Err) + } + if unit.Type == client.UnitTypeInput { + dataStream = unit.Config.DataStream + break + } + } + + if dataStream == nil { + t.Fatal("DataStream cannot be nil") + } + + if dataStream.Dataset != expectedDataset { + t.Errorf("expecting DataStream.Dataset: %q, got: %q", expectedDataset, dataStream.Dataset) + } + if dataStream.Type != expectedType { + t.Errorf("expecting DataStream.Type: %q, got: %q", expectedType, dataStream.Type) + } + if dataStream.Namespace != expectedNamespace { + t.Errorf("expecting DataStream.Namespace: %q, got: %q", expectedNamespace, dataStream.Namespace) + } +} diff --git a/pkg/component/config.go b/pkg/component/config.go index a0c75d00e32..781e2e7624f 100644 --- a/pkg/component/config.go +++ b/pkg/component/config.go @@ -15,6 +15,7 @@ import ( "google.golang.org/protobuf/types/known/structpb" "github.com/elastic/elastic-agent-client/v7/pkg/proto" + "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/elastic-agent/pkg/limits" ) @@ -100,9 +101,88 @@ func ExpectedConfig(cfg map[string]interface{}) (*proto.UnitExpectedConfig, erro return nil, err } + if err := updateDataStreamsFromSource(result); err != nil { + return nil, fmt.Errorf("could not dedot 'data_stream': %w", err) + } + return result, nil } +func deDotDataStream(ds *proto.DataStream, source *structpb.Struct) (*proto.DataStream, error) { + if ds == nil { + ds = &proto.DataStream{} + } + + cfg, err := config.NewConfigFrom(source.AsMap()) + if err != nil { + return nil, fmt.Errorf("cannot generate config from source field: %w", err) + } + + // Create a temporary struct to unpack the configuration. + // Unpack correctly handles any flattened fields like + // data_stream.type. So all we need to do is to call Unpack, + // ensure the DataStream does not have a different value, + // then merge them both. 
+ tmp := struct { + DataStream struct { + Dataset string `config:"dataset" yaml:"dataset"` + Type string `config:"type" yaml:"type"` + Namespace string `config:"namespace" yaml:"namespace"` + } `config:"data_stream" yaml:"data_stream"` + }{} + + if err := cfg.Unpack(&tmp); err != nil { + return nil, fmt.Errorf("cannot unpack source field into struct: %w", err) + } + + if (ds.Dataset != tmp.DataStream.Dataset) && (ds.Dataset != "" && tmp.DataStream.Dataset != "") { + return nil, errors.New("duplicated key 'datastream.dataset'") + } + + if (ds.Type != tmp.DataStream.Type) && (ds.Type != "" && tmp.DataStream.Type != "") { + return nil, errors.New("duplicated key 'datastream.type'") + } + + if (ds.Namespace != tmp.DataStream.Namespace) && (ds.Namespace != "" && tmp.DataStream.Namespace != "") { + return nil, errors.New("duplicated key 'datastream.namespace'") + } + + ret := &proto.DataStream{ + Dataset: valueOrDefault(tmp.DataStream.Dataset, ds.Dataset), + Type: valueOrDefault(tmp.DataStream.Type, ds.Type), + Namespace: valueOrDefault(tmp.DataStream.Namespace, ds.Namespace), + Source: ds.GetSource(), + } + + return ret, nil +} + +// valueOrDefault returns b if a is an empty string +func valueOrDefault(a, b string) string { + if a == "" { + return b + } + return a +} + +func updateDataStreamsFromSource(unitConfig *proto.UnitExpectedConfig) error { + var err error + unitConfig.DataStream, err = deDotDataStream(unitConfig.GetDataStream(), unitConfig.GetSource()) + if err != nil { + return fmt.Errorf("could not parse data_stream from input: %w", err) + } + + for i, stream := range unitConfig.Streams { + stream.DataStream, err = deDotDataStream(stream.GetDataStream(), stream.GetSource()) + if err != nil { + return fmt.Errorf("could not parse data_stream from stream [%d]: %w", + i, err) + } + } + + return nil +} + func setSource(val interface{}, cfg map[string]interface{}) error { // find the source field on the val resVal := reflect.ValueOf(val).Elem() diff --git 
a/pkg/component/config_test.go b/pkg/component/config_test.go index 64dcfe3a697..7cdef177829 100644 --- a/pkg/component/config_test.go +++ b/pkg/component/config_test.go @@ -8,8 +8,10 @@ import ( "errors" "testing" + "github.com/google/go-cmp/cmp" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "google.golang.org/protobuf/testing/protocmp" "google.golang.org/protobuf/types/known/structpb" "github.com/elastic/elastic-agent-client/v7/pkg/proto" @@ -197,7 +199,12 @@ func TestExpectedConfig(t *testing.T) { assert.Equal(t, err.Error(), scenario.Err.Error()) } else { require.NoError(t, err) - assert.EqualValues(t, scenario.Expected, observed) + // protocmp.Transform ensures we do not compare any internal + // protobuf fields + if !cmp.Equal(scenario.Expected, observed, protocmp.Transform()) { + t.Errorf("mismatch (-want +got) \n%s", + cmp.Diff(scenario.Expected, observed, protocmp.Transform())) + } } }) } diff --git a/pkg/testing/fixture.go b/pkg/testing/fixture.go index 1d852e50277..3774808efaa 100644 --- a/pkg/testing/fixture.go +++ b/pkg/testing/fixture.go @@ -357,7 +357,14 @@ func (f *Fixture) RunBeat(ctx context.Context) error { // Elastic Agent is stopped. If at any time the Elastic Agent logs an error log and the Fixture is not started // with `WithAllowErrors()` then `Run` will exit early and return the logged error. // -// If no `states` are provided then the Elastic Agent runs until the context is or the timeout specified with WithRunLength is reached. +// If no `states` are provided then the Elastic Agent runs until the context is cancelled. +// +// The Elastic-Agent is started in test mode (--testing-mode); this mode +// expects the initial configuration (full YAML config) via gRPC. +// This configuration should be passed in the State.Configure field. +// +// The `elastic-agent.yml` generated by `Fixture.Configure` is ignored +// when `Run` is called. 
func (f *Fixture) Run(ctx context.Context, states ...State) error { if f.binaryName != "elastic-agent" { return errors.New("Run() can only be used with elastic-agent, use RunBeat()") diff --git a/pkg/testing/tools/estools/elasticsearch.go b/pkg/testing/tools/estools/elasticsearch.go index d6bf69369cd..304e917d7ee 100644 --- a/pkg/testing/tools/estools/elasticsearch.go +++ b/pkg/testing/tools/estools/elasticsearch.go @@ -438,9 +438,9 @@ func CheckForErrorsInLogsWithContext(ctx context.Context, client elastictranspor } -// GetLogsForDatastream returns any logs associated with the datastream -func GetLogsForDatastream(client elastictransport.Interface, index string) (Documents, error) { - return GetLogsForDatastreamWithContext(context.Background(), client, index) +// GetLogsForDataset returns any logs associated with the dataset +func GetLogsForDataset(client elastictransport.Interface, index string) (Documents, error) { + return GetLogsForDatasetWithContext(context.Background(), client, index) } // GetLogsForAgentID returns any logs associated with the agent ID @@ -478,8 +478,8 @@ func GetLogsForAgentID(client elastictransport.Interface, id string) (Documents, return handleDocsResponse(res) } -// GetLogsForDatastreamWithContext returns any logs associated with the datastream -func GetLogsForDatastreamWithContext(ctx context.Context, client elastictransport.Interface, index string) (Documents, error) { +// GetLogsForDatasetWithContext returns any logs associated with the dataset +func GetLogsForDatasetWithContext(ctx context.Context, client elastictransport.Interface, index string) (Documents, error) { indexQuery := map[string]interface{}{ "query": map[string]interface{}{ "match": map[string]interface{}{ @@ -536,6 +536,60 @@ func performQueryForRawQuery(ctx context.Context, queryRaw map[string]interface{ return handleDocsResponse(res) } +// GetLogsForDatastream returns any logs associated with the datastream +func GetLogsForDatastream( + ctx context.Context, + 
client elastictransport.Interface, + dsType, dataset, namespace string) (Documents, error) { + + query := map[string]any{ + "_source": []string{"message"}, + "query": map[string]any{ + "bool": map[string]any{ + "must": []any{ + map[string]any{ + "match": map[string]any{ + "data_stream.dataset": dataset, + }, + }, + map[string]any{ + "match": map[string]any{ + "data_stream.namespace": namespace, + }, + }, + map[string]any{ + "match": map[string]any{ + "data_stream.type": dsType, + }, + }, + }, + }, + }, + } + + var buf bytes.Buffer + if err := json.NewEncoder(&buf).Encode(query); err != nil { + return Documents{}, fmt.Errorf("error creating ES query: %w", err) + } + + es := esapi.New(client) + res, err := es.Search( + es.Search.WithIndex(fmt.Sprintf(".ds-%s*", dsType)), + es.Search.WithExpandWildcards("all"), + es.Search.WithBody(&buf), + es.Search.WithTrackTotalHits(true), + es.Search.WithPretty(), + es.Search.WithContext(ctx), + ) + if err != nil { + return Documents{}, fmt.Errorf("error performing ES search: %w", err) + } + + return handleDocsResponse(res) +} + +// handleDocsResponse converts the esapi.Response into Documents, +// it closes the response.Body after reading func handleDocsResponse(res *esapi.Response) (Documents, error) { resultBuf, err := handleResponseRaw(res) if err != nil { diff --git a/testing/integration/logs_ingestion_test.go b/testing/integration/logs_ingestion_test.go new file mode 100644 index 00000000000..ba9a84673b0 --- /dev/null +++ b/testing/integration/logs_ingestion_test.go @@ -0,0 +1,430 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. 
+ +//go:build integration + +package integration + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "math/rand" + "net/http" + "net/http/httputil" + "os" + "path/filepath" + "regexp" + "strings" + "testing" + "text/template" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/elastic/elastic-agent-libs/kibana" + "github.com/elastic/elastic-agent/pkg/control/v2/client" + atesting "github.com/elastic/elastic-agent/pkg/testing" + "github.com/elastic/elastic-agent/pkg/testing/define" + "github.com/elastic/elastic-agent/pkg/testing/tools" + "github.com/elastic/elastic-agent/pkg/testing/tools/check" + "github.com/elastic/elastic-agent/pkg/testing/tools/estools" + "github.com/elastic/elastic-agent/pkg/testing/tools/fleettools" + "github.com/elastic/elastic-transport-go/v8/elastictransport" +) + +func TestLogIngestionFleetManaged(t *testing.T) { + info := define.Require(t, define.Requirements{ + Stack: &define.Stack{}, + Local: false, + Sudo: true, + }) + ctx := context.Background() + + agentFixture, err := define.NewFixture(t, define.Version()) + require.NoError(t, err) + + // 1. Create a policy in Fleet with monitoring enabled. + // To ensure there are no conflicts with previous test runs against + // the same ESS stack, we add the current time at the end of the policy + // name. This policy does not contain any integration. + t.Log("Enrolling agent in Fleet with a test policy") + createPolicyReq := kibana.AgentPolicy{ + Name: fmt.Sprintf("test-policy-enroll-%d", time.Now().Unix()), + Namespace: info.Namespace, + Description: "test policy for agent enrollment", + MonitoringEnabled: []kibana.MonitoringEnabledOption{ + kibana.MonitoringEnabledLogs, + kibana.MonitoringEnabledMetrics, + }, + AgentFeatures: []map[string]interface{}{ + { + "name": "test_enroll", + "enabled": true, + }, + }, + } + + installOpts := atesting.InstallOpts{ + NonInteractive: true, + Force: true, + } + + // 2. 
Install the Elastic-Agent with the policy that + // was just created. + policy, err := tools.InstallAgentWithPolicy( + ctx, + t, + installOpts, + agentFixture, + info.KibanaClient, + createPolicyReq) + require.NoError(t, err) + t.Logf("created policy: %s", policy.ID) + check.ConnectedToFleet(t, agentFixture, 5*time.Minute) + + t.Run("Monitoring logs are shipped", func(t *testing.T) { + testMonitoringLogsAreShipped(t, ctx, info, agentFixture, policy) + }) + + t.Run("Normal logs with flattened data_stream are shipped", func(t *testing.T) { + testFlattenedDatastreamFleetPolicy(t, ctx, info, agentFixture, policy) + }) +} + +func testMonitoringLogsAreShipped( + t *testing.T, + ctx context.Context, + info *define.Info, + agentFixture *atesting.Fixture, + policy kibana.PolicyResponse, +) { + // Stage 1: Make sure metricbeat logs are populated + t.Log("Making sure metricbeat logs are populated") + docs := findESDocs(t, func() (estools.Documents, error) { + return estools.GetLogsForDataset(info.ESClient, "elastic_agent.metricbeat") + }) + t.Logf("metricbeat: Got %d documents", len(docs.Hits.Hits)) + require.NotZero(t, len(docs.Hits.Hits)) + + // Stage 2: make sure all components are healthy + t.Log("Making sure all components are healthy") + status, err := agentFixture.ExecStatus(ctx) + require.NoError(t, err, + "could not get agent status to verify all components are healthy") + for _, c := range status.Components { + assert.Equalf(t, client.Healthy, client.State(c.State), + "component %s: want %s, got %s", + c.Name, client.Healthy, client.State(c.State)) + } + + // Stage 3: Make sure there are no errors in logs + t.Log("Making sure there are no error logs") + docs = findESDocs(t, func() (estools.Documents, error) { + return estools.CheckForErrorsInLogs(info.ESClient, info.Namespace, []string{ + // acceptable error messages (include reason) + "Error dialing dial tcp 127.0.0.1:9200: connect: connection refused", // beat is running default config before its config gets 
updated + "Global configuration artifact is not available", // Endpoint: failed to load user artifact due to connectivity issues + "Failed to download artifact", + "Failed to initialize artifact", + "Failed to apply initial policy from on disk configuration", + "elastic-agent-client error: rpc error: code = Canceled desc = context canceled", // can happen on restart + }) + }) + t.Logf("errors: Got %d documents", len(docs.Hits.Hits)) + for _, doc := range docs.Hits.Hits { + t.Logf("%#v", doc.Source) + } + require.Empty(t, docs.Hits.Hits) + + // Stage 4: Make sure we have message confirming central management is running + t.Log("Making sure we have message confirming central management is running") + docs = findESDocs(t, func() (estools.Documents, error) { + return estools.FindMatchingLogLines(info.ESClient, info.Namespace, + "Parsed configuration and determined agent is managed by Fleet") + }) + require.NotZero(t, len(docs.Hits.Hits)) + + // Stage 5: verify logs from the monitoring components are not sent to the output + t.Log("Check monitoring logs") + hostname, err := os.Hostname() + if err != nil { + t.Fatalf("could not get hostname to filter Agent: %s", err) + } + + agentID, err := fleettools.GetAgentIDByHostname(info.KibanaClient, policy.ID, hostname) + require.NoError(t, err, "could not get Agent ID by hostname") + t.Logf("Agent ID: %q", agentID) + + // We cannot search for `component.id` because at the moment of writing + // this field is not mapped. There is an issue for that: + // https://github.com/elastic/integrations/issues/6545 + // TODO: use runtime fields while the above issue is not resolved. 
+ + docs = findESDocs(t, func() (estools.Documents, error) { + return estools.GetLogsForAgentID(info.ESClient, agentID) + }) + require.NoError(t, err, "could not get logs from Agent ID: %q, err: %s", + agentID, err) + + monRegExp := regexp.MustCompile(".*-monitoring$") + for i, d := range docs.Hits.Hits { + // Lazy way to navigate a map[string]any: convert to JSON then + // decode into a struct. + jsonData, err := json.Marshal(d.Source) + if err != nil { + t.Fatalf("could not encode document source as JSON: %s", err) + } + + doc := ESDocument{} + if err := json.Unmarshal(jsonData, &doc); err != nil { + t.Fatalf("could not unmarshal document source: %s", err) + } + + if monRegExp.MatchString(doc.Component.ID) { + t.Errorf("[%d] Document on index %q with 'component.id': %q "+ + "and 'elastic_agent.id': %q. 'elastic_agent.id' must not "+ + "end in '-monitoring'\n", + i, d.Index, doc.Component.ID, doc.ElasticAgent.ID) + } + } +} + +func findESDocs(t *testing.T, findFn func() (estools.Documents, error)) estools.Documents { + var docs estools.Documents + require.Eventually( + t, + func() bool { + var err error + docs, err = findFn() + return err == nil + }, + 3*time.Minute, + 15*time.Second, + ) + + return docs +} + +func testFlattenedDatastreamFleetPolicy( + t *testing.T, + ctx context.Context, + info *define.Info, + agentFixture *atesting.Fixture, + policy kibana.PolicyResponse, +) { + dsType := "logs" + dsNamespace := cleanString(fmt.Sprintf("%snamespace%d", t.Name(), rand.Uint64())) + dsDataset := cleanString(fmt.Sprintf("%s-dataset", t.Name())) + numEvents := 60 + + tempDir := t.TempDir() + logFilePath := filepath.Join(tempDir, "log.log") + generateLogFile(t, logFilePath, 2*time.Millisecond, numEvents) + + agentFixture, err := define.NewFixture(t, define.Version()) + if err != nil { + t.Fatalf("could not create new fixture: %s", err) + } + + // 1. 
Prepare a request to add an integration to the policy + tmpl, err := template.New(t.Name() + "custom-log-policy").Parse(policyJSON) + if err != nil { + t.Fatalf("cannot parse template: %s", err) + } + + // The time here ensures there are no conflicts with the integration name + // in Fleet. + agentPolicyBuilder := strings.Builder{} + err = tmpl.Execute(&agentPolicyBuilder, policyVars{ + Name: "Log-Input-" + t.Name() + "-" + time.Now().Format(time.RFC3339), + PolicyID: policy.ID, + LogFilePath: logFilePath, + Namespace: dsNamespace, + Dataset: dsDataset, + }) + if err != nil { + t.Fatalf("could not render template: %s", err) + } + // We keep a copy of the policy for debugging prurposes + agentPolicy := agentPolicyBuilder.String() + + // 2. Call Kibana to create the policy. + // Docs: https://www.elastic.co/guide/en/fleet/current/fleet-api-docs.html#create-integration-policy-api + resp, err := info.KibanaClient.Connection.Send( + http.MethodPost, + "/api/fleet/package_policies", + nil, + nil, + bytes.NewBufferString(agentPolicy)) + if err != nil { + t.Fatalf("could not execute request to Kibana/Fleet: %s", err) + } + if resp.StatusCode != http.StatusOK { + // On error dump the whole request response so we can easily spot + // what went wrong. + t.Errorf("received a non 200-OK when adding package to policy. 
"+ + "Status code: %d", resp.StatusCode) + respDump, err := httputil.DumpResponse(resp, true) + if err != nil { + t.Fatalf("could not dump error response from Kibana: %s", err) + } + // Make debugging as easy as possible + t.Log("================================================================================") + t.Log("Kibana error response:") + t.Log(string(respDump)) + t.Log("================================================================================") + t.Log("Rendered policy:") + t.Log(agentPolicy) + t.Log("================================================================================") + t.FailNow() + } + + require.Eventually( + t, + ensureDocumentsInES(t, ctx, info.ESClient, dsType, dsDataset, dsNamespace, numEvents), + 120*time.Second, + time.Second, + "could not get all expected documents form ES") +} + +// ensureDocumentsInES asserts the documents were ingested into the correct +// datastream +func ensureDocumentsInES( + t *testing.T, + ctx context.Context, + esClient elastictransport.Interface, + dsType, dsDataset, dsNamespace string, + numEvents int, +) func() bool { + + f := func() bool { + t.Helper() + + docs, err := estools.GetLogsForDatastream(ctx, esClient, dsType, dsDataset, dsNamespace) + if err != nil { + t.Logf("error quering ES, will retry later: %s", err) + } + + if docs.Hits.Total.Value == numEvents { + return true + } + + return false + + } + + return f +} + +// generateLogFile generates a log file by appending new lines every tick +// the lines are composed by the test name and the current time in RFC3339Nano +// This function spans a new goroutine and does not block +func generateLogFile(t *testing.T, fullPath string, tick time.Duration, events int) { + t.Helper() + f, err := os.Create(fullPath) + if err != nil { + t.Fatalf("could not create file '%s: %s", fullPath, err) + } + + go func() { + t.Helper() + ticker := time.NewTicker(tick) + t.Cleanup(ticker.Stop) + + done := make(chan struct{}) + t.Cleanup(func() { close(done) }) + + 
defer func() { + if err := f.Close(); err != nil { + t.Errorf("could not close log file '%s': %s", fullPath, err) + } + }() + + i := 0 + for { + select { + case <-done: + return + case now := <-ticker.C: + i++ + _, err := fmt.Fprintln(f, t.Name(), "Iteration: ", i, now.Format(time.RFC3339Nano)) + if err != nil { + // The Go compiler does not allow me to call t.Fatalf from a non-test + // goroutine, t.Errorf is our only option + t.Errorf("could not write data to log file '%s': %s", fullPath, err) + return + } + // make sure log lines are synced as quickly as possible + if err := f.Sync(); err != nil { + t.Errorf("could not sync file '%s': %s", fullPath, err) + } + if i == events { + return + } + } + } + }() +} + +func cleanString(s string) string { + return nonAlphanumericRegex.ReplaceAllString(strings.ToLower(s), "") +} + +var nonAlphanumericRegex = regexp.MustCompile(`[^a-zA-Z0-9 ]+`) + +var policyJSON = ` +{ + "policy_id": "{{.PolicyID}}", + "package": { + "name": "log", + "version": "2.3.0" + }, + "name": "{{.Name}}", + "namespace": "{{.Namespace}}", + "inputs": { + "logs-logfile": { + "enabled": true, + "streams": { + "log.logs": { + "enabled": true, + "vars": { + "paths": [ + "{{.LogFilePath | js}}" {{/* we need to escape windows paths */}} + ], + "data_stream.dataset": "{{.Dataset}}" + } + } + } + } + } +}` + +type policyVars struct { + Name string + PolicyID string + LogFilePath string + Namespace string + Dataset string +} + +type ESDocument struct { + ElasticAgent ElasticAgent `json:"elastic_agent"` + Component Component `json:"component"` + Host Host `json:"host"` +} +type ElasticAgent struct { + ID string `json:"id"` + Version string `json:"version"` + Snapshot bool `json:"snapshot"` +} +type Component struct { + Binary string `json:"binary"` + ID string `json:"id"` +} +type Host struct { + Hostname string `json:"hostname"` +} diff --git a/testing/integration/monitoring_logs_test.go b/testing/integration/monitoring_logs_test.go deleted file mode 100644 
index 5ebce0043de..00000000000 --- a/testing/integration/monitoring_logs_test.go +++ /dev/null @@ -1,213 +0,0 @@ -// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -// or more contributor license agreements. Licensed under the Elastic License; -// you may not use this file except in compliance with the Elastic License. - -//go:build integration - -package integration - -import ( - "context" - "encoding/json" - "fmt" - "os" - "regexp" - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/elastic/elastic-agent-libs/kibana" - "github.com/elastic/elastic-agent/pkg/control/v2/client" - atesting "github.com/elastic/elastic-agent/pkg/testing" - "github.com/elastic/elastic-agent/pkg/testing/define" - "github.com/elastic/elastic-agent/pkg/testing/tools" - "github.com/elastic/elastic-agent/pkg/testing/tools/check" - "github.com/elastic/elastic-agent/pkg/testing/tools/estools" - "github.com/elastic/elastic-agent/pkg/testing/tools/fleettools" -) - -func TestMonitoringLogsShipped(t *testing.T) { - info := define.Require(t, define.Requirements{ - Stack: &define.Stack{}, - Local: false, - Sudo: true, - }) - ctx := context.Background() - - t.Logf("got namespace: %s", info.Namespace) - - agentFixture, err := define.NewFixture(t, define.Version()) - require.NoError(t, err) - - t.Log("Enrolling agent in Fleet with a test policy") - createPolicyReq := kibana.AgentPolicy{ - Name: fmt.Sprintf("test-policy-enroll-%d", time.Now().Unix()), - Namespace: info.Namespace, - Description: "test policy for agent enrollment", - MonitoringEnabled: []kibana.MonitoringEnabledOption{ - kibana.MonitoringEnabledLogs, - kibana.MonitoringEnabledMetrics, - }, - AgentFeatures: []map[string]interface{}{ - { - "name": "test_enroll", - "enabled": true, - }, - }, - } - - // Stage 1: Install - // As part of the cleanup process, we'll uninstall the agent - installOpts := atesting.InstallOpts{ - NonInteractive: true, - 
Force: true, - } - policy, err := tools.InstallAgentWithPolicy(ctx, t, - installOpts, agentFixture, info.KibanaClient, createPolicyReq) - require.NoError(t, err) - t.Logf("created policy: %s", policy.ID) - - check.ConnectedToFleet(t, agentFixture, 5*time.Minute) - - // Stage 2: check indices - // This is mostly for debugging - resp, err := estools.GetAllindicies(info.ESClient) - require.NoError(t, err) - for _, run := range resp { - t.Logf("%s: %d/%d deleted: %d\n", - run.Index, run.DocsCount, run.StoreSizeBytes, run.DocsDeleted) - } - - // Stage 3: Make sure metricbeat logs are populated - t.Log("Making sure metricbeat logs are populated") - docs := findESDocs(t, func() (estools.Documents, error) { - return estools.GetLogsForDatastream(info.ESClient, "elastic_agent.metricbeat") - }) - t.Logf("metricbeat: Got %d documents", len(docs.Hits.Hits)) - require.NotZero(t, len(docs.Hits.Hits)) - - // Stage 4: make sure all components are healthy - t.Log("Making sure all components are healthy") - status, err := agentFixture.ExecStatus(ctx) - require.NoError(t, err, - "could not get agent status to verify all components are healthy") - for _, c := range status.Components { - assert.Equalf(t, client.Healthy, client.State(c.State), - "component %s: want %s, got %s", - c.Name, client.Healthy, client.State(c.State)) - } - - // Stage 5: Make sure there are no errors in logs - t.Log("Making sure there are no error logs") - docs = findESDocs(t, func() (estools.Documents, error) { - return estools.CheckForErrorsInLogs(info.ESClient, info.Namespace, []string{ - // acceptable error messages (include reason) - "Error dialing dial tcp 127.0.0.1:9200: connect: connection refused", // beat is running default config before its config gets updated - "Global configuration artifact is not available", // Endpoint: failed to load user artifact due to connectivity issues - "Failed to download artifact", - "Failed to initialize artifact", - "Failed to apply initial policy from on disk 
configuration", - "elastic-agent-client error: rpc error: code = Canceled desc = context canceled", // can happen on restart - }) - }) - t.Logf("errors: Got %d documents", len(docs.Hits.Hits)) - for _, doc := range docs.Hits.Hits { - t.Logf("%#v", doc.Source) - } - require.Empty(t, docs.Hits.Hits) - - // Stage 6: Make sure we have message confirming central management is running - t.Log("Making sure we have message confirming central management is running") - docs = findESDocs(t, func() (estools.Documents, error) { - return estools.FindMatchingLogLines(info.ESClient, info.Namespace, - "Parsed configuration and determined agent is managed by Fleet") - }) - require.NotZero(t, len(docs.Hits.Hits)) - - // Stage 7: verify logs from the monitoring components are not sent to the output - t.Log("Check monitoring logs") - hostname, err := os.Hostname() - if err != nil { - t.Fatalf("could not get hostname to filter Agent: %s", err) - } - - agentID, err := fleettools.GetAgentIDByHostname(info.KibanaClient, policy.ID, hostname) - require.NoError(t, err, "could not get Agent ID by hostname") - t.Logf("Agent ID: %q", agentID) - - // We cannot search for `component.id` because at the moment of writing - // this field is not mapped. There is an issue for that: - // https://github.com/elastic/integrations/issues/6545 - - docs = findESDocs(t, func() (estools.Documents, error) { - return estools.GetLogsForAgentID(info.ESClient, agentID) - }) - require.NoError(t, err, "could not get logs from Agent ID: %q, err: %s", - agentID, err) - - monRegExp := regexp.MustCompile(".*-monitoring$") - for i, d := range docs.Hits.Hits { - // Lazy way to navigate a map[string]any: convert to JSON then - // decode into a struct. 
- jsonData, err := json.Marshal(d.Source) - if err != nil { - t.Fatalf("could not encode document source as JSON: %s", err) - } - - doc := ESDocument{} - if err := json.Unmarshal(jsonData, &doc); err != nil { - t.Fatalf("could not unmarshal document source: %s", err) - } - - if monRegExp.MatchString(doc.Component.ID) { - t.Errorf("[%d] Document on index %q with 'component.id': %q "+ - "and 'elastic_agent.id': %q. 'elastic_agent.id' must not "+ - "end in '-monitoring'\n", - i, d.Index, doc.Component.ID, doc.ElasticAgent.ID) - } - } -} - -func findESDocs(t *testing.T, findFn func() (estools.Documents, error)) estools.Documents { - var docs estools.Documents - - require.Eventually( - t, - func() bool { - var err error - docs, err = findFn() - return err == nil - }, - 3*time.Minute, - 15*time.Second, - ) - - // TODO: remove after debugging - t.Log("--- debugging: results from ES --- START ---") - for _, doc := range docs.Hits.Hits { - t.Logf("%#v", doc.Source) - } - t.Log("--- debugging: results from ES --- END ---") - - return docs -} - -type ESDocument struct { - ElasticAgent ElasticAgent `json:"elastic_agent"` - Component Component `json:"component"` - Host Host `json:"host"` -} -type ElasticAgent struct { - ID string `json:"id"` - Version string `json:"version"` - Snapshot bool `json:"snapshot"` -} -type Component struct { - Binary string `json:"binary"` - ID string `json:"id"` -} -type Host struct { - Hostname string `json:"hostname"` -} From 72653ac2b0b57deedd276c3abc8d19f9085b3154 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Wed, 8 Nov 2023 19:25:12 +0100 Subject: [PATCH 02/12] [CI]Buildkite k8s tests migration (#3680) * Buildkite k8s tests * Buildkite k8s tests * Added kind setup script * Added kind setup script * Added kind setup script * Added kind setup script * Branch configuration for k8s tests * transformed to matrix build * transformed to matrix build * transformed to matrix build * transformed to matrix build * transformed to matrix build * 
transformed to matrix build * transformed to matrix build * Removed k8s tests from Jenkinsfile * Run k8s tests on every PR * Fix review comments * install kind: moved path update to the top * Moved PATH declaration to the top level script * Removed junit-annotate from k8s tests * Moved k8s tests to upper level in the pipeline --- .buildkite/pipeline.yml | 25 +++++++++++ .buildkite/scripts/install-kind.sh | 45 ++++++++++++++++++++ .buildkite/scripts/install-kubectl.sh | 45 ++++++++++++++++++++ .buildkite/scripts/steps/k8s-tests.sh | 30 +++++++++++++ .buildkite/scripts/steps/unit-tests.sh | 6 ++- .ci/Jenkinsfile | 58 +------------------------- 6 files changed, 151 insertions(+), 58 deletions(-) create mode 100644 .buildkite/scripts/install-kind.sh create mode 100644 .buildkite/scripts/install-kubectl.sh create mode 100644 .buildkite/scripts/steps/k8s-tests.sh diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index cbd0ae35884..fc1ff11871a 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -122,6 +122,31 @@ steps: - unit-tests-macos-13-arm allow_dependency_failure: true + - group: "K8s tests" + key: "k8s-tests" + steps: + - label: "K8s tests: {{matrix.k8s_version}}" + env: + K8S_VERSION: "v{{matrix.k8s_version}}" + KIND_VERSION: "v0.20.0" + command: ".buildkite/scripts/steps/k8s-tests.sh" + artifact_paths: + - "build/TEST-**" + - "build/diagnostics/*" + - "coverage.out" + agents: + provider: "gcp" + image: "family/core-ubuntu-2204" + matrix: + setup: + k8s_version: + - "1.28.0" + - "1.27.3" + - "1.26.6" + retry: + manual: + allowed: true + - label: ":sonarqube: Continuous Code Inspection" env: VAULT_SONAR_TOKEN_PATH: "kv/ci-shared/platform-ingest/elastic/elastic-agent/sonar-analyze-token" diff --git a/.buildkite/scripts/install-kind.sh b/.buildkite/scripts/install-kind.sh new file mode 100644 index 00000000000..171480d7685 --- /dev/null +++ b/.buildkite/scripts/install-kind.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +set -exuo pipefail + 
+echo "--- Install Kind" + +MSG="environment variable missing." +DEFAULT_HOME="/usr/local" +KIND_VERSION=${KIND_VERSION:?$MSG} +HOME=${HOME:-$DEFAULT_HOME} +KIND_CMD="${HOME}/bin/kind" + +if command -v kind +then + set +e + echo "Found Kind. Checking version.." + FOUND_KIND_VERSION=$(kind --version 2>&1 >/dev/null | awk '{print $3}') + if [ "$FOUND_KIND_VERSION" == "$KIND_VERSION" ] + then + echo "Versions match. No need to install Kind. Exiting." + exit 0 + fi + set -e +fi + +echo "Installing Kind" + +OS=$(uname -s| tr '[:upper:]' '[:lower:]') +ARCH=$(uname -m| tr '[:upper:]' '[:lower:]') +if [ "${ARCH}" == "aarch64" ] ; then + ARCH_SUFFIX=arm64 +else + ARCH_SUFFIX=amd64 +fi + +mkdir -p "${HOME}/bin" + +if curl -sSLo "${KIND_CMD}" "https://github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION}/kind-${OS}-${ARCH_SUFFIX}" ; then + chmod +x "${KIND_CMD}" +else + echo "Something bad with the download, let's delete the corrupted binary" + if [ -e "${KIND_CMD}" ] ; then + rm "${KIND_CMD}" + fi + exit 1 +fi diff --git a/.buildkite/scripts/install-kubectl.sh b/.buildkite/scripts/install-kubectl.sh new file mode 100644 index 00000000000..fc558448c6c --- /dev/null +++ b/.buildkite/scripts/install-kubectl.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +set -euo pipefail + +echo "--- Install kubectl" + +MSG="parameter missing." +DEFAULT_HOME="/usr/local" +K8S_VERSION=${K8S_VERSION:?$MSG} +HOME=${HOME:-$DEFAULT_HOME} +KUBECTL_CMD="${HOME}/bin/kubectl" + +if command -v kubectl +then + set +e + echo "Found kubectl. Checking version.." + FOUND_KUBECTL_VERSION=$(kubectl version --client --short 2>&1 >/dev/null | awk '{print $3}') + if [ "${FOUND_KUBECTL_VERSION}" == "${K8S_VERSION}" ] + then + echo "Versions match. No need to install kubectl. Exiting." 
+ exit 0 + fi + set -e +fi + +echo "Installing kubectl" + +mkdir -p "${HOME}/bin" + +OS=$(uname -s| tr '[:upper:]' '[:lower:]') +ARCH=$(uname -m| tr '[:upper:]' '[:lower:]') +if [ "${ARCH}" == "aarch64" ] ; then + ARCH_SUFFIX=arm64 +else + ARCH_SUFFIX=amd64 +fi + +if curl -sSLo "${KUBECTL_CMD}" "https://storage.googleapis.com/kubernetes-release/release/${K8S_VERSION}/bin/${OS}/${ARCH_SUFFIX}/kubectl" ; then + chmod +x "${KUBECTL_CMD}" +else + echo "Something bad with the download, let's delete the corrupted binary" + if [ -e "${KUBECTL_CMD}" ] ; then + rm "${KUBECTL_CMD}" + fi + exit 1 +fi diff --git a/.buildkite/scripts/steps/k8s-tests.sh b/.buildkite/scripts/steps/k8s-tests.sh new file mode 100644 index 00000000000..bb9affe2edc --- /dev/null +++ b/.buildkite/scripts/steps/k8s-tests.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -euo pipefail + +export PATH=$HOME/bin:${PATH} +source .buildkite/scripts/install-kubectl.sh +source .buildkite/scripts/install-kind.sh + +kind create cluster --image "kindest/node:${K8S_VERSION}" --config - < Date: Thu, 9 Nov 2023 17:45:22 +0100 Subject: [PATCH 03/12] Changelog for 8.11.0 (#3722) (#3727) * Changelog for 8.11.0 * Remove an entry in the changelog (cherry picked from commit 03ef9d8d6a894af61a77d78983a735d263b414ac) # Conflicts: # changelog/fragments/1697229987-Upgrade-to-Go-1.20.10.yaml # changelog/fragments/1698247934-hb-bump-node.yaml Co-authored-by: Pierre HILBERT --- changelog/8.11.0.asciidoc | 107 ++++++++++ changelog/8.11.0.yaml | 182 ++++++++++++++++++ ...cessor-support-for-hints-autodiscover.yaml | 32 --- .../1693427183-install-progress.yaml | 32 --- .../1693813219-log-command-colors.yaml | 31 --- .../1693920861-netinfo-manifests.yaml | 32 --- ...and-kills-any-running-watcher-process.yaml | 32 --- .../1694439479-fix_creatork8sscript.yaml | 32 --- ...534-standalone-prevent-quick-upgrades.yaml | 32 --- ...ture-flag-by-default-for-Agent-8.11.0.yaml | 32 --- ...92-Prevent-multiple-stops-of-services.yaml | 32 --- 
.../1696319263-add-assetbeat-dependency.yaml | 32 --- .../1696361138-es-default-compression.yaml | 32 --- ...0758-bugfix-upgrade-progress-reporter.yaml | 32 --- ...nt-monitoring-metrics-interval-to-60s.yaml | 32 --- ...-upgrade-elastic-agent-libs-to-v0.6.0.yaml | 32 --- ...restarts-on-failure-to-15s-on-Windows.yaml | 32 --- 17 files changed, 289 insertions(+), 479 deletions(-) create mode 100644 changelog/8.11.0.asciidoc create mode 100644 changelog/8.11.0.yaml delete mode 100644 changelog/fragments/1691060806-Processor-support-for-hints-autodiscover.yaml delete mode 100644 changelog/fragments/1693427183-install-progress.yaml delete mode 100644 changelog/fragments/1693813219-log-command-colors.yaml delete mode 100644 changelog/fragments/1693920861-netinfo-manifests.yaml delete mode 100644 changelog/fragments/1694187216-Uninstall-finds-and-kills-any-running-watcher-process.yaml delete mode 100644 changelog/fragments/1694439479-fix_creatork8sscript.yaml delete mode 100644 changelog/fragments/1695685534-standalone-prevent-quick-upgrades.yaml delete mode 100644 changelog/fragments/1695780865-Enable-tamper-protection-feature-flag-by-default-for-Agent-8.11.0.yaml delete mode 100644 changelog/fragments/1695920792-Prevent-multiple-stops-of-services.yaml delete mode 100644 changelog/fragments/1696319263-add-assetbeat-dependency.yaml delete mode 100644 changelog/fragments/1696361138-es-default-compression.yaml delete mode 100644 changelog/fragments/1696530758-bugfix-upgrade-progress-reporter.yaml delete mode 100644 changelog/fragments/1696955150-Slow-down-agent-monitoring-metrics-interval-to-60s.yaml delete mode 100644 changelog/fragments/1697662109-upgrade-elastic-agent-libs-to-v0.6.0.yaml delete mode 100644 changelog/fragments/1698259940-Increase-wait-period-between-service-restarts-on-failure-to-15s-on-Windows.yaml diff --git a/changelog/8.11.0.asciidoc b/changelog/8.11.0.asciidoc new file mode 100644 index 00000000000..30f534fe478 --- /dev/null +++ 
b/changelog/8.11.0.asciidoc @@ -0,0 +1,107 @@ +// begin 8.11.0 relnotes + +[[release-notes-8.11.0]] +== 8.11.0 + +Review important information about the 8.11.0 release. + +[discrete] +[[security-updates-8.11.0]] +=== Security updates + + +elastic-agent:: + +* Upgrade To Go 1.20.10. {elastic-agent-pull}https://github.com/elastic/elastic-agent/pull/3601[#https://github.com/elastic/elastic-agent/pull/3601] + +[discrete] +[[breaking-changes-8.11.0]] +=== Breaking changes + +Breaking changes can prevent your application from optimal operation and +performance. Before you upgrade, review the breaking changes, then mitigate the +impact to your application. + +// TODO: add details and impact + +all:: + +[discrete] +[[breaking-https://github.com/elastic/beats/pull/36681]] +.Enable Compression By Default For Elasticsearch Outputs. {all-pull}https://github.com/elastic/beats/pull/36681[#https://github.com/elastic/beats/pull/36681] {all-issue}https://github.com/elastic/ingest-dev/issues/2458[#https://github.com/elastic/ingest-dev/issues/2458] +[%collapsible] +==== +The default compression level for Elasticsearch outputs is changing from 0 to 1. On typical workloads this is expected to decrease network data volume by 70-80%, while increasing cpu use by 20-25% and ingestion time by 10%. The previous behavior can be restored by adding 'compression_level: 0' to the output configuration. +==== +elastic-agent:: + +[discrete] +[[breaking-https://github.com/elastic/elastic-agent/pull/3593]] +.Elastic-Agent-Autodiscover To V0.6.4. Disables Metadata For Deployment And Cronjob. {elastic-agent-pull}https://github.com/elastic/elastic-agent/pull/3593[#https://github.com/elastic/elastic-agent/pull/3593] +[%collapsible] +==== +Elastic-agent-autodiscover library by default comes with add_resource_metadata.deployment=false and add_resource_metadata.cronjob=false. 
Pods that will be created from deployments or cronjobs will not have the extra metadata field for kubernetes.deployment or kubernetes.cronjob respectively. +==== + + + + + +[discrete] +[[new-features-8.11.0]] +=== New features + +The 8.11.0 release adds the following new and notable features. + + + + +* Enable Tamper Protection Feature Flag By Default For Elastic Agent Version 8.11.0. {-pull}https://github.com/elastic/elastic-agent/pull/3478[#https://github.com/elastic/elastic-agent/pull/3478] +CLI:: + +* Add Colors To Agent Messages Printed By The Elastic-Agent Logs Command Based On Their Level. {CLI-pull}https://github.com/elastic/elastic-agent/pull/3345[#https://github.com/elastic/elastic-agent/pull/3345] +elastic-agent:: + +* Add Support For Processors In Hints-Based Kubernetes Autodiscover. {elastic-agent-pull}https://github.com/elastic/elastic-agent/pull/3107[#https://github.com/elastic/elastic-agent/pull/3107] {elastic-agent-issue}https://github.com/elastic/elastic-agent/issues/2959[#https://github.com/elastic/elastic-agent/issues/2959] +* Print Out Elastic Agent Installation Steps To Show Progress. {elastic-agent-pull}https://github.com/elastic/elastic-agent/pull/3338[#https://github.com/elastic/elastic-agent/pull/3338] + + +[discrete] +[[enhancements-8.11.0]] +=== Enhancements + + + + +* Fix The Kubernetes `Deploy/Kubernetes/Creator_k8.sh` Script To Correctly Exclude Configmaps. {-pull}https://github.com/elastic/elastic-agent/pull/3396[#https://github.com/elastic/elastic-agent/pull/3396] +elastic-agent:: + +* Support The Netinfo Variable In Elastic Kubernetes Manifests. Setting A New Environmental Variable `Elastic_netinfo=False` Globally Disables The `Netinfo.enabled` Parameter Of The `Add_host_metadata` Processor. This Disables The Indexing Of `Host.ip` And `Host.mac` Fields. 
{elastic-agent-pull}https://github.com/elastic/elastic-agent/pull/3354[#https://github.com/elastic/elastic-agent/pull/3354] +monitoring:: + +* Increase Agent Monitoring Metrics Interval From 10S To 60S To Reduce The Default Ingestion Load And Long Term Storage Requirements. {monitoring-pull}https://github.com/elastic/elastic-agent/pull/3578[#https://github.com/elastic/elastic-agent/pull/3578] + + + + +[discrete] +[[bug-fixes-8.11.0]] +=== Bug fixes + + + + +* The Elastic Agent Uninstall Process Now Finds And Kills Any Running Upgrade Watcher Process. Uninstalls Initiated Within 10 Minutes Of A Previous Upgrade Now Work As Expected. {-pull}https://github.com/elastic/elastic-agent/pull/3384[#https://github.com/elastic/elastic-agent/pull/3384] {-issue}https://github.com/elastic/elastic-agent/issues/3371[#https://github.com/elastic/elastic-agent/issues/3371] +agent:: + +* Upgrade `Elastic-Agent-Libs` To V0.6.0 To Fix The Agent Windows Service Becoming Unresponsive. Fixes Windows Service Timeouts During Wmi Queries And During Service Shutdown. {agent-pull}https://github.com/elastic/elastic-agent/pull/3632[#https://github.com/elastic/elastic-agent/pull/3632] +elastic-agent:: + +* Prevent A Standalone Elastic Agent From Being Upgraded If An Upgrade Is Already In Progress. {elastic-agent-pull}https://github.com/elastic/elastic-agent/pull/3473[#https://github.com/elastic/elastic-agent/pull/3473] {elastic-agent-issue}https://github.com/elastic/elastic-agent/issues/2706[#https://github.com/elastic/elastic-agent/issues/2706] +* Fix A Bug That Affected Reporting Progress Of The Agent Artifact Download During An Upgrade. {elastic-agent-pull}https://github.com/elastic/elastic-agent/pull/3548[#https://github.com/elastic/elastic-agent/pull/3548] +* Increase Wait Period Between Service Restarts On Failure To 15S On Windows. 
{elastic-agent-pull}https://github.com/elastic/elastic-agent/pull/3657[#https://github.com/elastic/elastic-agent/pull/3657] +runtime:: + +* Prevent Multiple Attempts To Stop An Already Stopped Service. {runtime-pull}https://github.com/elastic/elastic-agent/pull/3482[#https://github.com/elastic/elastic-agent/pull/3482] + +// end 8.11.0 relnotes diff --git a/changelog/8.11.0.yaml b/changelog/8.11.0.yaml new file mode 100644 index 00000000000..b20c71f12e7 --- /dev/null +++ b/changelog/8.11.0.yaml @@ -0,0 +1,182 @@ +version: 8.11.0 +entries: + - kind: feature + summary: Add support for processors in hints-based Kubernetes autodiscover. + description: "" + component: elastic-agent + pr: + - https://github.com/elastic/elastic-agent/pull/3107 + issue: + - https://github.com/elastic/elastic-agent/issues/2959 + timestamp: 1691060806 + file: + name: 1691060806-Processor-support-for-hints-autodiscover.yaml + checksum: c55b54ba6747d5ac659649fd4e4a16e941364f8e + - kind: feature + summary: Print out Elastic Agent installation steps to show progress. + description: "" + component: elastic-agent + pr: + - https://github.com/elastic/elastic-agent/pull/3338 + issue: [] + timestamp: 1693427183 + file: + name: 1693427183-install-progress.yaml + checksum: fb4c231a3a23abd6be758a4342484d10ce73aa6c + - kind: feature + summary: Add colors to Agent messages printed by the elastic-agent logs command based on their level. + description: "" + component: CLI + pr: + - https://github.com/elastic/elastic-agent/pull/3345 + issue: [] + timestamp: 1693813219 + file: + name: 1693813219-log-command-colors.yaml + checksum: 53b5a44c3799fa9888ad7beba34c2858bca7b3e2 + - kind: enhancement + summary: Support the NETINFO variable in Elastic Kubernetes manifests. Setting a new environmental variable `ELASTIC_NETINFO=false` globally disables the `netinfo.enabled` parameter of the `add_host_metadata` processor. This disables the indexing of `host.ip` and `host.mac` fields. 
+ description: "" + component: elastic-agent + pr: + - https://github.com/elastic/elastic-agent/pull/3354 + issue: [] + timestamp: 1693920861 + file: + name: 1693920861-netinfo-manifests.yaml + checksum: e9e5a459e64a189b7b74f5689c57d41d4fbf442c + - kind: bug-fix + summary: The Elastic Agent uninstall process now finds and kills any running upgrade Watcher process. Uninstalls initiated within 10 minutes of a previous upgrade now work as expected. + description: "" + component: "" + pr: + - https://github.com/elastic/elastic-agent/pull/3384 + issue: + - https://github.com/elastic/elastic-agent/issues/3371 + timestamp: 1694187216 + file: + name: 1694187216-Uninstall-finds-and-kills-any-running-watcher-process.yaml + checksum: 31df28d9490d0c844b88d48f1aa3504cce2365ee + - kind: enhancement + summary: Fix the Kubernetes `deploy/kubernetes/creator_k8.sh` script to correctly exclude configmaps. + description: "" + component: "" + pr: + - https://github.com/elastic/elastic-agent/pull/3396 + issue: [] + timestamp: 1694439479 + file: + name: 1694439479-fix_creatork8sscript.yaml + checksum: c65f7e65ebac64f9332a407f2449c88e30bd4239 + - kind: bug-fix + summary: Prevent a standalone Elastic Agent from being upgraded if an upgrade is already in progress. + description: "" + component: elastic-agent + pr: + - https://github.com/elastic/elastic-agent/pull/3473 + issue: + - https://github.com/elastic/elastic-agent/issues/2706 + timestamp: 1695685534 + file: + name: 1695685534-standalone-prevent-quick-upgrades.yaml + checksum: 9df813abbdf5ae1558b16b1052a5bbc238cf7ae1 + - kind: feature + summary: Enable tamper protection feature flag by default for Elastic Agent version 8.11.0. 
+ description: "" + component: "" + pr: + - https://github.com/elastic/elastic-agent/pull/3478 + issue: [] + timestamp: 1695780865 + file: + name: 1695780865-Enable-tamper-protection-feature-flag-by-default-for-Agent-8.11.0.yaml + checksum: 46eac90f4af827db837fc7d19c1131328b011952 + - kind: bug-fix + summary: Prevent multiple attempts to stop an already stopped service. + description: "" + component: runtime + pr: + - https://github.com/elastic/elastic-agent/pull/3482 + issue: [] + timestamp: 1695920792 + file: + name: 1695920792-Prevent-multiple-stops-of-services.yaml + checksum: 8a0b54855b27c98e6af3d60df1c9cb04f935efee + - kind: breaking-change + summary: Enable compression by default for Elasticsearch outputs + description: 'The default compression level for Elasticsearch outputs is changing from 0 to 1. On typical workloads this is expected to decrease network data volume by 70-80%, while increasing cpu use by 20-25% and ingestion time by 10%. The previous behavior can be restored by adding ''compression_level: 0'' to the output configuration.' + component: all + pr: + - https://github.com/elastic/beats/pull/36681 + issue: + - https://github.com/elastic/ingest-dev/issues/2458 + timestamp: 1696361138 + file: + name: 1696361138-es-default-compression.yaml + checksum: f3bee390b1af6bc5049315e25e5c14d0ff23240e + - kind: bug-fix + summary: Fix a bug that affected reporting progress of the Agent artifact download during an upgrade. + description: "" + component: elastic-agent + pr: + - https://github.com/elastic/elastic-agent/pull/3548 + issue: [] + timestamp: 1696530758 + file: + name: 1696530758-bugfix-upgrade-progress-reporter.yaml + checksum: a168f2ab2a26e7813bb230b5f663ecbeca9d624c + - kind: enhancement + summary: Increase Agent monitoring metrics interval from 10s to 60s to reduce the default ingestion load and long term storage requirements. 
+ description: "" + component: monitoring + pr: + - https://github.com/elastic/elastic-agent/pull/3578 + issue: [] + timestamp: 1696955150 + file: + name: 1696955150-Slow-down-agent-monitoring-metrics-interval-to-60s.yaml + checksum: ead85ab471d5a6f609c367d57817d1c2299d93f2 + - kind: breaking-change + summary: Elastic-agent-autodiscover to v0.6.4. Disables metadata for deployment and cronjob + description: Elastic-agent-autodiscover library by default comes with add_resource_metadata.deployment=false and add_resource_metadata.cronjob=false. Pods that will be created from deployments or cronjobs will not have the extra metadata field for kubernetes.deployment or kubernetes.cronjob respectively. + component: elastic-agent + pr: + - https://github.com/elastic/elastic-agent/pull/3593 + issue: [] + timestamp: 1697103197 + file: + name: 1697103197-updating_agentautodiscovery_811.yaml + checksum: b937c2a635860c3497502b018b8b997143ba5fc3 + - kind: security + summary: Upgrade to Go 1.20.10. + description: "" + component: elastic-agent + pr: + - https://github.com/elastic/elastic-agent/pull/3601 + issue: [] + timestamp: 1697229987 + file: + name: 1697229987-Upgrade-to-Go-1.20.10.yaml + checksum: e31fa72a840bb0616000830d86a3b18f1a22aae6 + - kind: bug-fix + summary: Upgrade `elastic-agent-libs` to v0.6.0 to fix the Agent Windows service becoming unresponsive. Fixes Windows service timeouts during WMI queries and during service shutdown. + description: "" + component: agent + pr: + - https://github.com/elastic/elastic-agent/pull/3632 + issue: [] + timestamp: 1697662109 + file: + name: 1697662109-upgrade-elastic-agent-libs-to-v0.6.0.yaml + checksum: 70d3e8469fce838d73781c409d9dd7592adbddc8 + - kind: bug-fix + summary: Increase wait period between service restarts on failure to 15s on Windows. + description: This is the same value used by other Elastic windows services like endpoint-security. 
+ component: elastic-agent + pr: + - https://github.com/elastic/elastic-agent/pull/3657 + issue: [] + timestamp: 1698259940 + file: + name: 1698259940-Increase-wait-period-between-service-restarts-on-failure-to-15s-on-Windows.yaml + checksum: 12985bc12758581d4f236ebf0928fde307ffceca diff --git a/changelog/fragments/1691060806-Processor-support-for-hints-autodiscover.yaml b/changelog/fragments/1691060806-Processor-support-for-hints-autodiscover.yaml deleted file mode 100644 index 0bed8190838..00000000000 --- a/changelog/fragments/1691060806-Processor-support-for-hints-autodiscover.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: feature - -# Change summary; a 80ish characters long description of the change. -summary: Add support for processors in hints' based k8s autodiscover - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. -#description: - -# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. -component: elastic-agent - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. 
-# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -pr: https://github.com/elastic/elastic-agent/pull/3107 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. -issue: https://github.com/elastic/elastic-agent/issues/2959 diff --git a/changelog/fragments/1693427183-install-progress.yaml b/changelog/fragments/1693427183-install-progress.yaml deleted file mode 100644 index 1725e9e40f1..00000000000 --- a/changelog/fragments/1693427183-install-progress.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: feature - -# Change summary; a 80ish characters long description of the change. -summary: Print out Elastic Agent installation steps to show progress - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. -#description: - -# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. 
-component: elastic-agent - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. -# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -pr: https://github.com/elastic/elastic-agent/pull/3338 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. -#issue: https://github.com/owner/repo/1234 diff --git a/changelog/fragments/1693813219-log-command-colors.yaml b/changelog/fragments/1693813219-log-command-colors.yaml deleted file mode 100644 index cdd5e34a369..00000000000 --- a/changelog/fragments/1693813219-log-command-colors.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: feature - -# Change summary; a 80ish characters long description of the change. -summary: Add colors for log messages based on their level - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. 
-#description: - -# Affected component; a word indicating the component this changeset affects. -component: CLI -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. -# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -#pr: https://github.com/owner/repo/1234 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. -#issue: https://github.com/owner/repo/1234 diff --git a/changelog/fragments/1693920861-netinfo-manifests.yaml b/changelog/fragments/1693920861-netinfo-manifests.yaml deleted file mode 100644 index 9c167de170a..00000000000 --- a/changelog/fragments/1693920861-netinfo-manifests.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: enhancement - -# Change summary; a 80ish characters long description of the change. -summary: Setting a new environmental variable ELASTIC_NETINFO=false globally disables the netinfo.enabled parameter of add_host_metadata processor. This disables the indexing of host.ip and host.mac fields. 
- -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. -#description: - -# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. -component: elastic-agent - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. -# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -#pr: https://github.com/owner/repo/1234 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. -#issue: https://github.com/owner/repo/1234 diff --git a/changelog/fragments/1694187216-Uninstall-finds-and-kills-any-running-watcher-process.yaml b/changelog/fragments/1694187216-Uninstall-finds-and-kills-any-running-watcher-process.yaml deleted file mode 100644 index 057cc06d33e..00000000000 --- a/changelog/fragments/1694187216-Uninstall-finds-and-kills-any-running-watcher-process.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. 
-# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: bug-fix - -# Change summary; a 80ish characters long description of the change. -summary: Uninstall finds and kills any running watcher process - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. -#description: - -# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. -component: - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. -# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -pr: https://github.com/elastic/elastic-agent/pull/3384 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. 
-issue: https://github.com/elastic/elastic-agent/issues/3371 diff --git a/changelog/fragments/1694439479-fix_creatork8sscript.yaml b/changelog/fragments/1694439479-fix_creatork8sscript.yaml deleted file mode 100644 index 7f7a9d1fd95..00000000000 --- a/changelog/fragments/1694439479-fix_creatork8sscript.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: bug - -# Change summary; a 80ish characters long description of the change. -summary: Fixing deploy/kubernetes/creator_k8.sh script to correcly exclude configmaps - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. -#description: - -# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. -component: - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. -# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. 
-#pr: https://github.com/owner/repo/1234 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. -#issue: https://github.com/owner/repo/1234 diff --git a/changelog/fragments/1695685534-standalone-prevent-quick-upgrades.yaml b/changelog/fragments/1695685534-standalone-prevent-quick-upgrades.yaml deleted file mode 100644 index fffa59953be..00000000000 --- a/changelog/fragments/1695685534-standalone-prevent-quick-upgrades.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: bug-fix - -# Change summary; a 80ish characters long description of the change. -summary: Prevent a standalone Elastic Agent from being upgraded if an upgrade is already in progress. - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. -#description: - -# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. -component: elastic-agent - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. 
-# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -pr: https://github.com/elastic/elastic-agent/pull/3473 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. -issue: https://github.com/elastic/elastic-agent/issues/2706 diff --git a/changelog/fragments/1695780865-Enable-tamper-protection-feature-flag-by-default-for-Agent-8.11.0.yaml b/changelog/fragments/1695780865-Enable-tamper-protection-feature-flag-by-default-for-Agent-8.11.0.yaml deleted file mode 100644 index 75a33566b4b..00000000000 --- a/changelog/fragments/1695780865-Enable-tamper-protection-feature-flag-by-default-for-Agent-8.11.0.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: feature - -# Change summary; a 80ish characters long description of the change. -summary: Enable tamper protection feature flag by default for Agent 8.11.0 - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. 
-#description: - -# Affected component; a word indicating the component this changeset affects. -component: - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. -# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -pr: https://github.com/elastic/elastic-agent/pull/3478 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. -#issue: https://github.com/owner/repo/1234 diff --git a/changelog/fragments/1695920792-Prevent-multiple-stops-of-services.yaml b/changelog/fragments/1695920792-Prevent-multiple-stops-of-services.yaml deleted file mode 100644 index e15f5d6e927..00000000000 --- a/changelog/fragments/1695920792-Prevent-multiple-stops-of-services.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: bug-fix - -# Change summary; a 80ish characters long description of the change. 
-summary: Prevent multiple attempts to stop an already stopped service - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. -#description: - -# Affected component; a word indicating the component this changeset affects. -component: runtime - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. -# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -#pr: https://github.com/owner/repo/1234 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. -#issue: https://github.com/owner/repo/1234 diff --git a/changelog/fragments/1696319263-add-assetbeat-dependency.yaml b/changelog/fragments/1696319263-add-assetbeat-dependency.yaml deleted file mode 100644 index ab428367953..00000000000 --- a/changelog/fragments/1696319263-add-assetbeat-dependency.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. 
-# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: feature - -# Change summary; a 80ish characters long description of the change. -summary: Add assetbeat among the external dependencies needed to package Elastic Agent - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. -#description: - -# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. -component: elastic-agent - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. -# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -#pr: https://github.com/owner/repo/1234 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. 
-issue: https://github.com/elastic/obs-infraobs-team/issues/1114 diff --git a/changelog/fragments/1696361138-es-default-compression.yaml b/changelog/fragments/1696361138-es-default-compression.yaml deleted file mode 100644 index 2323467e47f..00000000000 --- a/changelog/fragments/1696361138-es-default-compression.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: breaking-change - -# Change summary; a 80ish characters long description of the change. -summary: Enable compression by default for Elasticsearch outputs - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. -description: "The default compression level for Elasticsearch outputs is changing from 0 to 1. On typical workloads this is expected to decrease network data volume by 70-80%, while increasing cpu use by 20-25% and ingestion time by 10%. The previous behavior can be restored by adding 'compression_level: 0' to the output configuration." - -# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. -component: all - -# PR URL; optional; the PR number that added the changeset. 
-# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. -# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -pr: https://github.com/elastic/beats/pull/36681 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. -issue: https://github.com/elastic/ingest-dev/issues/2458 diff --git a/changelog/fragments/1696530758-bugfix-upgrade-progress-reporter.yaml b/changelog/fragments/1696530758-bugfix-upgrade-progress-reporter.yaml deleted file mode 100644 index d39f4fa2f41..00000000000 --- a/changelog/fragments/1696530758-bugfix-upgrade-progress-reporter.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: bug-fix - -# Change summary; a 80ish characters long description of the change. -summary: Periodically report progress of Elastic Agent artifact download during upgrade - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. 
-#description: - -# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. -component: elastic-agent - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. -# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -pr: https://github.com/elastic/elastic-agent/pull/3548 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. -#issue: https://github.com/owner/repo/1234 diff --git a/changelog/fragments/1696955150-Slow-down-agent-monitoring-metrics-interval-to-60s.yaml b/changelog/fragments/1696955150-Slow-down-agent-monitoring-metrics-interval-to-60s.yaml deleted file mode 100644 index bf86933d97e..00000000000 --- a/changelog/fragments/1696955150-Slow-down-agent-monitoring-metrics-interval-to-60s.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: enhancement - -# Change summary; a 80ish characters long description of the change. 
-summary: Increase agent monitoring metrics interval from 10s to 60s to reduce load - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. -#description: - -# Affected component; a word indicating the component this changeset affects. -component: monitoring - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. -# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -#pr: https://github.com/owner/repo/1234 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. -#issue: https://github.com/owner/repo/1234 diff --git a/changelog/fragments/1697662109-upgrade-elastic-agent-libs-to-v0.6.0.yaml b/changelog/fragments/1697662109-upgrade-elastic-agent-libs-to-v0.6.0.yaml deleted file mode 100644 index 192434f3b5c..00000000000 --- a/changelog/fragments/1697662109-upgrade-elastic-agent-libs-to-v0.6.0.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. 
-# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: bug-fix - -# Change summary; a 80ish characters long description of the change. -summary: upgrade elastic-agent-libs to v0.6.0 - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. -description: 'allows elastic-agent running as a windows service to receive more than one change request.' - -# Affected component; a word indicating the component this changeset affects. -component: agent - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. -# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -#pr: https://github.com/owner/repo/1234 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. 
-#issue: https://github.com/owner/repo/1234 diff --git a/changelog/fragments/1698259940-Increase-wait-period-between-service-restarts-on-failure-to-15s-on-Windows.yaml b/changelog/fragments/1698259940-Increase-wait-period-between-service-restarts-on-failure-to-15s-on-Windows.yaml deleted file mode 100644 index d8c46d8e4dc..00000000000 --- a/changelog/fragments/1698259940-Increase-wait-period-between-service-restarts-on-failure-to-15s-on-Windows.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: bug-fix - -# Change summary; a 80ish characters long description of the change. -summary: Increase wait period between service restarts on failure to 15s on Windows. - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. -description: This is the same value used by other Elastic windows services like endpoint-security. - -# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. -component: "elastic-agent" - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. 
-# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -#pr: https://github.com/owner/repo/1234 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. -#issue: https://github.com/owner/repo/1234 From f7dcbd74ce0e27d3b46b478755168c66d78dc111 Mon Sep 17 00:00:00 2001 From: Blake Rouse Date: Fri, 10 Nov 2023 10:14:12 -0500 Subject: [PATCH 04/12] [Non-Root] Add support for running without root permissions on Linux (#3598) * Work on non-root. * Only create group and user if not existing. * More work on non-root for darwin. * Fix FindUID. * More fixes. * Fix service. Add linux user commands. * Fix control socket path. * Trim unix://. * Fix permissions on socket path create. * More socket fixes. * Use MkdirAll. * Fix SysProcAttr. * Only Linux. * mage check * Add non-root install integration tests. * Fix imports. * Fix windows build. * Testing fixes. * Fix non-root base path test. * Empty commit. * More non-root fixes. * Fix issue from merge. * Upgrade unpack prevent world permissions. * Add group permissions for logs in non-root mode. * Fix permissions for internal logger. * Fixes from code review. * Change to using a structure. * More code review fixes. * Update to fmt.Errorf. 
--- internal/pkg/agent/application/info/state.go | 6 +- .../pkg/agent/application/info/state_unix.go | 22 ++ .../agent/application/info/state_windows.go | 16 ++ internal/pkg/agent/application/paths/paths.go | 6 + .../agent/application/paths/paths_darwin.go | 6 + .../agent/application/paths/paths_windows.go | 3 + .../pkg/agent/application/upgrade/rollback.go | 28 ++- .../agent/application/upgrade/step_unpack.go | 17 +- internal/pkg/agent/cmd/enroll_cmd.go | 4 +- internal/pkg/agent/cmd/install.go | 22 +- internal/pkg/agent/cmd/install_enroll.go | 24 +++ .../pkg/agent/cmd/install_enroll_windows.go | 18 ++ internal/pkg/agent/cmd/run.go | 3 +- internal/pkg/agent/install/install.go | 107 +++++++--- internal/pkg/agent/install/install_unix.go | 30 +++ internal/pkg/agent/install/install_windows.go | 7 + internal/pkg/agent/install/installed.go | 1 + internal/pkg/agent/install/perms_unix.go | 17 +- internal/pkg/agent/install/perms_windows.go | 18 +- internal/pkg/agent/install/svc.go | 42 +++- internal/pkg/agent/install/uninstall.go | 1 + internal/pkg/agent/install/user.go | 14 ++ internal/pkg/agent/install/user_darwin.go | 179 ++++++++++++++++ internal/pkg/agent/install/user_linux.go | 94 +++++++++ internal/pkg/agent/install/user_windows.go | 34 +++ internal/pkg/agent/vault/vault_windows.go | 6 +- pkg/control/addr.go | 7 +- pkg/control/v2/server/listener.go | 14 +- pkg/control/v2/server/listener_windows.go | 21 +- pkg/core/logger/logger.go | 15 +- pkg/testing/fixture_install.go | 10 +- pkg/utils/perm_unix.go | 14 ++ pkg/utils/perm_windows.go | 22 ++ pkg/utils/root_unix.go | 2 +- .../integration/install_unprivileged_test.go | 198 ++++++++++++++++++ 35 files changed, 934 insertions(+), 94 deletions(-) create mode 100644 internal/pkg/agent/application/info/state_unix.go create mode 100644 internal/pkg/agent/application/info/state_windows.go create mode 100644 internal/pkg/agent/cmd/install_enroll.go create mode 100644 internal/pkg/agent/cmd/install_enroll_windows.go create mode 
100644 internal/pkg/agent/install/user.go create mode 100644 internal/pkg/agent/install/user_darwin.go create mode 100644 internal/pkg/agent/install/user_linux.go create mode 100644 internal/pkg/agent/install/user_windows.go create mode 100644 testing/integration/install_unprivileged_test.go diff --git a/internal/pkg/agent/application/info/state.go b/internal/pkg/agent/application/info/state.go index 4242be6cb54..1f72e04d4fc 100644 --- a/internal/pkg/agent/application/info/state.go +++ b/internal/pkg/agent/application/info/state.go @@ -9,6 +9,7 @@ import ( "path/filepath" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" + "github.com/elastic/elastic-agent/pkg/utils" ) // MarkerFileName is the name of the file that's created by @@ -28,11 +29,10 @@ func RunningInstalled() bool { return true } -func CreateInstallMarker(topPath string) error { +func CreateInstallMarker(topPath string, ownership utils.FileOwner) error { markerFilePath := filepath.Join(topPath, MarkerFileName) if _, err := os.Create(markerFilePath); err != nil { return err } - - return nil + return fixInstallMarkerPermissions(markerFilePath, ownership) } diff --git a/internal/pkg/agent/application/info/state_unix.go b/internal/pkg/agent/application/info/state_unix.go new file mode 100644 index 00000000000..23f091aa4bb --- /dev/null +++ b/internal/pkg/agent/application/info/state_unix.go @@ -0,0 +1,22 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. 
+ +//go:build !windows + +package info + +import ( + "fmt" + "os" + + "github.com/elastic/elastic-agent/pkg/utils" +) + +func fixInstallMarkerPermissions(markerFilePath string, ownership utils.FileOwner) error { + err := os.Chown(markerFilePath, ownership.UID, ownership.GID) + if err != nil { + return fmt.Errorf("failed to chown %d:%d %s: %w", ownership.UID, ownership.GID, markerFilePath, err) + } + return nil +} diff --git a/internal/pkg/agent/application/info/state_windows.go b/internal/pkg/agent/application/info/state_windows.go new file mode 100644 index 00000000000..7997c2d0f7d --- /dev/null +++ b/internal/pkg/agent/application/info/state_windows.go @@ -0,0 +1,16 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +//go:build windows + +package info + +import ( + "github.com/elastic/elastic-agent/pkg/utils" +) + +func fixInstallMarkerPermissions(markerFilePath string, ownership utils.FileOwner) error { + // TODO(blakerouse): Fix the marker permissions on Windows. + return nil +} diff --git a/internal/pkg/agent/application/paths/paths.go b/internal/pkg/agent/application/paths/paths.go index 71089df00ed..af500b46695 100644 --- a/internal/pkg/agent/application/paths/paths.go +++ b/internal/pkg/agent/application/paths/paths.go @@ -17,6 +17,12 @@ const ( // ControlSocketPath is the control socket path used when installed. ControlSocketPath = "unix:///run/elastic-agent.sock" + // ControlSocketUnprivilegedPath is the control socket path used when installed as non-root. + // This must exist inside of a directory in '/run/' because the permissions need to be set + // on that directory during installation time, because once the service is spawned it will not + // have permissions to create the socket in the '/run/' directory. 
+ ControlSocketUnprivilegedPath = "unix:///run/elastic-agent/elastic-agent.sock" + // ShipperSocketPipePattern is the socket path used when installed for a shipper pipe. ShipperSocketPipePattern = "unix:///run/elastic-agent-%s-pipe.sock" diff --git a/internal/pkg/agent/application/paths/paths_darwin.go b/internal/pkg/agent/application/paths/paths_darwin.go index 475f1db5ae2..54c310b8a4d 100644 --- a/internal/pkg/agent/application/paths/paths_darwin.go +++ b/internal/pkg/agent/application/paths/paths_darwin.go @@ -17,6 +17,12 @@ const ( // ControlSocketPath is the control socket path used when installed. ControlSocketPath = "unix:///var/run/elastic-agent.sock" + // ControlSocketUnprivilegedPath is the control socket path used when installed as non-root. + // This must exist inside of a directory in '/var/run/' because the permissions need to be set + // on that directory during installation time, because once the service is spawned it will not + // have permissions to create the socket in the '/var/run/' directory. + ControlSocketUnprivilegedPath = "unix:///var/run/elastic-agent/elastic-agent.sock" + // ShipperSocketPipePattern is the socket path used when installed for a shipper pipe. ShipperSocketPipePattern = "unix:///var/run/elastic-agent-%s-pipe.sock" diff --git a/internal/pkg/agent/application/paths/paths_windows.go b/internal/pkg/agent/application/paths/paths_windows.go index a4e8e0896a5..b54b1ebe898 100644 --- a/internal/pkg/agent/application/paths/paths_windows.go +++ b/internal/pkg/agent/application/paths/paths_windows.go @@ -22,6 +22,9 @@ const ( // ControlSocketPath is the control socket path used when installed. ControlSocketPath = `\\.\pipe\elastic-agent-system` + // ControlSocketUnprivilegedPath is the control socket path used when installed as non-root. + ControlSocketUnprivilegedPath = ControlSocketPath + // ShipperSocketPipePattern is the socket path used when installed for a shipper pipe. 
ShipperSocketPipePattern = `\\.\pipe\elastic-agent-%s-pipe.sock` diff --git a/internal/pkg/agent/application/upgrade/rollback.go b/internal/pkg/agent/application/upgrade/rollback.go index ae3c568acf1..bea5c8c4f23 100644 --- a/internal/pkg/agent/application/upgrade/rollback.go +++ b/internal/pkg/agent/application/upgrade/rollback.go @@ -12,16 +12,17 @@ import ( "strings" "time" - "github.com/elastic/elastic-agent/pkg/control" - "github.com/elastic/elastic-agent/pkg/control/v2/client" - "github.com/hashicorp/go-multierror" + "google.golang.org/grpc" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/agent/install" "github.com/elastic/elastic-agent/internal/pkg/core/backoff" + "github.com/elastic/elastic-agent/pkg/control" + "github.com/elastic/elastic-agent/pkg/control/v2/client" "github.com/elastic/elastic-agent/pkg/core/logger" + "github.com/elastic/elastic-agent/pkg/utils" ) const ( @@ -137,7 +138,9 @@ func InvokeWatcher(log *logger.Logger) error { func restartAgent(ctx context.Context, log *logger.Logger) error { restartViaDaemonFn := func(ctx context.Context) error { c := client.New() - err := c.Connect(ctx) + connectCtx, connectCancel := context.WithTimeout(ctx, 3*time.Second) + defer connectCancel() + err := c.Connect(connectCtx, grpc.WithBlock(), grpc.WithDisableRetry()) if err != nil { return errors.New(err, "failed communicating to running daemon", errors.TypeNetwork, errors.M("socket", control.Address())) } @@ -163,6 +166,7 @@ func restartAgent(ctx context.Context, log *logger.Logger) error { signal := make(chan struct{}) backExp := backoff.NewExpBackoff(signal, restartBackoffInit, restartBackoffMax) + root, _ := utils.HasRoot() // error ignored for restartAttempt := 1; restartAttempt <= maxRestartCount; restartAttempt++ { backExp.Wait() @@ -175,19 +179,21 @@ func restartAgent(ctx context.Context, log *logger.Logger) error { } 
log.Warnf("Failed to restart agent via control protocol: %s", err.Error()) - // Next, try to restart Agent via the service. - log.Infof("Restarting Agent via service; attempt %d of %d", restartAttempt, maxRestartCount) - err = restartViaServiceFn(ctx) - if err == nil { - break + // Next, try to restart Agent via the service. (only if root) + if root { + log.Infof("Restarting Agent via service; attempt %d of %d", restartAttempt, maxRestartCount) + err = restartViaServiceFn(ctx) + if err == nil { + break + } + log.Warnf("Failed to restart agent via service: %s", err.Error()) } if restartAttempt == maxRestartCount { log.Error("Failed to restart agent after final attempt") return err } - - log.Warnf("Failed to restart agent via service: %s; will try again in %v", err.Error(), backExp.NextWait()) + log.Warnf("Failed to restart agent; will try again in %v", backExp.NextWait()) } close(signal) diff --git a/internal/pkg/agent/application/upgrade/step_unpack.go b/internal/pkg/agent/application/upgrade/step_unpack.go index 45d007e55f4..c418e54b182 100644 --- a/internal/pkg/agent/application/upgrade/step_unpack.go +++ b/internal/pkg/agent/application/upgrade/step_unpack.go @@ -85,11 +85,13 @@ func unzip(log *logger.Logger, archivePath string) (string, error) { if f.FileInfo().IsDir() { log.Debugw("Unpacking directory", "archive", "zip", "file.path", path) - _ = os.MkdirAll(path, f.Mode()) + // remove any world permissions from the directory + _ = os.MkdirAll(path, f.Mode()&0770) } else { log.Debugw("Unpacking file", "archive", "zip", "file.path", path) - _ = os.MkdirAll(filepath.Dir(path), f.Mode()) - f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode()) + // remove any world permissions from the directory/file + _ = os.MkdirAll(filepath.Dir(path), f.Mode()&0770) + f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode()&0770) if err != nil { return err } @@ -190,11 +192,13 @@ func untar(log *logger.Logger, version string, archivePath 
string) (string, erro case mode.IsRegular(): log.Debugw("Unpacking file", "archive", "tar", "file.path", abs) // just to be sure, it should already be created by Dir type - if err := os.MkdirAll(filepath.Dir(abs), 0755); err != nil { + // remove any world permissions from the directory + if err := os.MkdirAll(filepath.Dir(abs), mode.Perm()&0770); err != nil { return "", errors.New(err, "TarInstaller: creating directory for file "+abs, errors.TypeFilesystem, errors.M(errors.MetaKeyPath, abs)) } - wf, err := os.OpenFile(abs, os.O_RDWR|os.O_CREATE|os.O_TRUNC, mode.Perm()) + // remove any world permissions from the file + wf, err := os.OpenFile(abs, os.O_RDWR|os.O_CREATE|os.O_TRUNC, mode.Perm()&0770) if err != nil { return "", errors.New(err, "TarInstaller: creating file "+abs, errors.TypeFilesystem, errors.M(errors.MetaKeyPath, abs)) } @@ -209,7 +213,8 @@ func untar(log *logger.Logger, version string, archivePath string) (string, erro } case mode.IsDir(): log.Debugw("Unpacking directory", "archive", "tar", "file.path", abs) - if err := os.MkdirAll(abs, 0755); err != nil { + // remove any world permissions from the directory + if err := os.MkdirAll(abs, mode.Perm()&0770); err != nil { return "", errors.New(err, "TarInstaller: creating directory for file "+abs, errors.TypeFilesystem, errors.M(errors.MetaKeyPath, abs)) } default: diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index b5992f10188..055807ef16c 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -15,6 +15,8 @@ import ( "strings" "time" + "github.com/elastic/elastic-agent/pkg/utils" + "github.com/elastic/elastic-agent/pkg/control/v2/client" "go.elastic.co/apm" @@ -260,7 +262,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { } if c.options.FixPermissions { - err = install.FixPermissions(paths.Top()) + err = install.FixPermissions(paths.Top(), utils.CurrentFileOwner()) if err != nil { return 
errors.New(err, "failed to fix permissions") } diff --git a/internal/pkg/agent/cmd/install.go b/internal/pkg/agent/cmd/install.go index a5b425d10ea..2b96dae2d7a 100644 --- a/internal/pkg/agent/cmd/install.go +++ b/internal/pkg/agent/cmd/install.go @@ -10,6 +10,7 @@ import ( "os" "os/exec" "path/filepath" + "runtime" "github.com/spf13/cobra" @@ -21,7 +22,10 @@ import ( "github.com/elastic/elastic-agent/pkg/utils" ) -const flagInstallBasePath = "base-path" +const ( + flagInstallBasePath = "base-path" + flagInstallUnprivileged = "unprivileged" +) func newInstallCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command { cmd := &cobra.Command{ @@ -43,6 +47,7 @@ would like the Agent to operate. cmd.Flags().BoolP("force", "f", false, "Force overwrite the current installation and do not prompt for confirmation") cmd.Flags().BoolP("non-interactive", "n", false, "Install Elastic Agent in non-interactive mode which will not prompt on missing parameters but fails instead.") cmd.Flags().String(flagInstallBasePath, paths.DefaultBasePath, "The path where the Elastic Agent will be installed. 
It must be an absolute path.") + cmd.Flags().Bool(flagInstallUnprivileged, false, "Installed Elastic Agent will create an 'elastic-agent' user and run as that user.") addEnrollFlags(cmd) return cmd @@ -67,6 +72,12 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { return fmt.Errorf("unable to perform install command, not executed with %s permissions", utils.PermissionUser) } + // only support Linux at the moment + unprivileged, _ := cmd.Flags().GetBool(flagInstallUnprivileged) + if unprivileged && runtime.GOOS != "linux" { + return fmt.Errorf("unable to perform install command, unprivileged is currently only supported on Linux") + } + topPath := paths.InstallPath(basePath) status, reason := install.Status(topPath) @@ -175,9 +186,10 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { progBar := install.CreateAndStartNewSpinner(streams.Out, "Installing Elastic Agent...") + var ownership utils.FileOwner cfgFile := paths.ConfigFile() if status != install.PackageInstall { - err = install.Install(cfgFile, topPath, progBar, streams) + ownership, err = install.Install(cfgFile, topPath, unprivileged, progBar, streams) if err != nil { return fmt.Errorf("error installing package: %w", err) } @@ -225,6 +237,10 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { enrollCmd.Stdin = os.Stdin enrollCmd.Stdout = os.Stdout enrollCmd.Stderr = os.Stderr + err = enrollCmdExtras(enrollCmd, ownership) + if err != nil { + return err + } progBar.Describe("Enrolling Elastic Agent with Fleet") err = enrollCmd.Start() @@ -243,7 +259,7 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { progBar.Describe("Enroll Completed") } - if err := info.CreateInstallMarker(topPath); err != nil { + if err := info.CreateInstallMarker(topPath, ownership); err != nil { return fmt.Errorf("failed to create install marker: %w", err) } diff --git a/internal/pkg/agent/cmd/install_enroll.go b/internal/pkg/agent/cmd/install_enroll.go new 
file mode 100644 index 00000000000..8f7677ebed6 --- /dev/null +++ b/internal/pkg/agent/cmd/install_enroll.go @@ -0,0 +1,24 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +//go:build !windows + +package cmd + +import ( + "os/exec" + "syscall" + + "github.com/elastic/elastic-agent/pkg/utils" +) + +func enrollCmdExtras(cmd *exec.Cmd, ownership utils.FileOwner) error { + cmd.SysProcAttr = &syscall.SysProcAttr{ + Credential: &syscall.Credential{ + Uid: uint32(ownership.UID), + Gid: uint32(ownership.GID), + }, + } + return nil +} diff --git a/internal/pkg/agent/cmd/install_enroll_windows.go b/internal/pkg/agent/cmd/install_enroll_windows.go new file mode 100644 index 00000000000..8178950fc0f --- /dev/null +++ b/internal/pkg/agent/cmd/install_enroll_windows.go @@ -0,0 +1,18 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +//go:build windows + +package cmd + +import ( + "os/exec" + + "github.com/elastic/elastic-agent/pkg/utils" +) + +func enrollCmdExtras(cmd *exec.Cmd, ownership utils.FileOwner) error { + // TODO: Add ability to call enroll as non-Administrator on Windows. 
+ return nil +} diff --git a/internal/pkg/agent/cmd/run.go b/internal/pkg/agent/cmd/run.go index 4a12bdc8540..a1162889ba3 100644 --- a/internal/pkg/agent/cmd/run.go +++ b/internal/pkg/agent/cmd/run.go @@ -49,6 +49,7 @@ import ( "github.com/elastic/elastic-agent/pkg/component" "github.com/elastic/elastic-agent/pkg/control/v2/server" "github.com/elastic/elastic-agent/pkg/core/logger" + "github.com/elastic/elastic-agent/pkg/utils" "github.com/elastic/elastic-agent/version" ) @@ -612,7 +613,7 @@ func ensureInstallMarkerPresent() error { // Otherwise, we're being upgraded from a version of an installed Agent // that didn't use an installation marker file (that is, before v8.8.0). // So create the file now. - if err := info.CreateInstallMarker(paths.Top()); err != nil { + if err := info.CreateInstallMarker(paths.Top(), utils.CurrentFileOwner()); err != nil { return fmt.Errorf("unable to create installation marker file during upgrade: %w", err) } diff --git a/internal/pkg/agent/install/install.go b/internal/pkg/agent/install/install.go index 7c3ee14b038..4152479fb11 100644 --- a/internal/pkg/agent/install/install.go +++ b/internal/pkg/agent/install/install.go @@ -19,17 +19,21 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/cli" + "github.com/elastic/elastic-agent/pkg/utils" ) const ( darwin = "darwin" + + elasticUsername = "elastic-agent" + elasticGroupName = "elastic-agent" ) // Install installs Elastic Agent persistently on the system including creating and starting its service. 
-func Install(cfgFile, topPath string, pt *progressbar.ProgressBar, streams *cli.IOStreams) error { +func Install(cfgFile, topPath string, unprivileged bool, pt *progressbar.ProgressBar, streams *cli.IOStreams) (utils.FileOwner, error) { dir, err := findDirectory() if err != nil { - return errors.New(err, "failed to discover the source directory for installation", errors.TypeFilesystem) + return utils.FileOwner{}, errors.New(err, "failed to discover the source directory for installation", errors.TypeFilesystem) } // We only uninstall Agent if it is currently installed. @@ -44,7 +48,7 @@ func Install(cfgFile, topPath string, pt *progressbar.ProgressBar, streams *cli. err = Uninstall(cfgFile, topPath, "", pt) if err != nil { pt.Describe("Failed to uninstall current Elastic Agent") - return errors.New( + return utils.FileOwner{}, errors.New( err, fmt.Sprintf("failed to uninstall Agent at (%s)", filepath.Dir(topPath)), errors.M("directory", filepath.Dir(topPath))) @@ -52,10 +56,53 @@ func Install(cfgFile, topPath string, pt *progressbar.ProgressBar, streams *cli. 
pt.Describe("Successfully uninstalled current Elastic Agent") } + var ownership utils.FileOwner + username := "" + groupName := "" + if unprivileged { + username = elasticUsername + groupName = elasticGroupName + + // ensure required group + ownership.GID, err = FindGID(groupName) + if err != nil && !errors.Is(err, ErrGroupNotFound) { + return utils.FileOwner{}, fmt.Errorf("failed finding group %s: %w", groupName, err) + } + if errors.Is(err, ErrGroupNotFound) { + pt.Describe(fmt.Sprintf("Creating group %s", groupName)) + ownership.GID, err = CreateGroup(groupName) + if err != nil { + pt.Describe(fmt.Sprintf("Failed to create group %s", groupName)) + return utils.FileOwner{}, fmt.Errorf("failed to create group %s: %w", groupName, err) + } + pt.Describe(fmt.Sprintf("Successfully created group %s", groupName)) + } + + // ensure required user + ownership.UID, err = FindUID(username) + if err != nil && !errors.Is(err, ErrUserNotFound) { + return utils.FileOwner{}, fmt.Errorf("failed finding username %s: %w", username, err) + } + if errors.Is(err, ErrUserNotFound) { + pt.Describe(fmt.Sprintf("Creating user %s", username)) + ownership.UID, err = CreateUser(username, ownership.GID) + if err != nil { + pt.Describe(fmt.Sprintf("Failed to create user %s", username)) + return utils.FileOwner{}, fmt.Errorf("failed to create user %s: %w", username, err) + } + err = AddUserToGroup(username, groupName) + if err != nil { + pt.Describe(fmt.Sprintf("Failed to add user %s to group %s", username, groupName)) + return utils.FileOwner{}, fmt.Errorf("failed to add user %s to group %s: %w", username, groupName, err) + } + pt.Describe(fmt.Sprintf("Successfully created user %s", username)) + } + } + // ensure parent directory exists err = os.MkdirAll(filepath.Dir(topPath), 0755) if err != nil { - return errors.New( + return utils.FileOwner{}, errors.New( err, fmt.Sprintf("failed to create installation parent directory (%s)", filepath.Dir(topPath)), errors.M("directory", 
filepath.Dir(topPath))) @@ -84,7 +131,7 @@ func Install(cfgFile, topPath string, pt *progressbar.ProgressBar, streams *cli. }) if err != nil { pt.Describe("Error copying files") - return errors.New( + return utils.FileOwner{}, errors.New( err, fmt.Sprintf("failed to copy source directory (%s) to destination (%s)", dir, topPath), errors.M("source", dir), errors.M("destination", topPath), @@ -97,7 +144,7 @@ func Install(cfgFile, topPath string, pt *progressbar.ProgressBar, streams *cli. pathDir := filepath.Dir(paths.ShellWrapperPath) err = os.MkdirAll(pathDir, 0755) if err != nil { - return errors.New( + return utils.FileOwner{}, errors.New( err, fmt.Sprintf("failed to create directory (%s) for shell wrapper (%s)", pathDir, paths.ShellWrapperPath), errors.M("directory", pathDir)) @@ -110,7 +157,7 @@ func Install(cfgFile, topPath string, pt *progressbar.ProgressBar, streams *cli. // Check if previous shell wrapper or symlink exists and remove it so it can be overwritten if _, err := os.Lstat(paths.ShellWrapperPath); err == nil { if err := os.Remove(paths.ShellWrapperPath); err != nil { - return errors.New( + return utils.FileOwner{}, errors.New( err, fmt.Sprintf("failed to remove (%s)", paths.ShellWrapperPath), errors.M("destination", paths.ShellWrapperPath)) @@ -118,7 +165,7 @@ func Install(cfgFile, topPath string, pt *progressbar.ProgressBar, streams *cli. } err = os.Symlink(filepath.Join(topPath, paths.BinaryName), paths.ShellWrapperPath) if err != nil { - return errors.New( + return utils.FileOwner{}, errors.New( err, fmt.Sprintf("failed to create elastic-agent symlink (%s)", paths.ShellWrapperPath), errors.M("destination", paths.ShellWrapperPath)) @@ -130,7 +177,7 @@ func Install(cfgFile, topPath string, pt *progressbar.ProgressBar, streams *cli. 
shellWrapper := strings.Replace(paths.ShellWrapper, "%s", topPath, -1) err = os.WriteFile(paths.ShellWrapperPath, []byte(shellWrapper), 0755) if err != nil { - return errors.New( + return utils.FileOwner{}, errors.New( err, fmt.Sprintf("failed to write shell wrapper (%s)", paths.ShellWrapperPath), errors.M("destination", paths.ShellWrapperPath)) @@ -141,42 +188,57 @@ func Install(cfgFile, topPath string, pt *progressbar.ProgressBar, streams *cli. // post install (per platform) err = postInstall(topPath) if err != nil { - return fmt.Errorf("error running post-install steps: %w", err) + return ownership, fmt.Errorf("error running post-install steps: %w", err) } // fix permissions - err = FixPermissions(topPath) + err = FixPermissions(topPath, ownership) if err != nil { - return errors.New( - err, - "failed to perform permission changes", - errors.M("destination", topPath)) + return ownership, fmt.Errorf("failed to perform permission changes on path %s: %w", topPath, err) + } + if paths.ShellWrapperPath != "" { + err = FixPermissions(paths.ShellWrapperPath, ownership) + if err != nil { + return ownership, fmt.Errorf("failed to perform permission changes on path %s: %w", paths.ShellWrapperPath, err) + } + } + + // create socket path when installing as non-root + // now is the only time to do it while root is available (without doing this it will not be possible + // for the service to create the control socket) + // windows: uses npipe and doesn't need a directory created + if unprivileged { + err = createSocketDir(ownership) + if err != nil { + return ownership, fmt.Errorf("failed to create socket directory: %w", err) + } } // install service pt.Describe("Installing service") - svc, err := newService(topPath) + svc, err := newService(topPath, withUserGroup(username, groupName)) if err != nil { pt.Describe("Failed to install service") - return fmt.Errorf("error installing new service: %w", err) + return ownership, fmt.Errorf("error installing new service: %w", err) } 
err = svc.Install() if err != nil { pt.Describe("Failed to install service") - return errors.New( + return ownership, errors.New( err, fmt.Sprintf("failed to install service (%s)", paths.ServiceName), errors.M("service", paths.ServiceName)) } pt.Describe("Installed service") - return nil + return ownership, nil } // StartService starts the installed service. // // This should only be called after Install is successful. func StartService(topPath string) error { + // only starting the service, so no need to set the username and group to any value svc, err := newService(topPath) if err != nil { return fmt.Errorf("error creating new service handler: %w", err) @@ -193,6 +255,7 @@ func StartService(topPath string) error { // StopService stops the installed service. func StopService(topPath string) error { + // only stopping the service, so no need to set the username and group to any value svc, err := newService(topPath) if err != nil { return fmt.Errorf("error creating new service handler: %w", err) @@ -209,6 +272,7 @@ func StopService(topPath string) error { // RestartService restarts the installed service. func RestartService(topPath string) error { + // only restarting the service, so no need to set the username and group to any value svc, err := newService(topPath) if err != nil { return fmt.Errorf("error creating new service handler: %w", err) @@ -232,11 +296,6 @@ func StatusService(topPath string) (service.Status, error) { return svc.Status() } -// FixPermissions fixes the permissions on the installed system. -func FixPermissions(topPath string) error { - return fixPermissions(topPath) -} - // findDirectory returns the directory to copy into the installation location. // // This also verifies that the discovered directory is a valid directory for installation. 
diff --git a/internal/pkg/agent/install/install_unix.go b/internal/pkg/agent/install/install_unix.go index b6c7a100b4b..9840dd90ead 100644 --- a/internal/pkg/agent/install/install_unix.go +++ b/internal/pkg/agent/install/install_unix.go @@ -6,8 +6,38 @@ package install +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" + "github.com/elastic/elastic-agent/pkg/utils" +) + // postInstall performs post installation for unix-based systems. func postInstall(topPath string) error { // do nothing return nil } + +// createSocketDir creates the socket directory. +func createSocketDir(ownership utils.FileOwner) error { + path := filepath.Dir(strings.TrimPrefix(paths.ControlSocketUnprivilegedPath, "unix://")) + err := os.MkdirAll(path, 0770) + if err != nil { + return fmt.Errorf("failed to create path %s: %w", path, err) + } + err = os.Chown(path, ownership.UID, ownership.GID) + if err != nil { + return fmt.Errorf("failed to chown path %s: %w", path, err) + } + // possible that the directory existed, still set the + // permission again to ensure that they are correct + err = os.Chmod(path, 0770) + if err != nil { + return fmt.Errorf("failed to chmod path %s: %w", path, err) + } + return nil +} diff --git a/internal/pkg/agent/install/install_windows.go b/internal/pkg/agent/install/install_windows.go index be8d2fb44e5..07b43591906 100644 --- a/internal/pkg/agent/install/install_windows.go +++ b/internal/pkg/agent/install/install_windows.go @@ -13,6 +13,7 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/release" + "github.com/elastic/elastic-agent/pkg/utils" "github.com/elastic/elastic-agent/version" ) @@ -44,3 +45,9 @@ func postInstall(topPath string) error { return nil } + +// createSocketDir creates the socket directory. 
+func createSocketDir(ownership utils.FileOwner) error { + // doesn't do anything on windows, no directory is needed. + return nil +} diff --git a/internal/pkg/agent/install/installed.go b/internal/pkg/agent/install/installed.go index c483dda7bb8..c415d4e1abe 100644 --- a/internal/pkg/agent/install/installed.go +++ b/internal/pkg/agent/install/installed.go @@ -55,6 +55,7 @@ func Status(topPath string) (StatusType, string) { // checkService only checks the status of the service. func checkService(topPath string) (StatusType, string) { + // only checking the service, so no need to set the username and group to any value svc, err := newService(topPath) if err != nil { return NotInstalled, "unable to check service status" diff --git a/internal/pkg/agent/install/perms_unix.go b/internal/pkg/agent/install/perms_unix.go index e84dcd5039c..578c2af0a30 100644 --- a/internal/pkg/agent/install/perms_unix.go +++ b/internal/pkg/agent/install/perms_unix.go @@ -11,18 +11,17 @@ import ( "io/fs" "os" "path/filepath" -) -// fixPermissions fixes the permissions so only root:root is the owner and no world read-able permissions -func fixPermissions(topPath string) error { - return recursiveRootPermissions(topPath) -} + "github.com/elastic/elastic-agent/pkg/utils" +) -func recursiveRootPermissions(path string) error { - return filepath.Walk(path, func(name string, info fs.FileInfo, err error) error { +// FixPermissions fixes the permissions so that the provided ownership (uid:gid) is the owner and there are no world-readable permissions +func FixPermissions(topPath string, ownership utils.FileOwner) error { + return filepath.Walk(topPath, func(name string, info fs.FileInfo, err error) error { if err == nil { - // all files should be owned by root:root - err = os.Chown(name, 0, 0) + // all files should be owned by uid:gid + // uses `os.Lchown` so the symlink is updated to have the permissions + err = os.Lchown(name, ownership.UID, ownership.GID) if err != nil { return err } diff --git 
a/internal/pkg/agent/install/perms_windows.go b/internal/pkg/agent/install/perms_windows.go index 45b0d073dc4..1281479f695 100644 --- a/internal/pkg/agent/install/perms_windows.go +++ b/internal/pkg/agent/install/perms_windows.go @@ -13,19 +13,17 @@ import ( "github.com/hectane/go-acl" "golang.org/x/sys/windows" -) -// fixPermissions fixes the permissions so only SYSTEM and Administrators have access to the files in the install path -func fixPermissions(topPath string) error { - return recursiveSystemAdminPermissions(topPath) -} + "github.com/elastic/elastic-agent/pkg/utils" +) -func recursiveSystemAdminPermissions(path string) error { - return filepath.Walk(path, func(name string, info fs.FileInfo, err error) error { +// FixPermissions fixes the permissions so only SYSTEM and Administrators have access to the files in the install path +func FixPermissions(topPath string, ownership utils.FileOwner) error { + return filepath.Walk(topPath, func(name string, info fs.FileInfo, err error) error { if err == nil { // first level doesn't inherit inherit := true - if path == name { + if topPath == name { inherit = false } err = systemAdministratorsOnly(name, inherit) @@ -38,11 +36,11 @@ func recursiveSystemAdminPermissions(path string) error { func systemAdministratorsOnly(path string, inherit bool) error { // https://support.microsoft.com/en-us/help/243330/well-known-security-identifiers-in-windows-operating-systems - systemSID, err := windows.StringToSid("S-1-5-18") + systemSID, err := windows.StringToSid(utils.SystemSID) if err != nil { return err } - administratorsSID, err := windows.StringToSid("S-1-5-32-544") + administratorsSID, err := windows.StringToSid(utils.AdministratorSID) if err != nil { return err } diff --git a/internal/pkg/agent/install/svc.go b/internal/pkg/agent/install/svc.go index 2fbc0c61803..080fd81fd0c 100644 --- a/internal/pkg/agent/install/svc.go +++ b/internal/pkg/agent/install/svc.go @@ -5,6 +5,7 @@ package install import ( + "fmt" 
"path/filepath" "runtime" @@ -36,14 +37,37 @@ func ExecutablePath(topPath string) string { return exec } -func newService(topPath string) (service.Service, error) { +type serviceOpts struct { + Username string + Group string +} + +type serviceOpt func(opts *serviceOpts) + +func withUserGroup(username string, group string) serviceOpt { + return func(opts *serviceOpts) { + opts.Username = username + opts.Group = group + } +} + +func newService(topPath string, opt ...serviceOpt) (service.Service, error) { + var opts serviceOpts + for _, o := range opt { + o(&opts) + } + cfg := &service.Config{ Name: paths.ServiceName, DisplayName: ServiceDisplayName, Description: ServiceDescription, Executable: ExecutablePath(topPath), WorkingDirectory: topPath, + UserName: opts.Username, Option: map[string]interface{}{ + // GroupName + "GroupName": opts.Group, + // Linux (systemd) always restart on failure "Restart": "always", @@ -74,6 +98,11 @@ func newService(topPath string) (service.Service, error) { // of the prebuilt template with added ExitTimeOut option cfg.Option["LaunchdConfig"] = darwinLaunchdConfig cfg.Option["ExitTimeOut"] = darwinServiceExitTimeout + + // Set the stdout and stderr logs to be inside the installation directory, ensures that the + // executing user for the service can write to the directory for the logs. 
+ cfg.Option["StandardOutPath"] = filepath.Join(topPath, fmt.Sprintf("%s.out.log", paths.ServiceName)) + cfg.Option["StandardErrorPath"] = filepath.Join(topPath, fmt.Sprintf("%s.err.log", paths.ServiceName)) } return service.New(nil, cfg) @@ -97,6 +126,10 @@ const darwinLaunchdConfig = ` {{if .UserName}}UserName {{html .UserName}}{{end}} + {{if .Config.Option.GroupName -}} + GroupName + {{html .Config.Option.GroupName}} + {{- end}} {{if .ChRoot}}RootDirectory {{html .ChRoot}}{{end}} {{if .Config.Option.ExitTimeOut}}ExitTimeOut @@ -113,9 +146,9 @@ const darwinLaunchdConfig = ` StandardOutPath - /usr/local/var/log/{{html .Name}}.out.log + {{html .Config.Option.StandardOutPath}} StandardErrorPath - /usr/local/var/log/{{html .Name}}.err.log + {{html .Config.Option.StandardErrorPath}} @@ -136,6 +169,9 @@ ExecStart={{.Path|cmdEscape}}{{range .Arguments}} {{.|cmd}}{{end}} {{if .ChRoot}}RootDirectory={{.ChRoot|cmd}}{{end}} {{if .WorkingDirectory}}WorkingDirectory={{.WorkingDirectory|cmdEscape}}{{end}} {{if .UserName}}User={{.UserName}}{{end}} +{{if .Config.Option.GroupName -}} +Group={{.Config.Option.GroupName}} +{{- end}} {{if .ReloadSignal}}ExecReload=/bin/kill -{{.ReloadSignal}} "$MAINPID"{{end}} {{if .PIDFile}}PIDFile={{.PIDFile|cmd}}{{end}} {{if and .LogOutput .HasOutputFileSupport -}} diff --git a/internal/pkg/agent/install/uninstall.go b/internal/pkg/agent/install/uninstall.go index 4a453d3953c..8f2234d0f4d 100644 --- a/internal/pkg/agent/install/uninstall.go +++ b/internal/pkg/agent/install/uninstall.go @@ -35,6 +35,7 @@ import ( // Uninstall uninstalls persistently Elastic Agent on the system. 
func Uninstall(cfgFile, topPath, uninstallToken string, pt *progressbar.ProgressBar) error { // uninstall the current service + // not creating the service, so no need to set the username and group to any value svc, err := newService(topPath) if err != nil { return fmt.Errorf("error creating new service handler: %w", err) diff --git a/internal/pkg/agent/install/user.go b/internal/pkg/agent/install/user.go new file mode 100644 index 00000000000..96e924c9878 --- /dev/null +++ b/internal/pkg/agent/install/user.go @@ -0,0 +1,14 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package install + +import "errors" + +var ( + // ErrGroupNotFound returned when group is not found. + ErrGroupNotFound = errors.New("group not found") + // ErrUserNotFound returned when user is not found. + ErrUserNotFound = errors.New("user not found") +) diff --git a/internal/pkg/agent/install/user_darwin.go b/internal/pkg/agent/install/user_darwin.go new file mode 100644 index 00000000000..b6fa145c6e3 --- /dev/null +++ b/internal/pkg/agent/install/user_darwin.go @@ -0,0 +1,179 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +//go:build darwin + +package install + +import ( + "bufio" + "bytes" + "fmt" + "os/exec" + "sort" + "strconv" + "strings" +) + +// FindGID returns the group's GID on the machine. 
+func FindGID(name string) (int, error) { + records, err := dsclList("/Groups", "PrimaryGroupID") + if err != nil { + return -1, fmt.Errorf("failed listing: %w", err) + } + for _, record := range records { + if record[0] == name { + val, err := strconv.Atoi(record[1]) + if err != nil { + return -1, fmt.Errorf("failed to convert %s to int: %w", record[1], err) + } + return val, nil + } + } + return -1, ErrGroupNotFound +} + +// CreateGroup creates a group on the machine. +func CreateGroup(name string) (int, error) { + // find the next available ID + nextId, err := dsclNextID("/Groups", "PrimaryGroupID") + if err != nil { + return -1, fmt.Errorf("failed getting next gid: %w", err) + } + path := fmt.Sprintf("/Groups/%s", name) + + // create the group entry + err = dsclExec("-create", path, "PrimaryGroupID", strconv.Itoa(nextId)) + if err != nil { + return -1, err + } + + return nextId, nil +} + +// FindUID returns the user's UID on the machine. +func FindUID(name string) (int, error) { + records, err := dsclList("/Users", "UniqueID") + if err != nil { + return -1, fmt.Errorf("failed listing: %w", err) + } + for _, record := range records { + if record[0] == name { + val, err := strconv.Atoi(record[1]) + if err != nil { + return -1, fmt.Errorf("failed to convert %s to int: %w", record[1], err) + } + return val, nil + } + } + return -1, ErrUserNotFound +} + +// CreateUser creates a user on the machine. 
+func CreateUser(name string, gid int) (int, error) { + // find the next available ID + nextId, err := dsclNextID("/Users", "UniqueID") + if err != nil { + return -1, fmt.Errorf("failed getting next uid: %w", err) + } + path := fmt.Sprintf("/Users/%s", name) + + // create the user entry + err = dsclExec("-create", path, "UniqueID", strconv.Itoa(nextId)) + if err != nil { + return -1, err + } + + // set primary group to gid + err = dsclExec("-create", path, "PrimaryGroupID", strconv.Itoa(gid)) + if err != nil { + return -1, err + } + + // set home directory to empty + err = dsclExec("-create", path, "NFSHomeDirectory", "/var/empty") + if err != nil { + return -1, err + } + + // set to no shell + err = dsclExec("-create", path, "UserShell", "/usr/bin/false") + if err != nil { + return -1, err + } + + // set to no password (aka. cannot authenticate) + err = dsclExec("-create", path, "Password", "*") + if err != nil { + return -1, err + } + + return nextId, nil +} + +// AddUserToGroup adds a user to a group. +func AddUserToGroup(username string, groupName string) error { + // #nosec G204 -- user cannot set the groupName or username (hard coded in caller) + cmd := exec.Command("dscl", ".", "-append", fmt.Sprintf("/Groups/%s", groupName), "GroupMembership", username) + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("dscl . 
-append failed: %w (output: %s)", err, output) + } + return nil +} + +func dsclNextID(path, field string) (int, error) { + records, err := dsclList(path, field) + if err != nil { + return -1, fmt.Errorf("failed listing: %w", err) + } + var ids []int + for _, record := range records { + id, err := strconv.Atoi(record[1]) + if err != nil { + return -1, fmt.Errorf("failed atoi for %s: %w", record[1], err) + } + ids = append(ids, id) + } + // largest id first + sort.Slice(ids, func(i, j int) bool { + return ids[j] < ids[i] + }) + if len(ids) == 0 { + // never going to happen, just be defensive + return 1, nil + } + return ids[0] + 1, nil +} + +func dsclList(path, field string) ([][]string, error) { + cmd := exec.Command("dscl", ".", "-list", path, field) + output, err := cmd.Output() + if err != nil { + return nil, fmt.Errorf("dscl . -list failed: %w", err) + } + var records [][]string + scanner := bufio.NewScanner(bytes.NewReader(output)) + scanner.Split(bufio.ScanLines) + for scanner.Scan() { + line := scanner.Text() + fields := strings.Fields(line) + if len(fields) != 2 { + return nil, fmt.Errorf("expected exactly 2 fields but got %d reading line %q", len(fields), line) + } + records = append(records, fields) + } + return records, nil +} + +func dsclExec(args ...string) error { + args = append([]string{"."}, args...) + cmd := exec.Command("dscl", args...) + output, err := cmd.CombinedOutput() + if err != nil { + command := fmt.Sprintf("dscl %s", strings.Join(args, " ")) + return fmt.Errorf("%s failed: %w (output: %s)", command, err, output) + } + return nil +} diff --git a/internal/pkg/agent/install/user_linux.go b/internal/pkg/agent/install/user_linux.go new file mode 100644 index 00000000000..0bb4ddee0d4 --- /dev/null +++ b/internal/pkg/agent/install/user_linux.go @@ -0,0 +1,94 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. 
Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +//go:build linux + +package install + +import ( + "errors" + "fmt" + "os/exec" + "strconv" + "strings" +) + +// FindGID returns the group's GID on the machine. +func FindGID(name string) (int, error) { + id, err := getentGetID("group", name) + if e := (&exec.ExitError{}); errors.As(err, &e) { + if e.ExitCode() == 2 { + // exit code 2 is the key doesn't exist in the database + return -1, ErrGroupNotFound + } + } + return id, err +} + +// CreateGroup creates a group on the machine. +func CreateGroup(name string) (int, error) { + cmd := exec.Command("groupadd", "-f", name) + output, err := cmd.CombinedOutput() + if err != nil { + return -1, fmt.Errorf("groupadd -f %s failed: %w (output: %s)", name, err, output) + } + return FindGID(name) +} + +// FindUID returns the user's UID on the machine. +func FindUID(name string) (int, error) { + id, err := getentGetID("passwd", name) + if e := (&exec.ExitError{}); errors.As(err, &e) { + if e.ExitCode() == 2 { + // exit code 2 is the key doesn't exist in the database + return -1, ErrUserNotFound + } + } + return id, err +} + +// CreateUser creates a user on the machine. +func CreateUser(name string, gid int) (int, error) { + args := []string{ + "--gid", strconv.Itoa(gid), + "--system", + "--no-user-group", + "--shell", "/usr/bin/false", + name, + } + cmd := exec.Command("useradd", args...) + output, err := cmd.CombinedOutput() + if err != nil { + command := fmt.Sprintf("useradd %s", strings.Join(args, " ")) + return -1, fmt.Errorf("%s failed: %w (output: %s)", command, err, output) + } + return FindUID(name) +} + +// AddUserToGroup adds a user to a group. 
+func AddUserToGroup(username string, groupName string) error { + cmd := exec.Command("usermod", "-a", "-G", groupName, username) + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("usermod -a -G %s %s failed: %w (output: %s)", groupName, username, err, output) + } + return nil +} + +func getentGetID(database string, key string) (int, error) { + cmd := exec.Command("getent", database, key) + output, err := cmd.Output() + if err != nil { + return -1, fmt.Errorf("getent %s %s failed: %w (output: %s)", database, key, err, output) + } + split := strings.Split(string(output), ":") + if len(split) < 3 { + return -1, fmt.Errorf("unexpected format: %s", output) + } + val, err := strconv.Atoi(split[2]) + if err != nil { + return -1, fmt.Errorf("failed to convert %s to int: %w", split[2], err) + } + return val, nil +} diff --git a/internal/pkg/agent/install/user_windows.go b/internal/pkg/agent/install/user_windows.go new file mode 100644 index 00000000000..0c68c74222f --- /dev/null +++ b/internal/pkg/agent/install/user_windows.go @@ -0,0 +1,34 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +//go:build windows + +package install + +import "errors" + +// FindGID returns the group's GID on the machine. +func FindGID(name string) (string, error) { + return "", errors.New("not implemented") +} + +// CreateGroup creates a group on the machine. +func CreateGroup(name string) (string, error) { + return "", errors.New("not implemented") +} + +// FindUID returns the user's UID on the machine. +func FindUID(name string) (string, error) { + return "", errors.New("not implemented") +} + +// CreateUser creates a user on the machine. 
+func CreateUser(name string, gid string) (string, error) { + return "", errors.New("not implemented") +} + +// AddUserToGroup adds a user to a group. +func AddUserToGroup(username string, groupName string) error { + return errors.New("not implemented") +} diff --git a/internal/pkg/agent/vault/vault_windows.go b/internal/pkg/agent/vault/vault_windows.go index afd484d351a..71f227aa6c0 100644 --- a/internal/pkg/agent/vault/vault_windows.go +++ b/internal/pkg/agent/vault/vault_windows.go @@ -12,6 +12,8 @@ import ( "github.com/billgraziano/dpapi" "github.com/hectane/go-acl" "golang.org/x/sys/windows" + + "github.com/elastic/elastic-agent/pkg/utils" ) func (v *Vault) encrypt(data []byte) ([]byte, error) { @@ -28,11 +30,11 @@ func tightenPermissions(path string) error { func systemAdministratorsOnly(path string, inherit bool) error { // https://support.microsoft.com/en-us/help/243330/well-known-security-identifiers-in-windows-operating-systems - systemSID, err := windows.StringToSid("S-1-5-18") + systemSID, err := windows.StringToSid(utils.SystemSID) if err != nil { return err } - administratorsSID, err := windows.StringToSid("S-1-5-32-544") + administratorsSID, err := windows.StringToSid(utils.AdministratorSID) if err != nil { return err } diff --git a/pkg/control/addr.go b/pkg/control/addr.go index d01cf074df9..916b771097d 100644 --- a/pkg/control/addr.go +++ b/pkg/control/addr.go @@ -13,13 +13,18 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" + "github.com/elastic/elastic-agent/pkg/utils" ) // Address returns the address to connect to Elastic Agent daemon. 
func Address() string { // when installed the control address is fixed if info.RunningInstalled() { - return paths.ControlSocketPath + root, _ := utils.HasRoot() // error is ignored + if root { + return paths.ControlSocketPath + } + return paths.ControlSocketUnprivilegedPath } // unix socket path must be less than 104 characters diff --git a/pkg/control/v2/server/listener.go b/pkg/control/v2/server/listener.go index 54852956280..d8b2982f17c 100644 --- a/pkg/control/v2/server/listener.go +++ b/pkg/control/v2/server/listener.go @@ -13,10 +13,10 @@ import ( "path/filepath" "strings" - "github.com/elastic/elastic-agent/pkg/control" - "github.com/elastic/elastic-agent/internal/pkg/agent/errors" + "github.com/elastic/elastic-agent/pkg/control" "github.com/elastic/elastic-agent/pkg/core/logger" + "github.com/elastic/elastic-agent/pkg/utils" ) func createListener(log *logger.Logger) (net.Listener, error) { @@ -26,7 +26,7 @@ func createListener(log *logger.Logger) (net.Listener, error) { } dir := filepath.Dir(path) if _, err := os.Stat(dir); os.IsNotExist(err) { - err = os.MkdirAll(dir, 0755) + err = os.MkdirAll(dir, 0775) if err != nil { return nil, err } @@ -35,7 +35,13 @@ func createListener(log *logger.Logger) (net.Listener, error) { if err != nil { return nil, err } - err = os.Chmod(path, 0700) + mode := os.FileMode(0700) + root, _ := utils.HasRoot() // error ignored + if !root { + // allow group access when not running as root + mode = os.FileMode(0770) + } + err = os.Chmod(path, mode) if err != nil { // failed to set permissions (close listener) lis.Close() diff --git a/pkg/control/v2/server/listener_windows.go b/pkg/control/v2/server/listener_windows.go index 27e530a4c6a..6ce0898192a 100644 --- a/pkg/control/v2/server/listener_windows.go +++ b/pkg/control/v2/server/listener_windows.go @@ -7,21 +7,16 @@ package server import ( + "fmt" "net" "os/user" "strings" - "github.com/elastic/elastic-agent/pkg/control" - - "github.com/pkg/errors" - 
"github.com/elastic/elastic-agent-libs/api/npipe" - "github.com/elastic/elastic-agent/pkg/core/logger" -) -const ( - NTAUTHORITY_SYSTEM = "S-1-5-18" - ADMINISTRATORS_GROUP = "S-1-5-32-544" + "github.com/elastic/elastic-agent/pkg/control" + "github.com/elastic/elastic-agent/pkg/core/logger" + "github.com/elastic/elastic-agent/pkg/utils" ) // createListener creates a named pipe listener on Windows @@ -40,7 +35,7 @@ func cleanupListener(_ *logger.Logger) { func securityDescriptor(log *logger.Logger) (string, error) { u, err := user.Current() if err != nil { - return "", errors.Wrap(err, "failed to get current user") + return "", fmt.Errorf("failed to get current user: %w", err) } // Named pipe security and access rights. // We create the pipe and the specific users should only be able to write to it. @@ -56,7 +51,7 @@ func securityDescriptor(log *logger.Logger) (string, error) { // running as SYSTEM, include Administrators group so Administrators can talk over // the named pipe to the running Elastic Agent system process // https://support.microsoft.com/en-us/help/243330/well-known-security-identifiers-in-windows-operating-systems - descriptor += "(A;;GA;;;" + ADMINISTRATORS_GROUP + ")" + descriptor += "(A;;GA;;;" + utils.AdministratorSID + ")" } return descriptor, nil } @@ -72,7 +67,7 @@ func isWindowsAdmin(u *user.User) (bool, error) { groups, err := u.GroupIds() if err != nil { - return false, errors.Wrap(err, "failed to get current user groups") + return false, fmt.Errorf("failed to get current user groups: %w", err) } for _, groupSid := range groups { @@ -85,5 +80,5 @@ func isWindowsAdmin(u *user.User) (bool, error) { } func equalsSystemGroup(s string) bool { - return strings.EqualFold(s, NTAUTHORITY_SYSTEM) || strings.EqualFold(s, ADMINISTRATORS_GROUP) + return strings.EqualFold(s, utils.SystemSID) || strings.EqualFold(s, utils.AdministratorSID) } diff --git a/pkg/core/logger/logger.go b/pkg/core/logger/logger.go index e82bfd323b0..b7598f9ef23 100644 --- 
a/pkg/core/logger/logger.go +++ b/pkg/core/logger/logger.go @@ -21,6 +21,7 @@ import ( "github.com/elastic/elastic-agent-libs/logp/configure" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" + "github.com/elastic/elastic-agent/pkg/utils" ) const agentName = "elastic-agent" @@ -136,6 +137,12 @@ func DefaultLoggingConfig() *Config { cfg.Files.Path = paths.Logs() cfg.Files.Name = agentName cfg.Files.MaxSize = 20 * 1024 * 1024 + cfg.Files.Permissions = 0600 // default user only + root, _ := utils.HasRoot() // error ignored + if !root { + // when not running as root, the default changes to include the group + cfg.Files.Permissions = 0660 + } return &cfg } @@ -150,10 +157,16 @@ func MakeInternalFileOutput(cfg *Config) (zapcore.Core, error) { filename := filepath.Join(paths.Home(), DefaultLogDirectory, cfg.Beat) al := zap.NewAtomicLevelAt(cfg.Level.ZapLevel()) internalLevelEnabler = &al // directly persisting struct will panic on accessing unitialized backing pointer + permissions := 0600 // default user only + root, _ := utils.HasRoot() // error ignored + if !root { + // when not running as root, the default changes to include the group + permissions = 0660 + } rotator, err := file.NewFileRotator(filename, file.MaxSizeBytes(defaultCfg.Files.MaxSize), file.MaxBackups(defaultCfg.Files.MaxBackups), - file.Permissions(os.FileMode(defaultCfg.Files.Permissions)), + file.Permissions(os.FileMode(permissions)), file.Interval(defaultCfg.Files.Interval), file.RotateOnStartup(defaultCfg.Files.RotateOnStartup), file.RedirectStderr(defaultCfg.Files.RedirectStderr), diff --git a/pkg/testing/fixture_install.go b/pkg/testing/fixture_install.go index 9ec684a811b..923356779e9 100644 --- a/pkg/testing/fixture_install.go +++ b/pkg/testing/fixture_install.go @@ -57,6 +57,7 @@ type InstallOpts struct { Insecure bool // --insecure NonInteractive bool // --non-interactive ProxyURL string // --proxy-url + 
+ Unprivileged bool // --unprivileged EnrollOpts } @@ -78,6 +79,9 @@ func (i InstallOpts) toCmdArgs() []string { if i.ProxyURL != "" { args = append(args, "--proxy-url="+i.ProxyURL) } + if i.Unprivileged { + args = append(args, "--unprivileged") + } args = append(args, i.EnrollOpts.toCmdArgs()...) @@ -112,7 +116,11 @@ func (f *Fixture) Install(ctx context.Context, installOpts *InstallOpts, opts .. } // we just installed agent, the control socket is at a well-known location - c := client.New(client.WithAddress(paths.ControlSocketPath)) + socketPath := paths.ControlSocketPath + if installOpts.Unprivileged { + socketPath = paths.ControlSocketUnprivilegedPath + } + c := client.New(client.WithAddress(socketPath)) f.setClient(c) f.t.Cleanup(func() { diff --git a/pkg/utils/perm_unix.go b/pkg/utils/perm_unix.go index 5c15c9a5e69..4adcfc178aa 100644 --- a/pkg/utils/perm_unix.go +++ b/pkg/utils/perm_unix.go @@ -11,6 +11,20 @@ import ( "os" ) +// FileOwner is the ownership a file should have. +type FileOwner struct { + UID int + GID int +} + +// CurrentFileOwner returns the executing UID and GID of the current process. +func CurrentFileOwner() FileOwner { + return FileOwner{ + UID: os.Getuid(), + GID: os.Getgid(), + } +} + // HasStrictExecPerms ensures that the path is executable by the owner, cannot be written by anyone other than the // owner of the file and that the owner of the file is the same as the UID or root. func HasStrictExecPerms(path string, uid int) error { diff --git a/pkg/utils/perm_windows.go b/pkg/utils/perm_windows.go index 5fd41d1928a..6b6ac0fd1b8 100644 --- a/pkg/utils/perm_windows.go +++ b/pkg/utils/perm_windows.go @@ -6,6 +6,28 @@ package utils +const ( + // AdministratorSID is the well-known SID for the built-in Administrators group. + AdministratorSID = "S-1-5-32-544" + // SystemSID is the well-known SID for the SYSTEM user (NT AUTHORITY\SYSTEM). + SystemSID = "S-1-5-18" +) + +// FileOwner is the ownership a file should have. 
+type FileOwner struct { + UID string + GID string +} + +// CurrentFileOwner returns the executing UID and GID of the current process. +func CurrentFileOwner() FileOwner { + // TODO(blakerouse): Make this return the current user and group on Windows. + return FileOwner{ + UID: AdministratorSID, + GID: SystemSID, + } +} + // HasStrictExecPerms ensures that the path is executable by the owner and that the owner of the file // is the same as the UID or root. func HasStrictExecPerms(path string, uid int) error { diff --git a/pkg/utils/root_unix.go b/pkg/utils/root_unix.go index cb264cc4bb0..765483bf45c 100644 --- a/pkg/utils/root_unix.go +++ b/pkg/utils/root_unix.go @@ -16,5 +16,5 @@ const ( // HasRoot returns true if the user has root permissions. // Added extra `nil` value to return since the HasRoot for windows will return an error as well func HasRoot() (bool, error) { - return os.Getegid() == 0, nil + return os.Geteuid() == 0, nil } diff --git a/testing/integration/install_unprivileged_test.go b/testing/integration/install_unprivileged_test.go new file mode 100644 index 00000000000..6c260a42de0 --- /dev/null +++ b/testing/integration/install_unprivileged_test.go @@ -0,0 +1,198 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. 
+ +//go:build integration && !windows + +package integration + +import ( + "context" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "syscall" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" + "github.com/elastic/elastic-agent/internal/pkg/agent/install" + atesting "github.com/elastic/elastic-agent/pkg/testing" + "github.com/elastic/elastic-agent/pkg/testing/define" +) + +func TestInstallUnprivilegedWithoutBasePath(t *testing.T) { + define.Require(t, define.Requirements{ + // We require sudo for this test to run + // `elastic-agent install` (even though it will + // be installed as non-root). + Sudo: true, + + // It's not safe to run this test locally as it + // installs Elastic Agent. + Local: false, + + // Only supports Linux at the moment. + OS: []define.OS{ + { + Type: define.Linux, + }, + }, + }) + + // Get path to Elastic Agent executable + fixture, err := define.NewFixture(t, define.Version()) + require.NoError(t, err) + + // Prepare the Elastic Agent so the binary is extracted and ready to use. + err = fixture.Prepare(context.Background()) + require.NoError(t, err) + + // Check that default base path is clean + var defaultBasePath string + switch runtime.GOOS { + case "darwin": + defaultBasePath = `/Library` + case "linux": + defaultBasePath = `/opt` + case "windows": + defaultBasePath = `C:\Program Files` + } + + topPath := filepath.Join(defaultBasePath, "Elastic", "Agent") + err = os.RemoveAll(topPath) + require.NoErrorf(t, err, "failed to remove %q. The test requires this path not to exist.", topPath) + + // Run `elastic-agent install`. We use `--force` to prevent interactive + // execution. 
+ out, err := fixture.Install(context.Background(), &atesting.InstallOpts{Force: true, Unprivileged: true}) + if err != nil { + t.Logf("install output: %s", out) + require.NoError(t, err) + } + + checkInstallUnprivilegedSuccess(t, topPath) +} + +func TestInstallUnprivilegedWithBasePath(t *testing.T) { + define.Require(t, define.Requirements{ + // We require sudo for this test to run + // `elastic-agent install` (even though it will + // be installed as non-root). + Sudo: true, + + // It's not safe to run this test locally as it + // installs Elastic Agent. + Local: false, + + // Only supports Linux at the moment. + OS: []define.OS{ + { + Type: define.Linux, + }, + }, + }) + + // Get path to Elastic Agent executable + fixture, err := define.NewFixture(t, define.Version()) + require.NoError(t, err) + + // Prepare the Elastic Agent so the binary is extracted and ready to use. + err = fixture.Prepare(context.Background()) + require.NoError(t, err) + + // Other test `TestInstallWithBasePath` uses a random directory for the base + // path and that works because its running root. When using a base path the + // base needs to be accessible by the `elastic-agent` user that will be + // executing the process, but is not created yet. Using a base that exists + // and is known to be accessible by standard users, ensures this tests + // works correctly and will not hit a permission issue when spawning the + // elastic-agent service. + var basePath string + switch runtime.GOOS { + case define.Linux: + // default is `/opt` + basePath = `/usr` + default: + t.Fatalf("only Linux is supported by this test; should have been skipped") + } + + // Run `elastic-agent install`. We use `--force` to prevent interactive + // execution. 
+ out, err := fixture.Install(context.Background(), &atesting.InstallOpts{ + BasePath: basePath, + Force: true, + Unprivileged: true, + }) + if err != nil { + t.Logf("install output: %s", out) + require.NoError(t, err) + } + + // Check that Agent was installed in the custom base path + topPath := filepath.Join(basePath, "Elastic", "Agent") + checkInstallUnprivilegedSuccess(t, topPath) +} + +func checkInstallUnprivilegedSuccess(t *testing.T, topPath string) { + t.Helper() + + // Check that the elastic-agent user/group exist. + uid, err := install.FindUID("elastic-agent") + require.NoError(t, err) + gid, err := install.FindGID("elastic-agent") + require.NoError(t, err) + + // Path should now exist as well as be owned by the correct user/group. + info, err := os.Stat(topPath) + require.NoError(t, err) + fs, ok := info.Sys().(*syscall.Stat_t) + require.True(t, ok) + require.Equalf(t, fs.Uid, uint32(uid), "%s not owned by elastic-agent user", topPath) + require.Equalf(t, fs.Gid, uint32(gid), "%s not owned by elastic-agent group", topPath) + + // Check that a few expected installed files are present + installedBinPath := filepath.Join(topPath, exeOnWindows("elastic-agent")) + installedDataPath := filepath.Join(topPath, "data") + installMarkerPath := filepath.Join(topPath, ".installed") + _, err = os.Stat(installedBinPath) + require.NoError(t, err) + _, err = os.Stat(installedDataPath) + require.NoError(t, err) + _, err = os.Stat(installMarkerPath) + require.NoError(t, err) + + // Check that the socket is created with the correct permissions. 
+ socketPath := strings.TrimPrefix(paths.ControlSocketUnprivilegedPath, "unix://") + require.Eventuallyf(t, func() bool { + _, err = os.Stat(socketPath) + return err == nil + }, 3*time.Minute, 1*time.Second, "%s socket never created: %s", socketPath, err) + info, err = os.Stat(socketPath) + require.NoError(t, err) + fs, ok = info.Sys().(*syscall.Stat_t) + require.True(t, ok) + require.Equalf(t, fs.Uid, uint32(uid), "%s not owned by elastic-agent user", socketPath) + require.Equalf(t, fs.Gid, uint32(gid), "%s not owned by elastic-agent group", socketPath) + + // Executing `elastic-agent status` as the `elastic-agent` user should work. + var output []byte + require.Eventuallyf(t, func() bool { + cmd := exec.Command("sudo", "-u", "elastic-agent", "elastic-agent", "status") + output, err = cmd.CombinedOutput() + return err == nil + }, 3*time.Minute, 1*time.Second, "status never successful: %s (output: %s)", err, output) + + // Executing `elastic-agent status` as the original user should fail, because that + // user is not in the 'elastic-agent' group. 
+ originalUser := os.Getenv("USER") + if originalUser != "" { + cmd := exec.Command("sudo", "-u", originalUser, "elastic-agent", "status") + output, err := cmd.CombinedOutput() + require.Error(t, err, "running elastic-agent status should have failed: %s", output) + } +} From 029a65c2ab0cc0873b1e84d3ee8cd30183d43b00 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Fri, 10 Nov 2023 17:07:37 +0100 Subject: [PATCH 05/12] [CI] Buildkite migration for Windows 10/11 (#3700) * Windows 10/11 * Windows 10/11 * Pipeline grouping * Removed Extended window step from Jenkinsfile * Reduced the ubuntu ARM machine * Run extended windows tests on PR basis * Run extended windows tests on PR basis --- .buildkite/pipeline.yml | 221 +++++++++++++++++++++++----------------- .ci/Jenkinsfile | 62 +---------- 2 files changed, 130 insertions(+), 153 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index fc1ff11871a..ebbefb80898 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -4,97 +4,136 @@ env: VAULT_PATH: "kv/ci-shared/observability-ingest/cloud/gcp" DOCKER_REGISTRY: "docker.elastic.co" steps: - - label: "Unit tests - Ubuntu 22.04" - key: "unit-tests-2204" - command: ".buildkite/scripts/steps/unit-tests.sh" - artifact_paths: - - "build/TEST-**" - - "build/diagnostics/*" - - "coverage.out" - agents: - provider: "gcp" - image: "family/core-ubuntu-2204" - retry: - manual: - allowed: true + - group: "Unit tests" + key: "unit-tests" + steps: + - label: "Unit tests - Ubuntu 22.04" + key: "unit-tests-2204" + command: ".buildkite/scripts/steps/unit-tests.sh" + artifact_paths: + - "build/TEST-**" + - "build/diagnostics/*" + - "coverage.out" + agents: + provider: "gcp" + image: "family/core-ubuntu-2204" + retry: + manual: + allowed: true - - label: "Unit tests - Ubuntu 22.04 ARM64" - key: "unit-tests-2204-arm64" - command: ".buildkite/scripts/steps/unit-tests.sh" - artifact_paths: - - "build/TEST-**" - - "build/diagnostics/*" - - "coverage.out" - 
agents: - provider: "aws" - imagePrefix: "core-ubuntu-2204-aarch64" - diskSizeGb: 200 - instanceType: "m6g.4xlarge" - retry: - manual: - allowed: true + - label: "Unit tests - Ubuntu 22.04 ARM64" + key: "unit-tests-2204-arm64" + command: ".buildkite/scripts/steps/unit-tests.sh" + artifact_paths: + - "build/TEST-**" + - "build/diagnostics/*" + - "coverage.out" + agents: + provider: "aws" + imagePrefix: "core-ubuntu-2204-aarch64" + diskSizeGb: 200 + instanceType: "m6g.xlarge" + retry: + manual: + allowed: true - - label: "Unit tests - Windows 2022" - key: "unit-tests-win2022" - command: ".\\.buildkite\\scripts\\steps\\unit-tests.ps1" - artifact_paths: - - "build/TEST-**" - - "build/diagnostics/*" - - "coverage.out" - agents: - provider: "gcp" - image: "family/core-windows-2022" - machine_type: "n2-standard-8" - disk_size: 200 - disk_type: "pd-ssd" - retry: - manual: - allowed: true + - label: "Unit tests - Windows 2022" + key: "unit-tests-win2022" + command: ".\\.buildkite\\scripts\\steps\\unit-tests.ps1" + artifact_paths: + - "build/TEST-**" + - "build/diagnostics/*" + - "coverage.out" + agents: + provider: "gcp" + image: "family/core-windows-2022" + machine_type: "n2-standard-8" + disk_size: 200 + disk_type: "pd-ssd" + retry: + manual: + allowed: true - - label: "Unit tests - Windows 2016" - key: "unit-tests-win2016" - command: ".\\.buildkite\\scripts\\steps\\unit-tests.ps1" - artifact_paths: - - "build/TEST-**" - - "build/diagnostics/*" - - "coverage.out" - agents: - provider: "gcp" - image: "family/core-windows-2016" - machine_type: "n2-standard-8" - disk_size: 200 - disk_type: "pd-ssd" - retry: - manual: - allowed: true + - label: "Unit tests - Windows 2016" + key: "unit-tests-win2016" + command: ".\\.buildkite\\scripts\\steps\\unit-tests.ps1" + artifact_paths: + - "build/TEST-**" + - "build/diagnostics/*" + - "coverage.out" + agents: + provider: "gcp" + image: "family/core-windows-2016" + machine_type: "n2-standard-8" + disk_size: 200 + disk_type: "pd-ssd" + 
retry: + manual: + allowed: true - - label: "Unit tests - MacOS 13 ARM" - key: "unit-tests-macos-13-arm" - command: ".buildkite/scripts/steps/unit-tests.sh" - artifact_paths: - - "build/TEST-**" - - "build/diagnostics/*" - - "coverage.out" - agents: - provider: orka - imagePrefix: generic-13-ventura-arm - retry: - manual: - allowed: true + - label: "Unit tests - MacOS 13 ARM" + key: "unit-tests-macos-13-arm" + command: ".buildkite/scripts/steps/unit-tests.sh" + artifact_paths: + - "build/TEST-**" + - "build/diagnostics/*" + - "coverage.out" + agents: + provider: orka + imagePrefix: generic-13-ventura-arm + retry: + manual: + allowed: true - - label: "Unit tests - MacOS 13" - key: "unit-tests-macos-13" - command: ".buildkite/scripts/steps/unit-tests.sh" - artifact_paths: - - "build/TEST-**" - - "build/diagnostics/*" - - "coverage.out" - agents: - provider: orka - imagePrefix: generic-13-ventura-x64 - retry: - manual: - allowed: true + - label: "Unit tests - MacOS 13" + key: "unit-tests-macos-13" + command: ".buildkite/scripts/steps/unit-tests.sh" + artifact_paths: + - "build/TEST-**" + - "build/diagnostics/*" + - "coverage.out" + agents: + provider: orka + imagePrefix: generic-13-ventura-x64 + retry: + manual: + allowed: true + + - group: "Desktop Windows tests" + key: "extended-windows" + steps: + - label: "Unit tests - Windows 10" + key: "unit-tests-win10" + command: ".\\.buildkite\\scripts\\steps\\unit-tests.ps1" + artifact_paths: + - "build/TEST-**" + - "build/diagnostics/*" + - "coverage.out" + agents: + provider: "gcp" + # TODO create own image + image: "family/endpoint-windows-10-tester-rel" + machine_type: "n2-standard-8" + disk_type: "pd-ssd" + retry: + manual: + allowed: true + - label: "Unit tests - Windows 11" + key: "unit-tests-win11" + command: ".\\.buildkite\\scripts\\steps\\unit-tests.ps1" + artifact_paths: + - "build/TEST-**" + - "build/diagnostics/*" + - "coverage.out" + agents: + provider: "gcp" + # TODO create own image + image: 
"family/endpoint-windows-11-tester-rel" + machine_type: "n2-standard-8" + disk_type: "pd-ssd" + retry: + manual: + allowed: true - label: "Merge coverage reports" key: "merge-coverage" @@ -108,18 +147,16 @@ steps: unit-tests-win2022 unit-tests-macos-13 unit-tests-macos-13-arm + unit-tests-win10 + unit-tests-win11 " artifact_paths: - "build/TEST-**" agents: image: "golang:1.20.10" depends_on: - - unit-tests-2204 - - unit-tests-2204-arm64 - - unit-tests-win2022 - - unit-tests-win2016 - - unit-tests-macos-13 - - unit-tests-macos-13-arm + - unit-tests + - extended-windows allow_dependency_failure: true - group: "K8s tests" diff --git a/.ci/Jenkinsfile b/.ci/Jenkinsfile index 870e9684742..23f3ca1e707 100644 --- a/.ci/Jenkinsfile +++ b/.ci/Jenkinsfile @@ -186,67 +186,7 @@ pipeline { } } } - } - stage('extended windows') { - when { - // Always when running builds on branches/tags - // Enable if extended windows support related changes. - beforeAgent true - anyOf { - not { changeRequest() } - expression { return isExtendedWindowsEnabled() && env.ONLY_DOCS == "false"} - } - } - failFast false - matrix { - agent {label "${PLATFORM} && windows-immutable"} - options { skipDefaultCheckout() } - axes { - axis { - name 'PLATFORM' - values 'windows-8', 'windows-10', 'windows-11' - } - } - stages { - stage('build'){ - options { skipDefaultCheckout() } - steps { - withGithubNotify(context: "Build-${PLATFORM}") { - deleteDir() - unstashV2(name: 'source', bucket: "${JOB_GCS_BUCKET}", credentialsId: "${JOB_GCS_CREDENTIALS}") - withMageEnv(){ - dir("${BASE_DIR}"){ - cmd(label: 'Go build', script: 'mage build') - } - } - } - } - } - stage('Test') { - options { skipDefaultCheckout() } - steps { - withGithubNotify(context: "Test-${PLATFORM}") { - withMageEnv(){ - dir("${BASE_DIR}"){ - withEnv(["RACE_DETECTOR=true", "TEST_COVERAGE=${isCodeCoverageEnabled()}"]) { - cmd(label: 'Go unitTest', script: 'mage unitTest') - } - } - } - } - } - post { - always { - junit(allowEmptyResults: true, 
keepLongStdio: true, testResults: "${BASE_DIR}/build/TEST-*.xml") - whenTrue(isCodeCoverageEnabled()) { - coverageReport(baseDir: "**/build", reportFiles: 'TEST-go-unit.html', coverageFiles: 'TEST-go-unit-cov.xml') - } - } - } - } - } - } - } + } } post { cleanup { From c00eddf8e7bef973478eda5523d0c383895b809f Mon Sep 17 00:00:00 2001 From: Pierre HILBERT Date: Fri, 10 Nov 2023 17:33:29 +0100 Subject: [PATCH 06/12] Remove duplicate tags (#3740) * Remove duplicate tags * Adding changelog entry --- .../fragments/1697662209-duplicate-tags.yaml | 32 +++++++++++++++++++ internal/pkg/agent/cmd/enroll_cmd.go | 7 +++- internal/pkg/agent/cmd/enroll_cmd_test.go | 7 ++-- 3 files changed, 43 insertions(+), 3 deletions(-) create mode 100644 changelog/fragments/1697662209-duplicate-tags.yaml diff --git a/changelog/fragments/1697662209-duplicate-tags.yaml b/changelog/fragments/1697662209-duplicate-tags.yaml new file mode 100644 index 00000000000..710cc775366 --- /dev/null +++ b/changelog/fragments/1697662209-duplicate-tags.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: enhancement + +# Change summary; a 80ish characters long description of the change. +summary: Remove duplicated tags when specified during the Agent enrollment. + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. 
+# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +description: "" + +# Affected component; a word indicating the component this changeset affects. +component: agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/3740 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +issue: https://github.com/elastic/elastic-agent/issues/858 diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index 055807ef16c..37540ae5249 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -1066,10 +1066,15 @@ func expBackoffWithContext(ctx context.Context, init, max time.Duration) backoff func cleanTags(tags []string) []string { var r []string + // Create a map to store unique elements + seen := make(map[string]bool) for _, str := range tags { tag := strings.TrimSpace(str) if tag != "" { - r = append(r, tag) + if _, ok := seen[tag]; !ok { + seen[tag] = true + r = append(r, tag) + } } } return r diff --git a/internal/pkg/agent/cmd/enroll_cmd_test.go b/internal/pkg/agent/cmd/enroll_cmd_test.go index 189ad7b6563..0b1e7d5d4ee 100644 --- a/internal/pkg/agent/cmd/enroll_cmd_test.go +++ b/internal/pkg/agent/cmd/enroll_cmd_test.go @@ -351,9 +351,9 @@ func TestValidateArgs(t *testing.T) { require.Contains(t, cleanedTags, "production") }) - t.Run("comma separated tags are cleaned", func(t *testing.T) { + t.Run("comma separated tags and duplicated tags are cleaned", 
func(t *testing.T) { cmd := newEnrollCommandWithArgs([]string{}, streams) - err := cmd.Flags().Set("tag", "windows, production") + err := cmd.Flags().Set("tag", "windows, production, windows") require.NoError(t, err) args := buildEnrollmentFlags(cmd, url, enrolmentToken) require.Contains(t, args, "--tag") @@ -362,6 +362,9 @@ func TestValidateArgs(t *testing.T) { cleanedTags := cleanTags(args) require.Contains(t, cleanedTags, "windows") require.Contains(t, cleanedTags, "production") + // Validate that we remove the duplicates + require.Equal(t, len(args), 10) + require.Equal(t, len(cleanedTags), 7) }) t.Run("valid tag and empty tag", func(t *testing.T) { From 3155ea9db05bc1ceacbbccc9d356f5922fa4cf87 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Fri, 10 Nov 2023 22:10:59 +0100 Subject: [PATCH 07/12] [CI] run MacOS 13 ARM unit tests only on main branch (#3742) --- .buildkite/pipeline.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index ebbefb80898..489a3202ee3 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -71,9 +71,11 @@ steps: manual: allowed: true + # Runs inly on the main branch - label: "Unit tests - MacOS 13 ARM" key: "unit-tests-macos-13-arm" command: ".buildkite/scripts/steps/unit-tests.sh" + branches: main artifact_paths: - "build/TEST-**" - "build/diagnostics/*" @@ -146,7 +148,6 @@ steps: unit-tests-win2016 unit-tests-win2022 unit-tests-macos-13 - unit-tests-macos-13-arm unit-tests-win10 unit-tests-win11 " From bb5de530566a18813630b831bf9c4862fbb89777 Mon Sep 17 00:00:00 2001 From: Andrew Gizas Date: Mon, 13 Nov 2023 03:18:55 -0600 Subject: [PATCH 08/12] Adding config variables to leader_election provider (#3625) * Adding config variables to leader election process * Adding Debug message info * Adding Changelog fragment --- .../1699010487-leaderelectionconfig.yaml | 32 +++++++++++++++++++ .../kubernetesleaderelection/config.go | 8 +++++ 
.../kubernetes_leaderelection.go | 7 ++-- 3 files changed, 44 insertions(+), 3 deletions(-) create mode 100644 changelog/fragments/1699010487-leaderelectionconfig.yaml diff --git a/changelog/fragments/1699010487-leaderelectionconfig.yaml b/changelog/fragments/1699010487-leaderelectionconfig.yaml new file mode 100644 index 00000000000..e965af360df --- /dev/null +++ b/changelog/fragments/1699010487-leaderelectionconfig.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: enhancement + +# Change summary; a 80ish characters long description of the change. +summary: Added Kubernetes leader_election provider configuration parameters + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. 
+#pr: https://github.com/owner/repo/1234 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/internal/pkg/composable/providers/kubernetesleaderelection/config.go b/internal/pkg/composable/providers/kubernetesleaderelection/config.go index 7ccc2f9a799..7988fcd6fc2 100644 --- a/internal/pkg/composable/providers/kubernetesleaderelection/config.go +++ b/internal/pkg/composable/providers/kubernetesleaderelection/config.go @@ -13,10 +13,18 @@ type Config struct { // Name of the leaderelection lease LeaderLease string `config:"leader_lease"` + //Parameters to configure election process + LeaseDuration int `config:"leader_leaseduration"` + RenewDeadline int `config:"leader_renewdeadline"` + RetryPeriod int `config:"leader_retryperiod"` + KubeClientOptions kubernetes.KubeClientOptions `config:"kube_client_options"` } // InitDefaults initializes the default values for the config. 
func (c *Config) InitDefaults() { c.LeaderLease = "elastic-agent-cluster-leader" + c.LeaseDuration = 15 + c.RenewDeadline = 10 + c.RetryPeriod = 2 } diff --git a/internal/pkg/composable/providers/kubernetesleaderelection/kubernetes_leaderelection.go b/internal/pkg/composable/providers/kubernetesleaderelection/kubernetes_leaderelection.go index b1388005ab7..c32b57d78c8 100644 --- a/internal/pkg/composable/providers/kubernetesleaderelection/kubernetes_leaderelection.go +++ b/internal/pkg/composable/providers/kubernetesleaderelection/kubernetes_leaderelection.go @@ -83,12 +83,13 @@ func (p *contextProvider) Run(ctx context.Context, comm corecomp.ContextProvider }, }, ReleaseOnCancel: true, - LeaseDuration: 15 * time.Second, - RenewDeadline: 10 * time.Second, - RetryPeriod: 2 * time.Second, + LeaseDuration: time.Duration(p.config.LeaseDuration) * time.Second, + RenewDeadline: time.Duration(p.config.RenewDeadline) * time.Second, + RetryPeriod: time.Duration(p.config.RetryPeriod) * time.Second, Callbacks: leaderelection.LeaderCallbacks{ OnStartedLeading: func(ctx context.Context) { p.logger.Debugf("leader election lock GAINED, id %v", id) + p.logger.Debugf("leader configuration timings: LeaseDuration: %v , RenewDeadline: %v, RetryPeriod: %v", p.leaderElection.LeaseDuration, p.leaderElection.RenewDeadline, p.leaderElection.RetryPeriod) p.startLeading(comm) }, OnStoppedLeading: func() { From 755c8cbbba658c47be553519e6d4ce30beb96cf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constan=C3=A7a=20Manteigas?= <113898685+constanca-m@users.noreply.github.com> Date: Mon, 13 Nov 2023 18:15:01 +0100 Subject: [PATCH 09/12] [EA standalone] Add state_namespace data stream by default to the manifests (#3750) * Add state_namespace data stream by default * Update package version to 1.52.0 * Update standalone --- ...-agent-standalone-daemonset-configmap.yaml | 23 +++++++++++++++---- ...nt-standalone-ksm-daemonset-configmap.yaml | 8 +++---- ...-standalone-ksm-statefulset-configmap.yaml | 12 
+++++++++- ...nt-standalone-ksm-daemonset-configmap.yaml | 8 +++---- ...-standalone-ksm-statefulset-configmap.yaml | 12 +++++++++- .../elastic-agent-standalone-kubernetes.yaml | 23 +++++++++++++++---- ...-agent-standalone-daemonset-configmap.yaml | 23 +++++++++++++++---- ...nt-standalone-ksm-daemonset-configmap.yaml | 8 +++---- ...-standalone-ksm-statefulset-configmap.yaml | 12 +++++++++- 9 files changed, 102 insertions(+), 27 deletions(-) diff --git a/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-daemonset-configmap.yaml b/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-daemonset-configmap.yaml index 3b8dd9464a7..9e9517435fc 100644 --- a/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-daemonset-configmap.yaml +++ b/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-daemonset-configmap.yaml @@ -35,7 +35,7 @@ data: meta: package: name: kubernetes - version: 1.29.2 + version: 1.52.0 data_stream: namespace: default streams: @@ -132,6 +132,21 @@ data: # bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token # ssl.certificate_authorities: # - /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt + - data_stream: + dataset: kubernetes.state_namespace + type: metrics + metricsets: + - state_namespace + add_metadata: true + hosts: + - 'kube-state-metrics:8080' + period: 10s + # Openshift: + # if to access 'kube-state-metrics' are used third party tools, like kube-rbac-proxy or similar, that perform RBAC authorization + # and/or tls termination, then configuration below should be considered: + # bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + # ssl.certificate_authorities: + # - /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt - data_stream: dataset: kubernetes.state_node type: metrics @@ 
-337,7 +352,7 @@ data: meta: package: name: kubernetes - version: 1.29.2 + version: 1.52.0 data_stream: namespace: default streams: @@ -365,7 +380,7 @@ data: meta: package: name: kubernetes - version: 1.29.2 + version: 1.52.0 data_stream: namespace: default streams: @@ -506,7 +521,7 @@ data: meta: package: name: kubernetes - version: 1.29.2 + version: 1.52.0 data_stream: namespace: default streams: diff --git a/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-ksm-daemonset-configmap.yaml b/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-ksm-daemonset-configmap.yaml index 4c2d09c9a25..8abe95421cb 100644 --- a/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-ksm-daemonset-configmap.yaml +++ b/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-ksm-daemonset-configmap.yaml @@ -36,7 +36,7 @@ data: meta: package: name: kubernetes - version: 1.42.0 + version: 1.52.0 data_stream: namespace: default streams: @@ -127,7 +127,7 @@ data: meta: package: name: kubernetes - version: 1.42.0 + version: 1.52.0 data_stream: namespace: default streams: @@ -152,7 +152,7 @@ data: meta: package: name: kubernetes - version: 1.42.0 + version: 1.52.0 data_stream: namespace: default streams: @@ -293,7 +293,7 @@ data: meta: package: name: kubernetes - version: 1.42.0 + version: 1.52.0 data_stream: namespace: default streams: diff --git a/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-ksm-statefulset-configmap.yaml b/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-ksm-statefulset-configmap.yaml index 2a97b864565..cf913c8ea7c 100644 --- 
a/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-ksm-statefulset-configmap.yaml +++ b/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-ksm-statefulset-configmap.yaml @@ -80,6 +80,16 @@ data: - 'localhost:8080' period: 10s bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + - data_stream: + type: metrics + dataset: kubernetes.state_namespace + metricsets: + - state_namespace + add_metadata: true + hosts: + - 'localhost:8080' + period: 10s + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - data_stream: type: metrics dataset: kubernetes.state_node @@ -173,4 +183,4 @@ data: meta: package: name: kubernetes - version: 1.42.0 + version: 1.52.0 diff --git a/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-standalone/base/elastic-agent-standalone-ksm-daemonset-configmap.yaml b/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-standalone/base/elastic-agent-standalone-ksm-daemonset-configmap.yaml index 4c2d09c9a25..8abe95421cb 100644 --- a/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-standalone/base/elastic-agent-standalone-ksm-daemonset-configmap.yaml +++ b/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-standalone/base/elastic-agent-standalone-ksm-daemonset-configmap.yaml @@ -36,7 +36,7 @@ data: meta: package: name: kubernetes - version: 1.42.0 + version: 1.52.0 data_stream: namespace: default streams: @@ -127,7 +127,7 @@ data: meta: package: name: kubernetes - version: 1.42.0 + version: 1.52.0 data_stream: namespace: default streams: @@ -152,7 +152,7 @@ data: meta: package: name: kubernetes - version: 1.42.0 + version: 1.52.0 data_stream: namespace: default streams: @@ -293,7 +293,7 @@ data: meta: package: name: kubernetes - version: 1.42.0 + version: 1.52.0 data_stream: namespace: default streams: diff --git 
a/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-standalone/base/elastic-agent-standalone-ksm-statefulset-configmap.yaml b/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-standalone/base/elastic-agent-standalone-ksm-statefulset-configmap.yaml index 2a97b864565..cf913c8ea7c 100644 --- a/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-standalone/base/elastic-agent-standalone-ksm-statefulset-configmap.yaml +++ b/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-standalone/base/elastic-agent-standalone-ksm-statefulset-configmap.yaml @@ -80,6 +80,16 @@ data: - 'localhost:8080' period: 10s bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + - data_stream: + type: metrics + dataset: kubernetes.state_namespace + metricsets: + - state_namespace + add_metadata: true + hosts: + - 'localhost:8080' + period: 10s + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - data_stream: type: metrics dataset: kubernetes.state_node @@ -173,4 +183,4 @@ data: meta: package: name: kubernetes - version: 1.42.0 + version: 1.52.0 diff --git a/deploy/kubernetes/elastic-agent-standalone-kubernetes.yaml b/deploy/kubernetes/elastic-agent-standalone-kubernetes.yaml index 125936a37b3..1aee9d7a74c 100644 --- a/deploy/kubernetes/elastic-agent-standalone-kubernetes.yaml +++ b/deploy/kubernetes/elastic-agent-standalone-kubernetes.yaml @@ -35,7 +35,7 @@ data: meta: package: name: kubernetes - version: 1.29.2 + version: 1.52.0 data_stream: namespace: default streams: @@ -132,6 +132,21 @@ data: # bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token # ssl.certificate_authorities: # - /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt + - data_stream: + dataset: kubernetes.state_namespace + type: metrics + metricsets: + - state_namespace + add_metadata: true + hosts: + - 'kube-state-metrics:8080' + period: 10s + # Openshift: + # if to access 
'kube-state-metrics' are used third party tools, like kube-rbac-proxy or similar, that perform RBAC authorization + # and/or tls termination, then configuration below should be considered: + # bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + # ssl.certificate_authorities: + # - /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt - data_stream: dataset: kubernetes.state_node type: metrics @@ -337,7 +352,7 @@ data: meta: package: name: kubernetes - version: 1.29.2 + version: 1.52.0 data_stream: namespace: default streams: @@ -365,7 +380,7 @@ data: meta: package: name: kubernetes - version: 1.29.2 + version: 1.52.0 data_stream: namespace: default streams: @@ -506,7 +521,7 @@ data: meta: package: name: kubernetes - version: 1.29.2 + version: 1.52.0 data_stream: namespace: default streams: diff --git a/deploy/kubernetes/elastic-agent-standalone/elastic-agent-standalone-daemonset-configmap.yaml b/deploy/kubernetes/elastic-agent-standalone/elastic-agent-standalone-daemonset-configmap.yaml index 3b8dd9464a7..9e9517435fc 100644 --- a/deploy/kubernetes/elastic-agent-standalone/elastic-agent-standalone-daemonset-configmap.yaml +++ b/deploy/kubernetes/elastic-agent-standalone/elastic-agent-standalone-daemonset-configmap.yaml @@ -35,7 +35,7 @@ data: meta: package: name: kubernetes - version: 1.29.2 + version: 1.52.0 data_stream: namespace: default streams: @@ -132,6 +132,21 @@ data: # bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token # ssl.certificate_authorities: # - /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt + - data_stream: + dataset: kubernetes.state_namespace + type: metrics + metricsets: + - state_namespace + add_metadata: true + hosts: + - 'kube-state-metrics:8080' + period: 10s + # Openshift: + # if to access 'kube-state-metrics' are used third party tools, like kube-rbac-proxy or similar, that perform RBAC authorization + # and/or tls termination, then configuration below should be considered: + # 
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + # ssl.certificate_authorities: + # - /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt - data_stream: dataset: kubernetes.state_node type: metrics @@ -337,7 +352,7 @@ data: meta: package: name: kubernetes - version: 1.29.2 + version: 1.52.0 data_stream: namespace: default streams: @@ -365,7 +380,7 @@ data: meta: package: name: kubernetes - version: 1.29.2 + version: 1.52.0 data_stream: namespace: default streams: @@ -506,7 +521,7 @@ data: meta: package: name: kubernetes - version: 1.29.2 + version: 1.52.0 data_stream: namespace: default streams: diff --git a/deploy/kubernetes/elastic-agent-standalone/elastic-agent-standalone-ksm-daemonset-configmap.yaml b/deploy/kubernetes/elastic-agent-standalone/elastic-agent-standalone-ksm-daemonset-configmap.yaml index 4c2d09c9a25..8abe95421cb 100644 --- a/deploy/kubernetes/elastic-agent-standalone/elastic-agent-standalone-ksm-daemonset-configmap.yaml +++ b/deploy/kubernetes/elastic-agent-standalone/elastic-agent-standalone-ksm-daemonset-configmap.yaml @@ -36,7 +36,7 @@ data: meta: package: name: kubernetes - version: 1.42.0 + version: 1.52.0 data_stream: namespace: default streams: @@ -127,7 +127,7 @@ data: meta: package: name: kubernetes - version: 1.42.0 + version: 1.52.0 data_stream: namespace: default streams: @@ -152,7 +152,7 @@ data: meta: package: name: kubernetes - version: 1.42.0 + version: 1.52.0 data_stream: namespace: default streams: @@ -293,7 +293,7 @@ data: meta: package: name: kubernetes - version: 1.42.0 + version: 1.52.0 data_stream: namespace: default streams: diff --git a/deploy/kubernetes/elastic-agent-standalone/elastic-agent-standalone-ksm-statefulset-configmap.yaml b/deploy/kubernetes/elastic-agent-standalone/elastic-agent-standalone-ksm-statefulset-configmap.yaml index 2a97b864565..cf913c8ea7c 100644 --- a/deploy/kubernetes/elastic-agent-standalone/elastic-agent-standalone-ksm-statefulset-configmap.yaml +++ 
b/deploy/kubernetes/elastic-agent-standalone/elastic-agent-standalone-ksm-statefulset-configmap.yaml @@ -80,6 +80,16 @@ data: - 'localhost:8080' period: 10s bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + - data_stream: + type: metrics + dataset: kubernetes.state_namespace + metricsets: + - state_namespace + add_metadata: true + hosts: + - 'localhost:8080' + period: 10s + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - data_stream: type: metrics dataset: kubernetes.state_node @@ -173,4 +183,4 @@ data: meta: package: name: kubernetes - version: 1.42.0 + version: 1.52.0 From 697a1bc677e7939d606743ae141493a7e7406ab0 Mon Sep 17 00:00:00 2001 From: Davide Girardi <1390902+girodav@users.noreply.github.com> Date: Tue, 14 Nov 2023 08:26:35 +0000 Subject: [PATCH 10/12] Remove assetbeat as dependency (#3739) --- dev-tools/mage/manifest/manifest.go | 1 - magefile.go | 2 -- 2 files changed, 3 deletions(-) diff --git a/dev-tools/mage/manifest/manifest.go b/dev-tools/mage/manifest/manifest.go index a2c377d600c..62f82eb2c87 100644 --- a/dev-tools/mage/manifest/manifest.go +++ b/dev-tools/mage/manifest/manifest.go @@ -87,7 +87,6 @@ func DownloadComponentsFromManifest(manifest string, platforms []string, platfor "beats": {"auditbeat", "filebeat", "heartbeat", "metricbeat", "osquerybeat", "packetbeat"}, "cloud-defend": {"cloud-defend"}, "cloudbeat": {"cloudbeat"}, - "assetbeat": {"assetbeat"}, "elastic-agent-shipper": {"elastic-agent-shipper"}, "endpoint-dev": {"endpoint-security"}, "fleet-server": {"fleet-server"}, diff --git a/magefile.go b/magefile.go index 23394850ee3..892b5d65560 100644 --- a/magefile.go +++ b/magefile.go @@ -929,7 +929,6 @@ func packageAgent(platforms []string, packagingFn func()) { // https://artifacts-snapshot.elastic.co/endpoint-dev/latest/8.11.0-SNAPSHOT.json // https://artifacts-snapshot.elastic.co/fleet-server/latest/8.11.0-SNAPSHOT.json // 
https://artifacts-snapshot.elastic.co/prodfiler/latest/8.11.0-SNAPSHOT.json - // https://artifacts-snapshot.elastic.co/assetbeat/latest/8.11.0-SNAPSHOT.json externalBinaries := map[string]string{ "auditbeat": "beats", "filebeat": "beats", @@ -945,7 +944,6 @@ func packageAgent(platforms []string, packagingFn func()) { "pf-elastic-collector": "prodfiler", "pf-elastic-symbolizer": "prodfiler", "pf-host-agent": "prodfiler", - "assetbeat": "assetbeat", // only supporting linux/amd64 or linux/arm64 } // Only log fatal logs for logs produced using logrus. This is the global logger From b272a93bcf47105daa424e5ee8139c12c3234369 Mon Sep 17 00:00:00 2001 From: Blake Rouse Date: Tue, 14 Nov 2023 05:04:18 -0500 Subject: [PATCH 11/12] Switch to CFT region and add more robust tracking and cleanup of stacks. (#3701) * Switch to CFT region and add more robust tracking and cleanup of stacks. * Fix tests. * Adjust integration tests. * Fix lint in serverless provider. * Fix serverless. * Fix comment and typo. * Fix serverless. * More serverless fixes. * Fix check loop in serverless. * Fix lint. * Code review fixes. * Fix a few missed error messages. 
--- .buildkite/hooks/pre-command | 5 +- .buildkite/hooks/pre-exit | 6 +- .buildkite/pipeline.yml | 4 - magefile.go | 9 +- pkg/testing/ess/config.go | 7 +- pkg/testing/ess/provisioner.go | 149 +++++++-------- pkg/testing/ess/serverless.go | 2 +- pkg/testing/ess/serverless_provision.go | 236 +++++++++++++----------- pkg/testing/ess/serverless_test.go | 29 ++- pkg/testing/runner/provisioner.go | 16 +- pkg/testing/runner/runner.go | 185 +++++++++++++------ pkg/testing/runner/runner_test.go | 48 +++-- 12 files changed, 386 insertions(+), 310 deletions(-) diff --git a/.buildkite/hooks/pre-command b/.buildkite/hooks/pre-command index 292aa6918c0..c8a44505148 100755 --- a/.buildkite/hooks/pre-command +++ b/.buildkite/hooks/pre-command @@ -17,8 +17,7 @@ DOCKER_REGISTRY="docker.elastic.co" DOCKER_REGISTRY_SECRET_PATH="kv/ci-shared/platform-ingest/docker_registry_prod" CI_DRA_ROLE_PATH="kv/ci-shared/release/dra-role" CI_GCP_OBS_PATH="kv/ci-shared/observability-ingest/cloud/gcp" -# CI_AGENT_QA_OBS_PATH="kv/ci-shared/observability-ingest/elastic-agent-ess-qa" -CI_ESS_STAGING_PATH="kv/ci-shared/platform-ingest/platform-ingest-ec-staging" +CI_ESS_PATH="kv/ci-shared/platform-ingest/platform-ingest-ec-prod" CI_DRA_ROLE_PATH="kv/ci-shared/release/dra-role" @@ -55,7 +54,7 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-agent" && "$BUILDKITE_STEP_KEY" == export TEST_INTEG_AUTH_GCP_SERVICE_TOKEN_FILE=$(realpath ./gcp.json) # ESS credentials - export API_KEY_TOKEN=$(vault kv get -field apiKey ${CI_ESS_STAGING_PATH}) + export API_KEY_TOKEN=$(vault kv get -field apiKey ${CI_ESS_PATH}) echo ${API_KEY_TOKEN} > ./apiKey export TEST_INTEG_AUTH_ESS_APIKEY_FILE=$(realpath ./apiKey) fi diff --git a/.buildkite/hooks/pre-exit b/.buildkite/hooks/pre-exit index 4d0da50cf73..213f51aff7b 100755 --- a/.buildkite/hooks/pre-exit +++ b/.buildkite/hooks/pre-exit @@ -10,7 +10,11 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-agent" && "$BUILDKITE_STEP_KEY" == # Perform cleanup of integration tests 
resources echo "--- Cleaning up integration test resources" - TEST_INTEG_AUTH_ESS_REGION=us-east-1 SNAPSHOT=true mage integration:clean + if [[ "$BUILDKITE_STEP_KEY" == "serverless-integration-tests" ]]; then + STACK_PROVISIONER=serverless SNAPSHOT=true mage integration:clean + else + SNAPSHOT=true mage integration:clean + fi fi if [ -n "$GOOGLE_APPLICATION_CREDENTIALS" ]; then diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 489a3202ee3..866dc91b367 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -201,8 +201,6 @@ steps: - label: "Serverless integration test" key: "serverless-integration-tests" - env: - TEST_INTEG_AUTH_ESS_REGION: us-east-1 command: ".buildkite/scripts/steps/integration_tests.sh serverless integration:single TestLogIngestionFleetManaged" #right now, run a single test in serverless mode as a sort of smoke test, instead of re-running the entire suite artifact_paths: - "build/TEST-**" @@ -213,8 +211,6 @@ steps: - label: "Integration tests" key: "integration-tests" - env: - TEST_INTEG_AUTH_ESS_REGION: us-east-1 command: ".buildkite/scripts/steps/integration_tests.sh stateful" artifact_paths: - "build/TEST-**" diff --git a/magefile.go b/magefile.go index 892b5d65560..c6a73d147c3 100644 --- a/magefile.go +++ b/magefile.go @@ -1750,15 +1750,16 @@ func createTestRunner(matrix bool, singleTest string, goTestFlags string, batche } datacenter := os.Getenv("TEST_INTEG_AUTH_GCP_DATACENTER") if datacenter == "" { + // us-central1-a is used because T2A instances required for ARM64 testing are only + // available in the central regions datacenter = "us-central1-a" } - // Valid values are gcp-us-central1 (default), azure-eastus2, - // aws-eu-central-1, us-east-1 (which is an AWS region but the - // "aws" CSP prefix is not used by ESS for some reason!) + // Possible to change the region for deployment, default is gcp-us-west2 which is + // the CFT region. 
essRegion := os.Getenv("TEST_INTEG_AUTH_ESS_REGION") if essRegion == "" { - essRegion = "gcp-us-central1" + essRegion = "gcp-us-west2" } instanceProvisionerMode := os.Getenv("INSTANCE_PROVISIONER") diff --git a/pkg/testing/ess/config.go b/pkg/testing/ess/config.go index c90be94caa5..62ece1be1ef 100644 --- a/pkg/testing/ess/config.go +++ b/pkg/testing/ess/config.go @@ -17,8 +17,13 @@ type Config struct { } func defaultConfig() *Config { + baseURL := os.Getenv("TEST_INTEG_AUTH_ESS_URL") + if baseURL == "" { + baseURL = "https://cloud.elastic.co" + } + url := strings.TrimRight(baseURL, "/") + "/api/v1" return &Config{ - BaseUrl: `https://staging.found.no/api/v1`, + BaseUrl: url, } } diff --git a/pkg/testing/ess/provisioner.go b/pkg/testing/ess/provisioner.go index a051cac39d1..47e8d9dcba2 100644 --- a/pkg/testing/ess/provisioner.go +++ b/pkg/testing/ess/provisioner.go @@ -11,8 +11,6 @@ import ( "strings" "time" - "golang.org/x/sync/errgroup" - "github.com/elastic/elastic-agent/pkg/testing/runner" ) @@ -62,89 +60,77 @@ func (p *provisioner) SetLogger(l runner.Logger) { p.logger = l } -func (p *provisioner) Provision(ctx context.Context, requests []runner.StackRequest) ([]runner.Stack, error) { - results := make(map[runner.StackRequest]*CreateDeploymentResponse) - for _, r := range requests { - // allow up to 2 minutes for each create request - createCtx, createCancel := context.WithTimeout(ctx, 2*time.Minute) - resp, err := p.createDeployment(createCtx, r, - map[string]string{ - "division": "engineering", - "org": "ingest", - "team": "elastic-agent", - "project": "elastic-agent", - "integration-tests": "true", - }) - createCancel() - if err != nil { - return nil, err - } - results[r] = resp - } +// Create creates a stack. 
+func (p *provisioner) Create(ctx context.Context, request runner.StackRequest) (runner.Stack, error) { + // allow up to 2 minutes for request + createCtx, createCancel := context.WithTimeout(ctx, 2*time.Minute) + defer createCancel() + resp, err := p.createDeployment(createCtx, request, + map[string]string{ + "division": "engineering", + "org": "ingest", + "team": "elastic-agent", + "project": "elastic-agent", + "integration-tests": "true", + }) + if err != nil { + return runner.Stack{}, err + } + return runner.Stack{ + ID: request.ID, + Version: request.Version, + Elasticsearch: resp.ElasticsearchEndpoint, + Kibana: resp.KibanaEndpoint, + Username: resp.Username, + Password: resp.Password, + Internal: map[string]interface{}{ + "deployment_id": resp.ID, + }, + Ready: false, + }, nil +} - // set a long timeout - // this context travels up to the magefile, clients that want a shorter timeout can set - // it via mage's -t flag - readyCtx, readyCancel := context.WithTimeout(ctx, 25*time.Minute) - defer readyCancel() - - g, gCtx := errgroup.WithContext(readyCtx) - for req, resp := range results { - g.Go(func(req runner.StackRequest, resp *CreateDeploymentResponse) func() error { - return func() error { - ready, err := p.client.DeploymentIsReady(gCtx, resp.ID, 30*time.Second) - if err != nil { - return fmt.Errorf("failed to check for cloud %s to be ready: %w", req.Version, err) - } - if !ready { - return fmt.Errorf("cloud %s never became ready: %w", req.Version, err) - } - return nil - } - }(req, resp)) +// WaitForReady should block until the stack is ready or the context is cancelled. 
+func (p *provisioner) WaitForReady(ctx context.Context, stack runner.Stack) (runner.Stack, error) { + deploymentID, err := p.getDeploymentID(stack) + if err != nil { + return stack, fmt.Errorf("failed to get deployment ID from the stack: %w", err) } - err := g.Wait() + // allow up to 10 minutes for it to become ready + ctx, cancel := context.WithTimeout(ctx, 10*time.Minute) + defer cancel() + p.logger.Logf("Waiting for cloud stack %s to be ready [stack_id: %s, deployment_id: %s]", stack.Version, stack.ID, deploymentID) + ready, err := p.client.DeploymentIsReady(ctx, deploymentID, 30*time.Second) if err != nil { - return nil, err + return stack, fmt.Errorf("failed to check for cloud %s [stack_id: %s, deployment_id: %s] to be ready: %w", stack.Version, stack.ID, deploymentID, err) } - - var stacks []runner.Stack - for req, resp := range results { - stacks = append(stacks, runner.Stack{ - ID: req.ID, - Version: req.Version, - Elasticsearch: resp.ElasticsearchEndpoint, - Kibana: resp.KibanaEndpoint, - Username: resp.Username, - Password: resp.Password, - Internal: map[string]interface{}{ - "deployment_id": resp.ID, - }, - }) + if !ready { + return stack, fmt.Errorf("cloud %s [stack_id: %s, deployment_id: %s] never became ready: %w", stack.Version, stack.ID, deploymentID, err) } - return stacks, nil + stack.Ready = true + return stack, nil } -// Clean cleans up all provisioned resources. -func (p *provisioner) Clean(ctx context.Context, stacks []runner.Stack) error { - var errs []error - for _, s := range stacks { - err := p.destroyDeployment(ctx, s) - if err != nil { - errs = append(errs, fmt.Errorf("failed to destroy stack %s (%s): %w", s.Version, s.ID, err)) - } - } - if len(errs) > 0 { - return errors.Join(errs...) +// Delete deletes a stack. 
+func (p *provisioner) Delete(ctx context.Context, stack runner.Stack) error { + deploymentID, err := p.getDeploymentID(stack) + if err != nil { + return err } - return nil + + // allow up to 1 minute for request + ctx, cancel := context.WithTimeout(ctx, 1*time.Minute) + defer cancel() + + p.logger.Logf("Destroying cloud stack %s [stack_id: %s, deployment_id: %s]", stack.Version, stack.ID, deploymentID) + return p.client.ShutdownDeployment(ctx, deploymentID) } func (p *provisioner) createDeployment(ctx context.Context, r runner.StackRequest, tags map[string]string) (*CreateDeploymentResponse, error) { ctx, cancel := context.WithTimeout(ctx, 1*time.Minute) defer cancel() - p.logger.Logf("Creating stack %s (%s)", r.Version, r.ID) + p.logger.Logf("Creating cloud stack %s [stack_id: %s]", r.Version, r.ID) name := fmt.Sprintf("%s-%s", strings.Replace(p.cfg.Identifier, ".", "-", -1), r.ID) // prepare tags @@ -168,26 +154,21 @@ func (p *provisioner) createDeployment(ctx context.Context, r runner.StackReques p.logger.Logf("Failed to create ESS cloud %s: %s", r.Version, err) return nil, fmt.Errorf("failed to create ESS cloud for version %s: %w", r.Version, err) } - p.logger.Logf("Created stack %s (%s) [id: %s]", r.Version, r.ID, resp.ID) + p.logger.Logf("Created cloud stack %s [stack_id: %s, deployment_id: %s]", r.Version, r.ID, resp.ID) return resp, nil } -func (p *provisioner) destroyDeployment(ctx context.Context, s runner.Stack) error { - if s.Internal == nil { - return fmt.Errorf("missing internal information") +func (p *provisioner) getDeploymentID(stack runner.Stack) (string, error) { + if stack.Internal == nil { + return "", fmt.Errorf("missing internal information") } - deploymentIDRaw, ok := s.Internal["deployment_id"] + deploymentIDRaw, ok := stack.Internal["deployment_id"] if !ok { - return fmt.Errorf("missing internal deployment_id") + return "", fmt.Errorf("missing internal deployment_id") } deploymentID, ok := deploymentIDRaw.(string) if !ok { - return 
fmt.Errorf("internal deployment_id not a string") + return "", fmt.Errorf("internal deployment_id not a string") } - - ctx, cancel := context.WithTimeout(ctx, 1*time.Minute) - defer cancel() - - p.logger.Logf("Destroying stack %s (%s)", s.Version, s.ID) - return p.client.ShutdownDeployment(ctx, deploymentID) + return deploymentID, nil } diff --git a/pkg/testing/ess/serverless.go b/pkg/testing/ess/serverless.go index 817ee33f03b..df1129e8e97 100644 --- a/pkg/testing/ess/serverless.go +++ b/pkg/testing/ess/serverless.go @@ -17,7 +17,7 @@ import ( "github.com/elastic/elastic-agent/pkg/testing/runner" ) -var serverlessURL = "https://staging.found.no" +var serverlessURL = "https://cloud.elastic.co" // ServerlessClient is the handler the serverless ES instance type ServerlessClient struct { diff --git a/pkg/testing/ess/serverless_provision.go b/pkg/testing/ess/serverless_provision.go index 32ec8f8227b..c9656f628af 100644 --- a/pkg/testing/ess/serverless_provision.go +++ b/pkg/testing/ess/serverless_provision.go @@ -10,7 +10,7 @@ import ( "fmt" "io" "net/http" - "sync" + "time" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/elastic-agent/pkg/testing/runner" @@ -18,17 +18,15 @@ import ( // ServerlessProvision contains type ServerlessProvision struct { - stacksMut sync.RWMutex - stacks map[string]stackhandlerData - cfg ProvisionerConfig - log runner.Logger + cfg ProvisionerConfig + log runner.Logger } type defaultLogger struct { wrapped *logp.Logger } -// / implements the runner.Logger interface +// Logf implements the runner.Logger interface func (log *defaultLogger) Logf(format string, args ...any) { if len(args) == 0 { @@ -38,12 +36,6 @@ func (log *defaultLogger) Logf(format string, args ...any) { } -// tracks the data that maps to a single serverless deployment -type stackhandlerData struct { - client *ServerlessClient - stackData runner.Stack -} - // ServerlessRegions is the JSON response from the serverless regions API endpoint type ServerlessRegions 
struct { CSP string `json:"csp"` @@ -55,9 +47,8 @@ type ServerlessRegions struct { // NewServerlessProvisioner creates a new StackProvisioner instance for serverless func NewServerlessProvisioner(cfg ProvisionerConfig) (runner.StackProvisioner, error) { prov := &ServerlessProvision{ - cfg: cfg, - stacks: map[string]stackhandlerData{}, - log: &defaultLogger{wrapped: logp.L()}, + cfg: cfg, + log: &defaultLogger{wrapped: logp.L()}, } err := prov.CheckCloudRegion() if err != nil { @@ -71,114 +62,118 @@ func (prov *ServerlessProvision) SetLogger(l runner.Logger) { prov.log = l } -// Provision a new set of serverless instances -func (prov *ServerlessProvision) Provision(ctx context.Context, requests []runner.StackRequest) ([]runner.Stack, error) { - upWaiter := sync.WaitGroup{} - depErrs := make(chan error, len(requests)) - depUp := make(chan bool, len(requests)) - stacks := []runner.Stack{} - for _, req := range requests { - client := NewServerlessClient(prov.cfg.Region, "observability", prov.cfg.APIKey, prov.log) - srvReq := ServerlessRequest{Name: req.ID, RegionID: prov.cfg.Region} - proj, err := client.DeployStack(ctx, srvReq) - if err != nil { - return nil, fmt.Errorf("error deploying stack for request %s: %w", req.ID, err) - } - err = client.WaitForEndpoints(ctx) - if err != nil { - return nil, fmt.Errorf("error waiting for endpoints to become available for request: %w", err) - } - newStack := runner.Stack{ - ID: req.ID, - Version: req.Version, - Elasticsearch: client.proj.Endpoints.Elasticsearch, - Kibana: client.proj.Endpoints.Kibana, - Username: client.proj.Credentials.Username, - Password: client.proj.Credentials.Password, - Internal: map[string]interface{}{ - "deployment_id": proj.ID, - "deployment_type": proj.Type, - }, - } - stacks = append(stacks, newStack) - prov.stacksMut.Lock() - prov.stacks[req.ID] = stackhandlerData{client: client, stackData: newStack} - prov.stacksMut.Unlock() +// Create creates a stack. 
+func (prov *ServerlessProvision) Create(ctx context.Context, request runner.StackRequest) (runner.Stack, error) { + // allow up to 4 minutes for requests + createCtx, createCancel := context.WithTimeout(ctx, 4*time.Minute) + defer createCancel() - upWaiter.Add(1) - go func() { - isUp, err := client.DeploymentIsReady(ctx) - if err != nil { - depErrs <- err + client := NewServerlessClient(prov.cfg.Region, "observability", prov.cfg.APIKey, prov.log) + srvReq := ServerlessRequest{Name: request.ID, RegionID: prov.cfg.Region} - } - depUp <- isUp - }() + prov.log.Logf("Creating serverless stack %s [stack_id: %s]", request.Version, request.ID) + proj, err := client.DeployStack(createCtx, srvReq) + if err != nil { + return runner.Stack{}, fmt.Errorf("error deploying stack for request %s: %w", request.ID, err) + } + err = client.WaitForEndpoints(createCtx) + if err != nil { + return runner.Stack{}, fmt.Errorf("error waiting for endpoints to become available for serverless stack %s [stack_id: %s, deployment_id: %s]: %w", request.Version, request.ID, proj.ID, err) + } + stack := runner.Stack{ + ID: request.ID, + Version: request.Version, + Elasticsearch: client.proj.Endpoints.Elasticsearch, + Kibana: client.proj.Endpoints.Kibana, + Username: client.proj.Credentials.Username, + Password: client.proj.Credentials.Password, + Internal: map[string]interface{}{ + "deployment_id": proj.ID, + "deployment_type": proj.Type, + }, + Ready: false, } + prov.log.Logf("Created serverless stack %s [stack_id: %s, deployment_id: %s]", request.Version, request.ID, proj.ID) + return stack, nil +} + +// WaitForReady should block until the stack is ready or the context is cancelled. 
+func (prov *ServerlessProvision) WaitForReady(ctx context.Context, stack runner.Stack) (runner.Stack, error) { + deploymentID, deploymentType, err := prov.getDeploymentInfo(stack) + if err != nil { + return stack, fmt.Errorf("failed to get deployment info from the stack: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, 10*time.Minute) + defer cancel() + + client := NewServerlessClient(prov.cfg.Region, "observability", prov.cfg.APIKey, prov.log) + client.proj.ID = deploymentID + client.proj.Type = deploymentType + client.proj.Region = prov.cfg.Region + client.proj.Endpoints.Elasticsearch = stack.Elasticsearch + client.proj.Endpoints.Kibana = stack.Kibana + client.proj.Credentials.Username = stack.Username + client.proj.Credentials.Password = stack.Password + + prov.log.Logf("Waiting for serverless stack %s to be ready [stack_id: %s, deployment_id: %s]", stack.Version, stack.ID, deploymentID) + + errCh := make(chan error) + var lastErr error + + ticker := time.NewTicker(30 * time.Second) + defer ticker.Stop() - gotUp := 0 for { select { case <-ctx.Done(): - return nil, ctx.Err() - case err := <-depErrs: - return nil, fmt.Errorf("error waiting for stacks to become available: %w", err) - case isUp := <-depUp: - if isUp { - gotUp++ + if lastErr == nil { + lastErr = ctx.Err() } - if gotUp >= len(requests) { - return stacks, nil + return stack, fmt.Errorf("serverless stack %s [stack_id: %s, deployment_id: %s] never became ready: %w", stack.Version, stack.ID, deploymentID, lastErr) + case <-ticker.C: + go func() { + statusCtx, statusCancel := context.WithTimeout(ctx, 30*time.Second) + defer statusCancel() + ready, err := client.DeploymentIsReady(statusCtx) + if err != nil { + errCh <- err + } else if !ready { + errCh <- fmt.Errorf("serverless stack %s [stack_id: %s, deployment_id: %s] never became ready", stack.Version, stack.ID, deploymentID) + } else { + errCh <- nil + } + }() + case err := <-errCh: + if err == nil { + stack.Ready = true + return stack, nil } + 
lastErr = err } } - } -// Clean shuts down and removes the deployments -func (prov *ServerlessProvision) Clean(ctx context.Context, stacks []runner.Stack) error { - for _, stack := range stacks { - prov.stacksMut.RLock() - // because of the way the provisioner initializes, - // we can't guarantee that we have a valid client/stack setup, as we might have just re-initialized from a file. - // If that's the case, create a new client - stackRef, ok := prov.stacks[stack.ID] - prov.stacksMut.RUnlock() - // we can't reference the client, it won't be created when we just run mage:clean - // instead, grab the project ID from `stacks`, create a new client - if ok { - err := stackRef.client.DeleteDeployment() - if err != nil { - prov.log.Logf("error removing deployment: %w", err) - } - } else { - // create a new client - client := NewServerlessClient(prov.cfg.Region, "observability", prov.cfg.APIKey, prov.log) - dep_id, ok := stack.Internal["deployment_id"] - if !ok { - return fmt.Errorf("could not find deployment_id for serverless") - } - dep_id_str, ok := dep_id.(string) - if !ok { - return fmt.Errorf("deployment_id is not a string: %v", dep_id) - } - client.proj.ID = dep_id_str - - dep_type, ok := stack.Internal["deployment_type"] - if !ok { - return fmt.Errorf("could not find deployment_type in stack for serverless") - } - dep_type_str, ok := dep_type.(string) - if !ok { - return fmt.Errorf("deployment_type is not a string: %v", dep_id_str) - } - client.proj.Type = dep_type_str - err := client.DeleteDeployment() - if err != nil { - return fmt.Errorf("error removing deployment after re-creating client: %w", err) - } +// Delete deletes a stack. 
+func (prov *ServerlessProvision) Delete(ctx context.Context, stack runner.Stack) error { + deploymentID, deploymentType, err := prov.getDeploymentInfo(stack) + if err != nil { + return fmt.Errorf("failed to get deployment info from the stack: %w", err) + } - } + client := NewServerlessClient(prov.cfg.Region, "observability", prov.cfg.APIKey, prov.log) + client.proj.ID = deploymentID + client.proj.Type = deploymentType + client.proj.Region = prov.cfg.Region + client.proj.Endpoints.Elasticsearch = stack.Elasticsearch + client.proj.Endpoints.Kibana = stack.Kibana + client.proj.Credentials.Username = stack.Username + client.proj.Credentials.Password = stack.Password + + prov.log.Logf("Destroying serverless stack %s [stack_id: %s, deployment_id: %s]", stack.Version, stack.ID, deploymentID) + err = client.DeleteDeployment() + if err != nil { + return fmt.Errorf("error removing serverless stack %s [stack_id: %s, deployment_id: %s]: %w", stack.Version, stack.ID, deploymentID, err) } return nil } @@ -235,3 +230,26 @@ func (prov *ServerlessProvision) CheckCloudRegion() error { return nil } + +func (prov *ServerlessProvision) getDeploymentInfo(stack runner.Stack) (string, string, error) { + if stack.Internal == nil { + return "", "", fmt.Errorf("missing internal information") + } + deploymentIDRaw, ok := stack.Internal["deployment_id"] + if !ok { + return "", "", fmt.Errorf("missing internal deployment_id") + } + deploymentID, ok := deploymentIDRaw.(string) + if !ok { + return "", "", fmt.Errorf("internal deployment_id not a string") + } + deploymentTypeRaw, ok := stack.Internal["deployment_type"] + if !ok { + return "", "", fmt.Errorf("missing internal deployment_type") + } + deploymentType, ok := deploymentTypeRaw.(string) + if !ok { + return "", "", fmt.Errorf("internal deployment_type is not a string") + } + return deploymentID, deploymentType, nil +} diff --git a/pkg/testing/ess/serverless_test.go b/pkg/testing/ess/serverless_test.go index 7f69e49819f..2fc8e0075b1 
100644 --- a/pkg/testing/ess/serverless_test.go +++ b/pkg/testing/ess/serverless_test.go @@ -26,9 +26,8 @@ func TestProvisionGetRegions(t *testing.T) { cfg := ProvisionerConfig{Region: "bad-region-ID", APIKey: key} prov := &ServerlessProvision{ - cfg: cfg, - stacks: map[string]stackhandlerData{}, - log: &defaultLogger{wrapped: logp.L()}, + cfg: cfg, + log: &defaultLogger{wrapped: logp.L()}, } err = prov.CheckCloudRegion() require.NoError(t, err) @@ -48,27 +47,23 @@ func TestStackProvisioner(t *testing.T) { cfg := ProvisionerConfig{Region: "aws-eu-west-1", APIKey: key} provClient, err := NewServerlessProvisioner(cfg) require.NoError(t, err) - stacks := []runner.StackRequest{ - {ID: "stack-test-one", Version: "8.9.0"}, - {ID: "stack-test-two", Version: "8.9.0"}, - } + request := runner.StackRequest{ID: "stack-test-one", Version: "8.9.0"} ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5) defer cancel() - res, err := provClient.Provision(ctx, stacks) + stack, err := provClient.Create(ctx, request) require.NoError(t, err) t.Logf("got results:") - for _, stack := range res { - t.Logf("stack: %#v", stack) - require.NotEmpty(t, stack.Elasticsearch) - require.NotEmpty(t, stack.Kibana) - require.NotEmpty(t, stack.Password) - require.NotEmpty(t, stack.Username) - } + t.Logf("stack: %#v", stack) + require.NotEmpty(t, stack.Elasticsearch) + require.NotEmpty(t, stack.Kibana) + require.NotEmpty(t, stack.Password) + require.NotEmpty(t, stack.Username) + stack, err = provClient.WaitForReady(ctx, stack) + require.NoError(t, err) t.Logf("tearing down...") - err = provClient.Clean(ctx, res) + err = provClient.Delete(ctx, stack) require.NoError(t, err) - } func TestStartServerless(t *testing.T) { diff --git a/pkg/testing/runner/provisioner.go b/pkg/testing/runner/provisioner.go index fb474e2f307..2708b0d204d 100644 --- a/pkg/testing/runner/provisioner.go +++ b/pkg/testing/runner/provisioner.go @@ -57,6 +57,9 @@ type Stack struct { // Version is the version of the 
stack. Version string `yaml:"version"` + // Ready determines if the stack is ready to be used. + Ready bool `yaml:"ready"` + // Elasticsearch is the URL to communicate with elasticsearch. Elasticsearch string `yaml:"elasticsearch"` @@ -89,11 +92,12 @@ type StackProvisioner interface { // SetLogger sets the logger for it to use. SetLogger(l Logger) - // Provision brings up the stacks - // - // The provision should re-use already prepared stacks when possible. - Provision(ctx context.Context, requests []StackRequest) ([]Stack, error) + // Create creates a stack. + Create(ctx context.Context, request StackRequest) (Stack, error) - // Clean cleans up all provisioned resources. - Clean(ctx context.Context, stacks []Stack) error + // WaitForReady should block until the stack is ready or the context is cancelled. + WaitForReady(ctx context.Context, stack Stack) (Stack, error) + + // Delete deletes the stack. + Delete(ctx context.Context, stack Stack) error } diff --git a/pkg/testing/runner/runner.go b/pkg/testing/runner/runner.go index 0541fa785ec..a2c77f77aa0 100644 --- a/pkg/testing/runner/runner.go +++ b/pkg/testing/runner/runner.go @@ -128,10 +128,11 @@ type Runner struct { ip InstanceProvisioner sp StackProvisioner - batches []OSBatch - batchToStack map[string]Stack - stacksReady sync.WaitGroup - stacksErr error + batches []OSBatch + + batchToStack map[string]stackRes + batchToStackCh map[string]chan stackRes + batchToStackMx sync.Mutex stateMx sync.Mutex state State @@ -172,12 +173,13 @@ func NewRunner(cfg Config, ip InstanceProvisioner, sp StackProvisioner, batches osBatches = filterSupportedOS(osBatches, ip) r := &Runner{ - cfg: cfg, - logger: logger, - ip: ip, - sp: sp, - batches: osBatches, - batchToStack: make(map[string]Stack), + cfg: cfg, + logger: logger, + ip: ip, + sp: sp, + batches: osBatches, + batchToStack: make(map[string]stackRes), + batchToStackCh: make(map[string]chan stackRes), } err = r.loadState() @@ -274,11 +276,15 @@ func (r *Runner) Clean() 
error { defer cancel() return r.ip.Clean(ctx, r.cfg, instances) }) - g.Go(func() error { - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) - defer cancel() - return r.sp.Clean(ctx, stacks) - }) + for _, stack := range stacks { + g.Go(func(stack Stack) func() error { + return func() error { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() + return r.sp.Delete(ctx, stack) + } + }(stack)) + } return g.Wait() } @@ -366,14 +372,10 @@ func (r *Runner) runInstance(ctx context.Context, sshAuth ssh.AuthMethod, logger // ensure that we have all the requirements for the stack if required if batch.Batch.Stack != nil { // wait for the stack to be ready before continuing - logger.Logf("Waiting for stacks to be ready...") - r.stacksReady.Wait() - if r.stacksErr != nil { - return OSRunnerResult{}, fmt.Errorf("%s unable to continue because stack never became ready: %w", instance.Name, r.stacksErr) - } - stack, ok := r.getStackForBatchID(batch.ID) - if !ok { - return OSRunnerResult{}, fmt.Errorf("failed to find stack for batch %s", batch.ID) + logger.Logf("Waiting for stack to be ready...") + stack, err := r.getStackForBatchID(batch.ID) + if err != nil { + return OSRunnerResult{}, err } env["ELASTICSEARCH_HOST"] = stack.Elasticsearch env["ELASTICSEARCH_USERNAME"] = stack.Username @@ -574,9 +576,6 @@ func (r *Runner) createRepoArchive(ctx context.Context, repoDir string, dir stri // startStacks starts the stacks required for the tests to run func (r *Runner) startStacks(ctx context.Context) error { - // stacks never start ready - r.stacksReady.Add(1) - var versions []string batchToVersion := make(map[string]string) for _, lb := range r.batches { @@ -592,59 +591,115 @@ func (r *Runner) startStacks(ctx context.Context) error { } } - var requests []StackRequest + var requests []stackReq for _, version := range versions { id := strings.Replace(version, ".", "", -1) stack, ok := r.findStack(id) if ok { - 
r.logger.Logf("Reusing stack %s (%s)", version, id) - for batchID, batchVersion := range batchToVersion { - if batchVersion == version { - r.batchToStack[batchID] = stack - } - } + requests = append(requests, stackReq{ + request: StackRequest{ + ID: id, + Version: version, + }, + stack: &stack, + }) } else { - requests = append(requests, StackRequest{ - ID: id, - Version: version, + requests = append(requests, stackReq{ + request: StackRequest{ + ID: id, + Version: version, + }, }) } } - if len(requests) == 0 { - // no need to request any other stacks - r.stacksReady.Done() - return nil + + reportResult := func(version string, stack Stack, err error) { + r.batchToStackMx.Lock() + defer r.batchToStackMx.Unlock() + res := stackRes{ + stack: stack, + err: err, + } + for batchID, batchVersion := range batchToVersion { + if batchVersion == version { + r.batchToStack[batchID] = res + ch, ok := r.batchToStackCh[batchID] + if ok { + ch <- res + } + } + } } - // start go routine to provision the needed stacks - go func(ctx context.Context) { - defer r.stacksReady.Done() + // start goroutines to provision the needed stacks + for _, request := range requests { + go func(ctx context.Context, req stackReq) { + var err error + var stack Stack + if req.stack != nil { + stack = *req.stack + } else { + stack, err = r.sp.Create(ctx, req.request) + if err != nil { + reportResult(req.request.Version, stack, err) + return + } + err = r.addOrUpdateStack(stack) + if err != nil { + reportResult(stack.Version, stack, err) + return + } + } + + if stack.Ready { + reportResult(stack.Version, stack, nil) + return + } - stacks, err := r.sp.Provision(ctx, requests) - if err != nil { - r.stacksErr = err - return - } - for _, stack := range stacks { - err := r.addOrUpdateStack(stack) + stack, err = r.sp.WaitForReady(ctx, stack) if err != nil { - r.stacksErr = err + reportResult(stack.Version, stack, err) return } - for batchID, batchVersion := range batchToVersion { - if batchVersion == 
stack.Version { - r.batchToStack[batchID] = stack - } + + err = r.addOrUpdateStack(stack) + if err != nil { + reportResult(stack.Version, stack, err) + return } - } - }(ctx) + + reportResult(stack.Version, stack, nil) + }(ctx, request) + } return nil } -func (r *Runner) getStackForBatchID(id string) (Stack, bool) { - stack, ok := r.batchToStack[id] - return stack, ok +func (r *Runner) getStackForBatchID(id string) (Stack, error) { + r.batchToStackMx.Lock() + res, ok := r.batchToStack[id] + if ok { + r.batchToStackMx.Unlock() + return res.stack, res.err + } + _, ok = r.batchToStackCh[id] + if ok { + return Stack{}, fmt.Errorf("getStackForBatchID called twice; this is not allowed") + } + ch := make(chan stackRes, 1) + r.batchToStackCh[id] = ch + r.batchToStackMx.Unlock() + + // 12 minutes is because the stack should have been ready after 10 minutes or returned an error + // this only exists to ensure that if that code is not blocking that this doesn't block forever + t := time.NewTimer(12 * time.Minute) + defer t.Stop() + select { + case <-t.C: + return Stack{}, fmt.Errorf("failed waiting for a response after 12 minutes") + case res = <-ch: + return res.stack, res.err + } } func (r *Runner) findInstance(id string) (StateInstance, bool) { @@ -986,3 +1041,13 @@ type batchLogger struct { func (b *batchLogger) Logf(format string, args ...any) { b.wrapped.Logf("(%s) %s", b.prefix, fmt.Sprintf(format, args...)) } + +type stackRes struct { + stack Stack + err error +} + +type stackReq struct { + request StackRequest + stack *Stack +} diff --git a/pkg/testing/runner/runner_test.go b/pkg/testing/runner/runner_test.go index d10b9d524d0..c46b3b53761 100644 --- a/pkg/testing/runner/runner_test.go +++ b/pkg/testing/runner/runner_test.go @@ -8,6 +8,7 @@ import ( "context" "os" "path/filepath" + "sync" "testing" "github.com/stretchr/testify/assert" @@ -86,7 +87,7 @@ func TestNewRunner_Clean(t *testing.T) { require.NoError(t, err) assert.ElementsMatch(t, ip.instances, []Instance{i1, 
i2}) - assert.ElementsMatch(t, sp.stacks, []Stack{s1, s2}) + assert.ElementsMatch(t, sp.deletedStacks, []Stack{s1, s2}) } type fakeInstanceProvisioner struct { @@ -123,31 +124,38 @@ func (f *fakeInstanceProvisioner) Clean(_ context.Context, _ Config, instances [ } type fakeStackProvisioner struct { - requests []StackRequest - stacks []Stack + mx sync.Mutex + requests []StackRequest + deletedStacks []Stack } func (f *fakeStackProvisioner) SetLogger(_ Logger) { } -func (f *fakeStackProvisioner) Provision(_ context.Context, requests []StackRequest) ([]Stack, error) { - f.requests = requests - var stacks []Stack - for _, req := range requests { - stacks = append(stacks, Stack{ - ID: req.ID, - Version: req.Version, - Elasticsearch: "http://localhost:9200", - Kibana: "http://localhost:5601", - Username: "elastic", - Password: "changeme", - Internal: nil, - }) - } - return stacks, nil +func (f *fakeStackProvisioner) Create(_ context.Context, request StackRequest) (Stack, error) { + f.mx.Lock() + defer f.mx.Unlock() + f.requests = append(f.requests, request) + return Stack{ + ID: request.ID, + Version: request.Version, + Elasticsearch: "http://localhost:9200", + Kibana: "http://localhost:5601", + Username: "elastic", + Password: "changeme", + Internal: nil, + Ready: false, + }, nil +} + +func (f *fakeStackProvisioner) WaitForReady(_ context.Context, stack Stack) (Stack, error) { + stack.Ready = true + return stack, nil } -func (f *fakeStackProvisioner) Clean(_ context.Context, stacks []Stack) error { - f.stacks = stacks +func (f *fakeStackProvisioner) Delete(_ context.Context, stack Stack) error { + f.mx.Lock() + defer f.mx.Unlock() + f.deletedStacks = append(f.deletedStacks, stack) return nil } From d1fffb3cf34bd9c619004fc92165afb351fed8d7 Mon Sep 17 00:00:00 2001 From: Anderson Queiroz Date: Tue, 14 Nov 2023 07:16:23 -0300 Subject: [PATCH 12/12] multipass: ensure instance does not exist (#3714) Before the mutipass provisioner tries to launch an instance, if it already 
exists, it tries to delete and purge the instance. --- pkg/testing/multipass/provisioner.go | 88 +++++++++++++++++++++- testing/integration/logs_ingestion_test.go | 18 +++-- 2 files changed, 96 insertions(+), 10 deletions(-) diff --git a/pkg/testing/multipass/provisioner.go b/pkg/testing/multipass/provisioner.go index 2be04b806fc..7703dc7e335 100644 --- a/pkg/testing/multipass/provisioner.go +++ b/pkg/testing/multipass/provisioner.go @@ -7,6 +7,7 @@ package multipass import ( "bytes" "context" + "encoding/json" "fmt" "os" "os/exec" @@ -122,6 +123,12 @@ func (p *provisioner) Clean(ctx context.Context, _ runner.Config, instances []ru // launch creates an instance. func (p *provisioner) launch(ctx context.Context, cfg runner.Config, batch runner.OSBatch) error { + // check if instance already exists + err := p.ensureInstanceNotExist(ctx, batch) + if err != nil { + p.logger.Logf( + "could not check multipass instance %q does not exists, moving on anyway. Err: %v", err) + } args := []string{ "launch", "-c", "2", @@ -145,9 +152,14 @@ func (p *provisioner) launch(ctx context.Context, cfg runner.Config, batch runne return fmt.Errorf("failed to marshal cloud-init configuration: %w", err) } + p.logger.Logf("Launching multipass instance %s", batch.ID) var output bytes.Buffer - p.logger.Logf("Launching multipass image %s", batch.ID) - proc, err := process.Start("multipass", process.WithContext(ctx), process.WithArgs(args), process.WithCmdOptions(runner.AttachOut(&output), runner.AttachErr(&output))) + proc, err := process.Start("multipass", + process.WithContext(ctx), + process.WithArgs(args), + process.WithCmdOptions( + runner.AttachOut(&output), + runner.AttachErr(&output))) if err != nil { return fmt.Errorf("failed to run multipass launch: %w", err) } @@ -162,7 +174,7 @@ func (p *provisioner) launch(ctx context.Context, cfg runner.Config, batch runne } _ = proc.Stdin.Close() ps := <-proc.Wait() - if ps.ExitCode() != 0 { + if !ps.Success() { // print the output so its clear 
what went wrong fmt.Fprintf(os.Stdout, "%s\n", output.Bytes()) return fmt.Errorf("failed to run multipass launch: exited with code: %d", ps.ExitCode()) @@ -170,6 +182,76 @@ func (p *provisioner) launch(ctx context.Context, cfg runner.Config, batch runne return nil } +func (p *provisioner) ensureInstanceNotExist(ctx context.Context, batch runner.OSBatch) error { + var output bytes.Buffer + var stdErr bytes.Buffer + proc, err := process.Start("multipass", + process.WithContext(ctx), + process.WithArgs([]string{"list", "--format", "json"}), + process.WithCmdOptions( + runner.AttachOut(&output), + runner.AttachErr(&stdErr))) + if err != nil { + return fmt.Errorf("multipass list failed to run: %w", err) + } + + state := <-proc.Wait() + if !state.Success() { + msg := fmt.Sprintf("multipass list exited with non-zero status: %s", + state.String()) + p.logger.Logf(msg) + p.logger.Logf("output: %s", output.String()) + p.logger.Logf("stderr: %s", stdErr.String()) + return fmt.Errorf(msg) + } + list := struct { + List []struct { + Ipv4 []string `json:"ipv4"` + Name string `json:"name"` + Release string `json:"release"` + State string `json:"state"` + } `json:"list"` + }{} + err = json.NewDecoder(&output).Decode(&list) + if err != nil { + return fmt.Errorf("could not decode mutipass list output: %w", err) + } + + for _, i := range list.List { + if i.Name == batch.ID { + p.logger.Logf("multipass trying to delete instance %s", batch.ID) + + output.Reset() + stdErr.Reset() + proc, err = process.Start("multipass", + process.WithContext(ctx), + process.WithArgs([]string{"delete", "--purge", batch.ID}), + process.WithCmdOptions( + runner.AttachOut(&output), + runner.AttachErr(&stdErr))) + if err != nil { + return fmt.Errorf( + "multipass instance %q already exist, state %q. 
Could not delete it: %w", + batch.ID, i.State, err) + } + state = <-proc.Wait() + if !state.Success() { + msg := fmt.Sprintf("failed to delete and purge multipass instance %s: %s", + batch.ID, + state.String()) + p.logger.Logf(msg) + p.logger.Logf("output: %s", output.String()) + p.logger.Logf("stderr: %s", stdErr.String()) + return fmt.Errorf(msg) + } + + break + } + } + + return nil +} + // delete deletes an instance. func (p *provisioner) delete(ctx context.Context, instance runner.Instance) error { args := []string{ diff --git a/testing/integration/logs_ingestion_test.go b/testing/integration/logs_ingestion_test.go index ba9a84673b0..d9fb2f511a8 100644 --- a/testing/integration/logs_ingestion_test.go +++ b/testing/integration/logs_ingestion_test.go @@ -104,11 +104,15 @@ func testMonitoringLogsAreShipped( ) { // Stage 1: Make sure metricbeat logs are populated t.Log("Making sure metricbeat logs are populated") - docs := findESDocs(t, func() (estools.Documents, error) { - return estools.GetLogsForDataset(info.ESClient, "elastic_agent.metricbeat") - }) - t.Logf("metricbeat: Got %d documents", len(docs.Hits.Hits)) - require.NotZero(t, len(docs.Hits.Hits)) + require.Eventually(t, + func() bool { + docs := findESDocs(t, func() (estools.Documents, error) { + return estools.GetLogsForDataset(info.ESClient, "elastic_agent.metricbeat") + }) + return len(docs.Hits.Hits) > 0 + }, + 1*time.Minute, 500*time.Millisecond, + "there should be metricbeats logs by now") // Stage 2: make sure all components are healthy t.Log("Making sure all components are healthy") @@ -123,7 +127,7 @@ func testMonitoringLogsAreShipped( // Stage 3: Make sure there are no errors in logs t.Log("Making sure there are no error logs") - docs = findESDocs(t, func() (estools.Documents, error) { + docs := findESDocs(t, func() (estools.Documents, error) { return estools.CheckForErrorsInLogs(info.ESClient, info.Namespace, []string{ // acceptable error messages (include reason) "Error dialing dial tcp 
127.0.0.1:9200: connect: connection refused", // beat is running default config before its config gets updated @@ -134,7 +138,7 @@ func testMonitoringLogsAreShipped( "elastic-agent-client error: rpc error: code = Canceled desc = context canceled", // can happen on restart }) }) - t.Logf("errors: Got %d documents", len(docs.Hits.Hits)) + t.Logf("error logs: Got %d documents", len(docs.Hits.Hits)) for _, doc := range docs.Hits.Hits { t.Logf("%#v", doc.Source) }