grafana
diff --git a/‎docs/sources/shared/configuration.md‎
Lines changed: 6 additions & 0 deletions b/‎docs/sources/shared/configuration.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎pkg/engine/engine.go‎
Lines changed: 5 additions & 1 deletion b/‎pkg/engine/engine.go‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎pkg/engine/internal/worker/worker_test.go‎
Lines changed: 5 additions & 1 deletion b/‎pkg/engine/internal/worker/worker_test.go‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎pkg/engine/internal/workflow/admission_control.go‎
Lines changed: 91 additions & 0 deletions b/‎pkg/engine/internal/workflow/admission_control.go‎
Lines changed: 91 additions & 0 deletions
diff --git a/‎pkg/engine/internal/workflow/admission_control_test.go‎
Lines changed: 38 additions & 0 deletions b/‎pkg/engine/internal/workflow/admission_control_test.go‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎pkg/engine/internal/workflow/workflow.go‎
Lines changed: 67 additions & 10 deletions b/‎pkg/engine/internal/workflow/workflow.go‎
Lines changed: 67 additions & 10 deletions
@@ -4707,6 +4707,12 @@ otlp_config:
 # override is set, the encryption context will not be provided to S3. Ignored if
 # the SSE type override is not set.
 [s3_sse_kms_encryption_context: <string> | default = ""]
+
+# Experimental: Controls the number of scan tasks that can be running in
+# parallel in the new query engine. The default of 0 means unlimited parallelism
+# and all tasks will be scheduled at once.
+# CLI flag: -limits.max-scan-task-parallelism
+[max_scan_task_parallelism: <int> | default = 0]
 ```
 
 ### local_storage_config
 
@@ -323,7 +323,11 @@ func (e *Engine) buildWorkflow(ctx context.Context, logger log.Logger, physicalP
 	span := trace.SpanFromContext(ctx)
 	timer := prometheus.NewTimer(e.metrics.workflowPlanning)
 
-	wf, err := workflow.New(logger, tenantID, e.scheduler.inner, physicalPlan)
+	opts := workflow.Options{
+		MaxRunningScanTasks:  e.limits.MaxScanTaskParallelism(tenantID),
+		MaxRunningOtherTasks: 0,
+	}
+	wf, err := workflow.New(opts, logger, tenantID, e.scheduler.inner, physicalPlan)
 	if err != nil {
 		level.Warn(logger).Log("msg", "failed to create workflow", "err", err)
 		span.RecordError(err)
 
@@ -158,7 +158,11 @@ func buildWorkflow(ctx context.Context, t *testing.T, logger log.Logger, loc obj
 		fmt.Fprintln(os.Stderr, physical.PrintAsTree(plan))
 	}
 
-	wf, err := workflow.New(logger, objtest.Tenant, sched, plan)
+	opts := workflow.Options{
+		MaxRunningScanTasks:  32,
+		MaxRunningOtherTasks: 0, // unlimited
+	}
+	wf, err := workflow.New(opts, logger, objtest.Tenant, sched, plan)
 	require.NoError(t, err)
 
 	if testing.Verbose() {
 
@@ -0,0 +1,91 @@
+package workflow
+
+import (
+	"math"
+
+	"golang.org/x/sync/semaphore"
+
+	"github.com/grafana/loki/v3/pkg/engine/internal/planner/physical"
+)
+
+type taskType string
+
+const (
+	taskTypeScan  taskType = "scan"
+	taskTypeOther taskType = "other"
+)
+
+type admissionLane struct {
+	*semaphore.Weighted
+	capacity int64
+	lane     taskType
+}
+
+func newAdmissionLane(lane taskType, capacity int64) *admissionLane {
+	return &admissionLane{
+		Weighted: semaphore.NewWeighted(capacity),
+		capacity: capacity,
+		lane:     lane,
+	}
+}
+
+// admissionControl is a control structure to look up "admission lanes" for different types of tasks.
+// It is a lightweight wrapper around a mapping of task type to admission lane.
+type admissionControl struct {
+	mapping map[taskType]*admissionLane
+}
+
+func newAdmissionControl(maxScanTasks, maxOtherTasks int64) *admissionControl {
+	if maxScanTasks < 1 {
+		maxScanTasks = math.MaxInt64
+	}
+	if maxOtherTasks < 1 {
+		maxOtherTasks = math.MaxInt64
+	}
+
+	return &admissionControl{
+		mapping: map[taskType]*admissionLane{
+			taskTypeScan:  newAdmissionLane(taskTypeScan, maxScanTasks),
+			taskTypeOther: newAdmissionLane(taskTypeOther, maxOtherTasks),
+		},
+	}
+}
+
+// groupByType categorizes a slice of tasks into groups based on their task type (scan, other, ...).
+func (ac *admissionControl) groupByType(tasks []*Task) map[taskType][]*Task {
+	groups := map[taskType][]*Task{
+		taskTypeScan:  make([]*Task, 0, len(tasks)),
+		taskTypeOther: make([]*Task, 0, len(tasks)),
+	}
+
+	for _, t := range tasks {
+		ty := ac.typeFor(t)
+		groups[ty] = append(groups[ty], t)
+	}
+
+	return groups
+}
+
+func (ac *admissionControl) typeFor(task *Task) taskType {
+	if isScanTask(task) {
+		return taskTypeScan
+	}
+	return taskTypeOther
+}
+
+func (ac *admissionControl) laneFor(task *Task) *admissionLane {
+	return ac.mapping[ac.typeFor(task)]
+}
+
+func (ac *admissionControl) get(ty taskType) *admissionLane {
+	return ac.mapping[ty]
+}
+
+func isScanTask(task *Task) bool {
+	for node := range task.Fragment.Graph().Nodes() {
+		if node.Type() == physical.NodeTypeDataObjScan {
+			return true
+		}
+	}
+	return false
+}
@@ -0,0 +1,38 @@
+package workflow
+
+import (
+	"math"
+	"testing"
+
+	"github.com/oklog/ulid/v2"
+	"github.com/stretchr/testify/require"
+
+	"github.com/grafana/loki/v3/pkg/engine/internal/planner/physical"
+	"github.com/grafana/loki/v3/pkg/engine/internal/util/dag"
+)
+
+func TestAdmissionControl_getBucket(t *testing.T) {
+	ac := newAdmissionControl(32, math.MaxInt64)
+
+	t.Run("Task without a DataObjScan node is considered an 'other' task", func(t *testing.T) {
+		fragment := dag.Graph[physical.Node]{}
+		task := &Task{
+			ULID:     ulid.Make(),
+			Fragment: physical.FromGraph(fragment),
+		}
+		bucket := ac.typeFor(task)
+		require.Equal(t, taskTypeOther, bucket)
+	})
+
+	t.Run("Task with a DataObjScan node is considered an 'scan' task", func(t *testing.T) {
+		fragment := dag.Graph[physical.Node]{}
+		fragment.Add(&physical.DataObjScan{})
+
+		task := &Task{
+			ULID:     ulid.Make(),
+			Fragment: physical.FromGraph(fragment),
+		}
+		ty := ac.typeFor(task)
+		require.Equal(t, taskTypeScan, ty)
+	})
+}
@@ -4,6 +4,7 @@ package workflow
 
 import (
 	"context"
+	"fmt"
 	"sync"
 
 	"github.com/apache/arrow-go/v18/arrow"
@@ -17,9 +18,15 @@ import (
 	"github.com/grafana/loki/v3/pkg/logqlmodel/stats"
 )
 
+type Options struct {
+	MaxRunningScanTasks  int
+	MaxRunningOtherTasks int
+}
+
 // Workflow represents a physical plan that has been partitioned into
 // parallelizable tasks.
 type Workflow struct {
+	opts          Options
 	logger        log.Logger
 	runner        Runner
 	graph         dag.Graph[*Task]
@@ -33,14 +40,16 @@ type Workflow struct {
 
 	streamsMut   sync.RWMutex
 	streamStates map[*Stream]StreamState
+
+	admissionControl *admissionControl
 }
 
 // New creates a new Workflow from a physical plan. New returns an error if the
 // physical plan does not have exactly one root node, or if the physical plan
 // cannot be partitioned into a Workflow.
 //
 // The provided Runner will be used for Workflow execution.
-func New(logger log.Logger, tenantID string, runner Runner, plan *physical.Plan) (*Workflow, error) {
+func New(opts Options, logger log.Logger, tenantID string, runner Runner, plan *physical.Plan) (*Workflow, error) {
 	graph, err := planWorkflow(tenantID, plan)
 	if err != nil {
 		return nil, err
@@ -53,6 +62,7 @@ func New(logger log.Logger, tenantID string, runner Runner, plan *physical.Plan)
 	}
 
 	return &Workflow{
+		opts:          opts,
 		logger:        logger,
 		runner:        runner,
 		graph:         graph,
@@ -133,19 +143,54 @@ func (wf *Workflow) Run(ctx context.Context) (pipeline executor.Pipeline, err er
 		}
 	}
 
-	// TODO(rfratto): For logs queries, we want a system to limit how many scan
-	// tasks get sent to the runner at once.
-	//
-	// This will limit unnecessary resource consumption of workflows when
-	// there's a lot of compute capacity.
-	if err := wf.runner.Start(ctx, wrappedHandler, tasks...); err != nil {
-		pipeline.Close()
-		return nil, err
-	}
+	// Start dispatching in background goroutine
+	go func() {
+		err := wf.dispatchTasks(ctx, wrappedHandler, tasks)
+		if err != nil {
+			wrapped.SetError(err)
+			wrapped.Close()
+		}
+	}()
 
 	return wrapped, nil
 }
 
+// dispatchTasks groups the slice of tasks by their associated "admission lane" (token bucket)
+// and dispatches them to the runner in batches, acquiring lane tokens before each batch.
+// Lanes are processed sequentially: all "other" tasks are dispatched before any "scan" task.
+// It blocks until every task has been handed to the runner and returns the first error encountered.
+func (wf *Workflow) dispatchTasks(ctx context.Context, handler TaskEventHandler, tasks []*Task) error {
+	wf.admissionControl = newAdmissionControl(
+		int64(wf.opts.MaxRunningScanTasks),
+		int64(wf.opts.MaxRunningOtherTasks),
+	)
+
+	groups := wf.admissionControl.groupByType(tasks)
+	for _, taskType := range []taskType{
+		taskTypeOther,
+		taskTypeScan,
+	} {
+		lane := wf.admissionControl.get(taskType)
+		tasks := groups[taskType]
+
+		var offset int64
+		total := int64(len(tasks))
+		maxBatchSize := min(total, lane.capacity)
+
+		for ; offset < total; offset += maxBatchSize {
+			batchSize := min(maxBatchSize, total-offset)
+			if err := lane.Acquire(ctx, batchSize); err != nil {
+				return fmt.Errorf("failed to acquire tokens from admission lane %s: %w", taskType, err)
+			}
+			if err := wf.runner.Start(ctx, handler, tasks[offset:offset+batchSize]...); err != nil {
+				return fmt.Errorf("failed to start tasks: %w", err)
+			}
+		}
+	}
+
+	return nil
+}
+
 func (wf *Workflow) allStreams() []*Stream {
 	var (
 		result      []*Stream
@@ -211,13 +256,25 @@ func (wf *Workflow) onTaskChange(ctx context.Context, task *Task, newStatus Task
 	level.Debug(wf.logger).Log("msg", "task state change", "task_id", task.ULID, "new_state", newStatus.State)
 
 	wf.tasksMut.Lock()
+	oldState := wf.taskStates[task]
 	wf.taskStates[task] = newStatus.State
 	wf.tasksMut.Unlock()
 
+	if oldState == newStatus.State {
+		return
+	}
+
 	if !newStatus.State.Terminal() {
 		return
 	}
 
+	if wf.admissionControl == nil {
+		level.Warn(wf.logger).Log("msg", "admission control was not initialised")
+	} else if oldState == TaskStatePending || oldState == TaskStateRunning {
+		// Release tokens only if the task was already enqueued and therefore either pending or running.
+		defer wf.admissionControl.laneFor(task).Release(1)
+	}
+
 	if newStatus.Statistics != nil {
 		wf.mergeResults(*newStatus.Statistics)
 	}