From 3dfb74bb2f30ce0db39971181d3c48f2637d4e5c Mon Sep 17 00:00:00 2001 From: Saquib Mian Date: Thu, 25 Jul 2024 10:01:02 -0400 Subject: [PATCH 1/3] Fix b5 digest calculation for remote modules See the comment in `added_module.go` for details. A minimal repro can be found [here](TODO). --- private/bufpkg/bufmodule/added_module.go | 114 ++++++++++-------- private/bufpkg/bufmodule/module.go | 97 +++++++-------- .../bufpkg/bufmodule/module_set_builder.go | 4 +- 3 files changed, 106 insertions(+), 109 deletions(-) diff --git a/private/bufpkg/bufmodule/added_module.go b/private/bufpkg/bufmodule/added_module.go index bcb6a21ff7..ee5e241c25 100644 --- a/private/bufpkg/bufmodule/added_module.go +++ b/private/bufpkg/bufmodule/added_module.go @@ -182,16 +182,37 @@ func (a *addedModule) ToModule( // e.proto // f.proto // g.proto + // module-d:c1 + // h.proto // // Then, you have this dependency graph: // // module-a -> module-b:c1, module-c:c2 // module-b:c1 -> module-c:c1 + // module-b:c2 -> module-c:c2 + // module-c:c1 -> module-d:c1 // - // Note that module-b depends on an earlier commit of module-c than module-a does. + // Note that module-a depends on an earlier commit of module-b, which in turn depends on an earlier commit of + // module-c, but module-a overrides the commit for module-c with the latest commit. Note also that the latest commit + // of module-c drops the dependency on module-d. // - // If we were to just use the dependencies in the ModuleSet to compute the digest, the following - // would happen as a calculation: + // The unreduced graph of dependencies for module-a looks like: + // + // b:c1 - c:c1 - d:c1 + // / + // a + // \ + // c:c2 + // + // Due to dependency resolution, module-c:c1 would be overriden by module-c:c2, and this graph would reduce down to: + // + // b:c1 + // / + // a + // \ + // c:c2 + // + // Calculating the b5 digest for module-a would result in the calculation: // // DIGEST(module-a) = digest( // // module-a contents @@ -220,11 +241,12 @@ func (a *addedModule) ToModule( // ), // ) // - // Note that to compute the digest of module-b:c1, we would actually use the digest of - // module-c:c2, as opposed to module-c:c1, since within the ModuleSet, we would resolve - // to use module-c:c2 instead of module-c:c1. + // Note that to compute the digest of module-b:c1 within the context of module-a, we would actually + // - use the digest of module-c:c2, as opposed to module-c:c1, since within the ModuleSet, we would resolve + // to use module-c:c2 instead of module-c:c1; + // - not even consider the digest of module-d:c1, since within the ModuleSet, module-d doesn't exist. // - // We should be using module-c:c1 to compute the digest of module-b:c1: + // We should be using module-c:c1 and module-d:c1 to compute the digest of module-b:c1, always: // // DIGEST(module-a) = digest( // // module-a contents @@ -241,6 +263,11 @@ func (a *addedModule) ToModule( // README.md, // e.proto, // f.proto, + // // module-d:c1 digest + // DIGEST( + // README.md, + // x.proto, + // ), // ), // ), // // module-c:c2 digest @@ -252,61 +279,44 @@ func (a *addedModule) ToModule( // ), // ) // - // To accomplish this, we need to take the dependencies of the declared ModuleKeys (ie what - // the Module actually says is in its buf.lock). This function enables us to do that for - // digest calculations. Within the Module, we say that if we get a remote Module, use the - // declared ModuleKeys instead of whatever Module we have resolved to for a given ModuleFullName. - getDeclaredDepModuleKeysB5 := func() ([]ModuleKey, error) { - moduleData, err := getModuleData() - if err != nil { - return nil, err - } - declaredDepModuleKeys, err := moduleData.DeclaredDepModuleKeys() + // However, module-d:c1 is not even present in the ModuleSet because it is not a dependency of module-c:c2. + // Effectively, when calculating the digest of a remote module, we need to be certain of two things: + // 1. That all dependencies of the remote module are in the ModuleSet, at exactly the commit that the remote module + // requires. + // 2. That (1) also holds transitively for dependencies of the remote module. + // + // These properties are impossible to guarantee all the time. For that reason, we cannot predictably calculate the B5 + // digest of a remote module within a different ModuleSet from the one it was originally calculated. Instead, we must + // defer to its pre-calculated digest (i.e., the one in the ModuleKey). + // + // Note that this _does not_ side-step tamper-proofing. Tamper-proofing occurs at the ModuleData level. + getRemoteB5Digest := func() (Digest, error) { + remoteDigest, err := a.remoteModuleKey.Digest() if err != nil { return nil, err } - if len(declaredDepModuleKeys) == 0 { - return nil, nil - } - var digestType DigestType - for i, moduleKey := range declaredDepModuleKeys { - digest, err := moduleKey.Digest() - if err != nil { - return nil, err - } - if i == 0 { - digestType = digest.Type() - } else if digestType != digest.Type() { - return nil, syserror.Newf("multiple digest types found in DeclaredDepModuleKeys: %v, %v", digestType, digest.Type()) - } - } + digestType := remoteDigest.Type() switch digestType { case DigestTypeB4: - // The declared ModuleKey dependencies for a commit may be stored in v1 buf.lock file, - // in which case they will use B4 digests. B4 digests aren't allowed to be used as - // input to the B5 digest calculation, so we perform a call to convert all ModuleKeys - // from B4 to B5 by using the commit provider. - commitKeysToFetch := make([]CommitKey, len(declaredDepModuleKeys)) - for i, declaredDepModuleKey := range declaredDepModuleKeys { - commitKey, err := NewCommitKey(declaredDepModuleKey.ModuleFullName().Registry(), declaredDepModuleKey.CommitID(), DigestTypeB5) - if err != nil { - return nil, err - } - commitKeysToFetch[i] = commitKey + commitKey, err := NewCommitKey( + a.remoteModuleKey.ModuleFullName().Registry(), + a.remoteModuleKey.CommitID(), + DigestTypeB5, + ) + if err != nil { + return nil, err } - commits, err := commitProvider.GetCommitsForCommitKeys(ctx, commitKeysToFetch) + commits, err := commitProvider.GetCommitsForCommitKeys(ctx, []CommitKey{commitKey}) if err != nil { return nil, err } - if len(commits) != len(commitKeysToFetch) { - return nil, syserror.Newf("expected %d commit(s), got %d", commitKeysToFetch, len(commits)) + if len(commits) != 1 { + return nil, syserror.Newf("expected %d commit(s), got %d", 1, len(commits)) } - return slicesext.Map(commits, func(commit Commit) ModuleKey { - return commit.ModuleKey() - }), nil + return commits[0].ModuleKey().Digest() case DigestTypeB5: - // No need to fetch b5 digests - we've already got them stored in the module's declared dependencies. - return declaredDepModuleKeys, nil + // We already have a remote B5 digest, simply return it. + return remoteDigest, nil default: return nil, syserror.Newf("unsupported digest type: %v", digestType) } @@ -321,7 +331,7 @@ func (a *addedModule) ToModule( false, getV1BufYAMLObjectData, getV1BufLockObjectData, - getDeclaredDepModuleKeysB5, + getRemoteB5Digest, a.remoteTargetPaths, a.remoteTargetExcludePaths, "", diff --git a/private/bufpkg/bufmodule/module.go b/private/bufpkg/bufmodule/module.go index 18eb3d3940..a46c360c7b 100644 --- a/private/bufpkg/bufmodule/module.go +++ b/private/bufpkg/bufmodule/module.go @@ -241,16 +241,16 @@ func ModuleDirectModuleDeps(module Module) ([]ModuleDep, error) { type module struct { ModuleReadBucket - ctx context.Context - getBucket func() (storage.ReadBucket, error) - bucketID string - moduleFullName ModuleFullName - commitID uuid.UUID - isTarget bool - isLocal bool - getV1BufYAMLObjectData func() (ObjectData, error) - getV1BufLockObjectData func() (ObjectData, error) - getDeclaredDepModuleKeysB5 func() ([]ModuleKey, error) + ctx context.Context + getBucket func() (storage.ReadBucket, error) + bucketID string + moduleFullName ModuleFullName + commitID uuid.UUID + isTarget bool + isLocal bool + getV1BufYAMLObjectData func() (ObjectData, error) + getV1BufLockObjectData func() (ObjectData, error) + getRemoteB5Digest func() (Digest, error) moduleSet ModuleSet @@ -270,7 +270,7 @@ func newModule( isLocal bool, getV1BufYAMLObjectData func() (ObjectData, error), getV1BufLockObjectData func() (ObjectData, error), - getDeclaredDepModuleKeysB5 func() ([]ModuleKey, error), + getRemoteB5Digest func() (Digest, error), targetPaths []string, targetExcludePaths []string, protoFileTargetPath string, @@ -313,16 +313,16 @@ func newModule( } module := &module{ - ctx: ctx, - getBucket: syncOnceValuesGetBucketWithStorageMatcherApplied, - bucketID: bucketID, - moduleFullName: moduleFullName, - commitID: commitID, - isTarget: isTarget, - isLocal: isLocal, - getV1BufYAMLObjectData: syncext.OnceValues(getV1BufYAMLObjectData), - getV1BufLockObjectData: syncext.OnceValues(getV1BufLockObjectData), - getDeclaredDepModuleKeysB5: syncext.OnceValues(getDeclaredDepModuleKeysB5), + ctx: ctx, + getBucket: syncOnceValuesGetBucketWithStorageMatcherApplied, + bucketID: bucketID, + moduleFullName: moduleFullName, + commitID: commitID, + isTarget: isTarget, + isLocal: isLocal, + getV1BufYAMLObjectData: syncext.OnceValues(getV1BufYAMLObjectData), + getV1BufLockObjectData: syncext.OnceValues(getV1BufLockObjectData), + getRemoteB5Digest: syncext.OnceValues(getRemoteB5Digest), } moduleReadBucket, err := newModuleReadBucketForModule( ctx, @@ -399,16 +399,15 @@ func (m *module) ModuleSet() ModuleSet { func (m *module) withIsTarget(isTarget bool) (Module, error) { // We don't just call newModule directly as we don't want to double syncext.OnceValues stuff. newModule := &module{ - ctx: m.ctx, - getBucket: m.getBucket, - bucketID: m.bucketID, - moduleFullName: m.moduleFullName, - commitID: m.commitID, - isTarget: isTarget, - isLocal: m.isLocal, - getV1BufYAMLObjectData: m.getV1BufYAMLObjectData, - getV1BufLockObjectData: m.getV1BufLockObjectData, - getDeclaredDepModuleKeysB5: m.getDeclaredDepModuleKeysB5, + ctx: m.ctx, + getBucket: m.getBucket, + bucketID: m.bucketID, + moduleFullName: m.moduleFullName, + commitID: m.commitID, + isTarget: isTarget, + isLocal: m.isLocal, + getV1BufYAMLObjectData: m.getV1BufYAMLObjectData, + getV1BufLockObjectData: m.getV1BufLockObjectData, } moduleReadBucket, ok := m.ModuleReadBucket.(*moduleReadBucket) if !ok { @@ -452,34 +451,22 @@ func newGetDigestFuncForModuleAndDigestType(module *module, digestType DigestTyp } return getB4Digest(module.ctx, bucket, v1BufYAMLObjectData, v1BufLockObjectData) case DigestTypeB5: + if !module.isLocal { + // For remote modules to have consistent B5 digests, they must not change their + // dependencies -- the considered dependencies must be exactly the dependencies the remote module was originally + // built with. This means that we cannot consider the modules within the current ModuleSet, as those may not + // match. Instead, there are two ways to do this deterministically: + // 1. Construct a new ModuleSet with just this module and its declared deps. + // 2. Fetch the module's digest remotely from the BSR. + // + // Given that we are not tamper proofing here (tamper-proofing is done at the ModuleData level), we opt to fetch + // the digest remotely from the BSR. + return module.getRemoteB5Digest() + } moduleDeps, err := module.ModuleDeps() if err != nil { return nil, err } - // For remote modules to have consistent B5 digests, they must not change the digests of their - // dependencies based on the local workspace. Use the pruned b5 module keys from - // ModuleData.DeclaredDepModuleKeys to calculate the digest. - if !module.isLocal { - declaredDepModuleKeys, err := module.getDeclaredDepModuleKeysB5() - if err != nil { - return nil, err - } - moduleDepFullNames := make(map[string]struct{}, len(moduleDeps)) - for _, dep := range moduleDeps { - fullName := dep.ModuleFullName() - if fullName == nil { - return nil, syserror.Newf("remote module dependencies should have full names") - } - moduleDepFullNames[fullName.String()] = struct{}{} - } - prunedDepModuleKeys := make([]ModuleKey, 0, len(declaredDepModuleKeys)) - for _, dep := range declaredDepModuleKeys { - if _, ok := moduleDepFullNames[dep.ModuleFullName().String()]; ok { - prunedDepModuleKeys = append(prunedDepModuleKeys, dep) - } - } - return getB5DigestForBucketAndDepModuleKeys(module.ctx, bucket, prunedDepModuleKeys) - } return getB5DigestForBucketAndModuleDeps(module.ctx, bucket, moduleDeps) default: return nil, syserror.Newf("unknown DigestType: %v", digestType) diff --git a/private/bufpkg/bufmodule/module_set_builder.go b/private/bufpkg/bufmodule/module_set_builder.go index fe39aa3755..1cbb9df86e 100644 --- a/private/bufpkg/bufmodule/module_set_builder.go +++ b/private/bufpkg/bufmodule/module_set_builder.go @@ -335,10 +335,10 @@ func (b *moduleSetBuilder) AddLocalModule( true, func() (ObjectData, error) { return localModuleOptions.v1BufYAMLObjectData, nil }, func() (ObjectData, error) { return localModuleOptions.v1BufLockObjectData, nil }, - func() ([]ModuleKey, error) { + func() (Digest, error) { // See comment in added_module.go when we construct remote Modules for why // we have this function in the first place. - return nil, syserror.Newf("getDeclaredDepModuleKeysB5 should never be called for a local Module") + return nil, syserror.Newf("getRemoteB5Digest should never be called for a local Module") }, localModuleOptions.targetPaths, localModuleOptions.targetExcludePaths, From 15840742a73c8c9da16bf179c27f8f696126b56c Mon Sep 17 00:00:00 2001 From: Saquib Mian Date: Thu, 25 Jul 2024 11:35:54 -0400 Subject: [PATCH 2/3] cleanup --- private/bufpkg/bufmodule/added_module.go | 46 +++++++++++++++--------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/private/bufpkg/bufmodule/added_module.go b/private/bufpkg/bufmodule/added_module.go index ee5e241c25..8dcc08e9ee 100644 --- a/private/bufpkg/bufmodule/added_module.go +++ b/private/bufpkg/bufmodule/added_module.go @@ -192,25 +192,39 @@ func (a *addedModule) ToModule( // module-b:c2 -> module-c:c2 // module-c:c1 -> module-d:c1 // - // Note that module-a depends on an earlier commit of module-b, which in turn depends on an earlier commit of - // module-c, but module-a overrides the commit for module-c with the latest commit. Note also that the latest commit - // of module-c drops the dependency on module-d. + // Note that module-a depends on module-b:c1, which in turn depends on module-c:c1, but module-a also depends on + // module-c:c2. When resolving the dependency graph, module-c:c1 is replaced with the newer module-c:c2. The commit + // module-c:c2 doesn't depend on module-d, so module-d:c1 is dropped. Graphically, this graph: // - // The unreduced graph of dependencies for module-a looks like: + // +-------------+ +-------------+ +-------------+ +-------------+ + // | | | | | | | | + // | module-a:c1 |--->| module-b:c1 |--->| module-c:c1 |--->| module-d:c1 | + // | | | | | | | | + // +-------------+ +-------------+ +-------------+ +-------------+ + // \ + // \ + // \ + // \ +-------------+ + // \ | | + // ------>| module-c:c2 | + // | | + // +-------------+ // - // b:c1 - c:c1 - d:c1 - // / - // a - // \ - // c:c2 + // is pruned to this graph: // - // Due to dependency resolution, module-c:c1 would be overriden by module-c:c2, and this graph would reduce down to: - // - // b:c1 - // / - // a - // \ - // c:c2 + // +-------------+ +-------------+ + // | | | | + // | module-a:c1 |--->| module-b:c1 | + // | | | | + // +-------------+ +-------------+ + // \ | + // \ | + // \ v + // \ +-------------+ + // \ | | + // ------>| module-c:c2 | + // | | + // +-------------+ // // Calculating the b5 digest for module-a would result in the calculation: // From c31f02fe34b11508d73ff56e951d282667070960 Mon Sep 17 00:00:00 2001 From: Saquib Mian Date: Thu, 25 Jul 2024 12:02:58 -0400 Subject: [PATCH 3/3] fix typo --- private/bufpkg/bufmodule/added_module.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/private/bufpkg/bufmodule/added_module.go b/private/bufpkg/bufmodule/added_module.go index 8dcc08e9ee..3e6d0e3d26 100644 --- a/private/bufpkg/bufmodule/added_module.go +++ b/private/bufpkg/bufmodule/added_module.go @@ -280,7 +280,7 @@ func (a *addedModule) ToModule( // // module-d:c1 digest // DIGEST( // README.md, - // x.proto, + // h.proto, // ), // ), // ),