From d5c3a86252e6e98e7f0d324132933d20ac040ae1 Mon Sep 17 00:00:00 2001 From: Shayon Mukherjee Date: Sun, 3 Dec 2023 11:35:48 -0500 Subject: [PATCH] Capture a relationship between branch and branch commits (#6) One commit can also be related to multiple branches. This slows down the sync process, so branch commits are synced only for the default branch. It is still slow after that, so introducing this can wait. --- lib/branch_base/database.rb | 26 ++++++++------- lib/branch_base/repository.rb | 24 ++++++++++++-- lib/branch_base/sync.rb | 54 +++++++++++++++++++++---------- spec/branch_base/database_spec.rb | 1 + spec/branch_base/sync_spec.rb | 46 ++++++++++++++++++++++++++ 5 files changed, 120 insertions(+), 31 deletions(-) diff --git a/lib/branch_base/database.rb b/lib/branch_base/database.rb index 517edd9..3ed50a3 100644 --- a/lib/branch_base/database.rb +++ b/lib/branch_base/database.rb @@ -43,21 +43,20 @@ def setup_schema FOREIGN KEY (repo_id) REFERENCES repositories (repo_id) ); - CREATE INDEX IF NOT EXISTS idx_commits_repo_id ON commits (repo_id); - CREATE INDEX IF NOT EXISTS idx_commits_author ON commits (author); - CREATE INDEX IF NOT EXISTS idx_commits_committer ON commits (committer); - CREATE TABLE IF NOT EXISTS branches ( branch_id INTEGER PRIMARY KEY AUTOINCREMENT, repo_id INTEGER NOT NULL, name TEXT NOT NULL, - head_commit TEXT NOT NULL, - FOREIGN KEY (repo_id) REFERENCES repositories (repo_id), - FOREIGN KEY (head_commit) REFERENCES commits (commit_hash) + FOREIGN KEY (repo_id) REFERENCES repositories (repo_id) ); - CREATE INDEX IF NOT EXISTS idx_branches_repo_id ON branches (repo_id); - CREATE INDEX IF NOT EXISTS idx_branches_head_commit ON branches (head_commit); + CREATE TABLE IF NOT EXISTS branch_commits ( + branch_id INTEGER NOT NULL, + commit_hash TEXT NOT NULL, + PRIMARY KEY (branch_id, commit_hash), + FOREIGN KEY (branch_id) REFERENCES branches (branch_id), + FOREIGN KEY (commit_hash) REFERENCES commits (commit_hash) + ); CREATE TABLE IF NOT EXISTS files ( file_id 
INTEGER PRIMARY KEY AUTOINCREMENT, @@ -68,9 +67,6 @@ def setup_schema FOREIGN KEY (latest_commit) REFERENCES commits (commit_hash) ); - CREATE INDEX IF NOT EXISTS idx_files_repo_id ON files (repo_id); - CREATE INDEX IF NOT EXISTS idx_files_file_path ON files (file_path); - CREATE TABLE IF NOT EXISTS commit_files ( commit_hash TEXT NOT NULL, file_id INTEGER NOT NULL, @@ -88,6 +84,12 @@ def setup_schema FOREIGN KEY (parent_hash) REFERENCES commits (commit_hash) ); + CREATE INDEX IF NOT EXISTS idx_commits_repo_id ON commits (repo_id); + CREATE INDEX IF NOT EXISTS idx_commits_author ON commits (author); + CREATE INDEX IF NOT EXISTS idx_commits_committer ON commits (committer); + CREATE INDEX IF NOT EXISTS idx_branches_repo_id ON branches (repo_id); + CREATE INDEX IF NOT EXISTS idx_files_repo_id ON files (repo_id); + CREATE INDEX IF NOT EXISTS idx_files_file_path ON files (file_path); CREATE INDEX IF NOT EXISTS idx_commit_parents_commit_hash ON commit_parents (commit_hash); CREATE INDEX IF NOT EXISTS idx_commit_parents_parent_hash ON commit_parents (parent_hash); SQL diff --git a/lib/branch_base/repository.rb b/lib/branch_base/repository.rb index 2f789de..e39c7d4 100644 --- a/lib/branch_base/repository.rb +++ b/lib/branch_base/repository.rb @@ -4,12 +4,32 @@ module BranchBase class Repository + attr_reader :repo + def initialize(repo_path) @repo = Rugged::Repository.new(repo_path) end - def walk(&block) - @repo.walk(@repo.head.target.oid, Rugged::SORT_TOPO, &block) + def walk(branch_name = nil, &block) + # Use the provided branch's head commit OID if a branch name is given, + # otherwise, use the repository's HEAD commit OID. 
+ oid = + if branch_name + branch = @repo.branches[branch_name] + raise ArgumentError, "Branch not found: #{branch_name}" unless branch + branch.target.oid + else + @repo.head.target.oid + end + + @repo.walk(oid, Rugged::SORT_TOPO, &block) + end + + def default_branch_name + head_ref = @repo.head.name + head_ref.sub(%r{^refs/heads/}, "") + rescue Rugged::ReferenceError + nil end def path diff --git a/lib/branch_base/sync.rb b/lib/branch_base/sync.rb index 4e581cd..7d638d4 100644 --- a/lib/branch_base/sync.rb +++ b/lib/branch_base/sync.rb @@ -39,32 +39,52 @@ def sync_repository end def sync_branches(repo_id) - BranchBase.logger.debug( - "Syncing branches for repository ID: #{@repo.path}", - ) + BranchBase.logger.debug("Syncing branches for repository ID: #{repo_id}") - batched_branches = [] + default_branch_name = @repo.default_branch_name + return unless default_branch_name @repo.branches.each do |branch| next if branch.name.nil? || branch.target.nil? - commit_oid = - ( - if branch.target.respond_to?(:oid) - branch.target.oid - else - branch.target.target.oid - end - ) - batched_branches << [repo_id, branch.name, commit_oid] + branch_id = insert_branch(repo_id, branch.name) - if batched_branches.size >= BATCH_SIZE - insert_branches(batched_branches) - batched_branches.clear + if branch.name == default_branch_name + insert_branch_commits(branch_id, branch) end end + end + + def insert_branch(repo_id, branch_name) + existing_branch_id = + @db.execute( + "SELECT branch_id FROM branches WHERE name = ? AND repo_id = ?", + [branch_name, repo_id], + ).first + return existing_branch_id[0] if existing_branch_id - insert_branches(batched_branches) unless batched_branches.empty? 
+ @db.execute( + "INSERT INTO branches (repo_id, name) VALUES (?, ?)", + [repo_id, branch_name], + ) + @db.last_insert_row_id + end + + def insert_branch_commits(branch_id, branch) + BranchBase.logger.debug("Syncing branch commits for: #{branch.name}") + + head_commit = branch.target + walker = Rugged::Walker.new(@repo.repo) + walker.push(head_commit) + + walker.each do |commit| + next if commit_exists?(commit.oid) + + @db.execute( + "INSERT OR IGNORE INTO branch_commits (branch_id, commit_hash) VALUES (?, ?)", + [branch_id, commit.oid], + ) + end end def sync_commits(repo_id) diff --git a/spec/branch_base/database_spec.rb b/spec/branch_base/database_spec.rb index c2733b5..91accbc 100644 --- a/spec/branch_base/database_spec.rb +++ b/spec/branch_base/database_spec.rb @@ -16,6 +16,7 @@ commit_files commit_parents sqlite_sequence + branch_commits ] expect(tables.flatten).to match_array(expected_tables) end diff --git a/spec/branch_base/sync_spec.rb b/spec/branch_base/sync_spec.rb index 49cc123..b29a5f9 100644 --- a/spec/branch_base/sync_spec.rb +++ b/spec/branch_base/sync_spec.rb @@ -105,4 +105,50 @@ end end end + + describe "#sync_branch_commits" do + it "associates commits only with the default branch" do + sync.sync_branches(@repo_id) + sync.sync_commits(@repo_id) + + default_branch_name = repo.default_branch_name + default_branch_id = + db + .execute( + "SELECT branch_id FROM branches WHERE repo_id = ? AND name = ?", + [@repo_id, default_branch_name], + ) + .first + &.first + + git_commits = [] + repo.walk(default_branch_name) { |commit| git_commits << commit.oid } + + db_commit_hashes = + db.execute( + "SELECT commit_hash FROM branch_commits WHERE branch_id = ?", + default_branch_id, + ).flatten + + expect(git_commits.size).to eq(db_commit_hashes.size) + git_commits.each do |commit_oid| + expect(db_commit_hashes).to include(commit_oid) + end + + other_branch_ids = + db.execute( + "SELECT branch_id FROM branches WHERE repo_id = ? 
AND name != ?", + [@repo_id, default_branch_name], + ).flatten + + other_branch_ids.each do |branch_id| + other_branch_commit_hashes = + db.execute( + "SELECT commit_hash FROM branch_commits WHERE branch_id = ?", + branch_id, + ).flatten + expect(other_branch_commit_hashes).to be_empty + end + end + end end