Skip to content

Commit

Permalink
Increase performance of hash function
Browse files Browse the repository at this point in the history
  • Loading branch information
keybraker committed Oct 2, 2024
1 parent f7ccad1 commit d4ad2c7
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: '1.22'
go-version: '1.20'

- name: Build
run: |
Expand Down
10 changes: 2 additions & 8 deletions app/consumer.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ func consumer(
geoLocation bool,
format string,
verbose bool,
totalFiles int,
duplicateStrategy string,
processedFiles *int64,
done chan<- struct{}) {
Expand All @@ -39,7 +38,6 @@ func consumer(
geoLocation,
format,
verbose,
totalFiles,
duplicateStrategy,
)

Expand All @@ -59,7 +57,6 @@ func processFileInfo(
geoLocation bool,
format string,
verbose bool,
totalFiles int,
duplicateStrategy string,
) {
var generatedPath string
Expand Down Expand Up @@ -93,9 +90,6 @@ func processFileInfo(
fileInfo.Path,
generatedPath,
verbose,
/*processedImages,*/
0,
totalFiles,
fileInfo.isDuplicate,
duplicateStrategy,
)
Expand All @@ -104,14 +98,14 @@ func processFileInfo(
}
}

func moveFile(sourcePath, destinationPath string, verbose bool /*processedImages *list.List,*/, processedFiles int, totalFiles int, isDuplicate bool, duplicateStrategy string) error {
func moveFile(sourcePath, destinationPath string, verbose bool, isDuplicate bool, duplicateStrategy string) error {
destPath := filepath.Dir(destinationPath)
if err := os.MkdirAll(destPath, os.ModePerm); err != nil {
return fmt.Errorf("failed to create destination directory %s: %v", destPath, err)
}

if verbose {
moveActionLog, err := logMoveAction(sourcePath, destPath, isDuplicate, duplicateStrategy, processedFiles, totalFiles)
moveActionLog, err := logMoveAction(sourcePath, destPath, isDuplicate, duplicateStrategy)
if err != nil {
return err
}
Expand Down
4 changes: 2 additions & 2 deletions app/creator.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,13 +120,13 @@ func processFile(
switch duplicateStrategy {
case "skip":
fmt.Printf("Skipped duplicate file: %v\n", path)
logMoveAction(path, "", true, duplicateStrategy, 0, 0)
logMoveAction(path, "", true, duplicateStrategy)
return
case "delete":
if err := os.Remove(path); err != nil {
errorQueue <- fmt.Errorf("failed to delete duplicate file: %v", err)
} else {
logMoveAction(path, "", true, duplicateStrategy, 0, 0)
logMoveAction(path, "", true, duplicateStrategy)
}
return
}
Expand Down
2 changes: 1 addition & 1 deletion app/logger.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func infoHandler(infoQueue chan string) {
}
}

func logMoveAction(sourcePath, destinationDirectory string, isDuplicate bool, duplicateStrategy string, processedFiles int, totalFiles int) (string, error) {
func logMoveAction(sourcePath, destinationDirectory string, isDuplicate bool, duplicateStrategy string) (string, error) {
colorCode := "\033[32m"
actionName := "Moved (original)"

Expand Down
14 changes: 8 additions & 6 deletions app/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,7 @@ func main() {
flag.Parse()
fileTypes := flagProcessor()

sourcePath := filepath.Clean(*inputPath)
destinationPath := filepath.Clean(*outputPath)
validatePaths(sourcePath, destinationPath)
sourcePath, destinationPath := validatePaths(*inputPath, *outputPath)

fileQueue := make(chan FileInfo, 100)
infoQueue := make(chan string, 50)
Expand Down Expand Up @@ -114,7 +112,6 @@ func main() {
*geoLocation,
*format,
*verbose,
totalFilesToMove,
*duplicateStrategy,
&processedFiles,
done,
Expand Down Expand Up @@ -142,7 +139,7 @@ func formatElapsedTime(elapsed time.Duration) string {
return fmt.Sprintf("%d minutes and %d seconds", minutes, seconds)
}

return fmt.Sprintf("%.2f seconds.", elapsed.Seconds())
return fmt.Sprintf("%.2f seconds", elapsed.Seconds())
}

func spinner(stopSpinner chan bool, verb string, processedFiles *int64, totalFiles int) {
Expand Down Expand Up @@ -269,7 +266,10 @@ func directoryExists(path string) error {
return nil
}

func validatePaths(sourcePath, destinationPath string) {
func validatePaths(inputPath, outputPath string) (string, string) {
sourcePath := filepath.Clean(inputPath)
destinationPath := filepath.Clean(outputPath)

if sourcePath == "" || destinationPath == "" {
logger(LoggerTypeFatal, "input and output paths must be supplied")
}
Expand All @@ -286,4 +286,6 @@ func validatePaths(sourcePath, destinationPath string) {
} else if err := directoryExists(destinationPath); err != nil {
logger(LoggerTypeFatal, err.Error())
}

return sourcePath, destinationPath
}
6 changes: 6 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,9 @@ module github.com/keybraker/mediarizer-2
go 1.22

require github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd

require (
github.com/klauspost/cpuid/v2 v2.0.12 // indirect
github.com/zeebo/blake3 v0.2.4 // indirect
golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 // indirect
)
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
github.com/klauspost/cpuid/v2 v2.0.12 h1:p9dKCg8i4gmOxtv35DvrYoWqYzQrvEVdjQ762Y0OqZE=
github.com/klauspost/cpuid/v2 v2.0.12/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c=
github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd h1:CmH9+J6ZSsIjUK3dcGsnCnO41eRBOnY12zwkn5qVwgc=
github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd/go.mod h1:hPqNNc0+uJM6H+SuU8sEs5K5IQeKccPqeSjfgcKGgPk=
github.com/zeebo/blake3 v0.2.4 h1:KYQPkhpRtcqh0ssGYcKLG1JYvddkEA8QwCM/yBqhaZI=
github.com/zeebo/blake3 v0.2.4/go.mod h1:7eeQ6d2iXWRGF6npfaxl2CU+xy2Fjo2gxeyZGCRUjcE=
golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk=
golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY=
93 changes: 71 additions & 22 deletions hash/hash.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,36 @@ import (
"strings"
"sync"
"sync/atomic"
"time"

"golang.org/x/exp/mmap"
)

type (
	// FileMeta records the size and modification time of a file.
	FileMeta struct {
		Size    int64     // file size in bytes
		ModTime time.Time // last modification time
	}

	// CachedFile couples a file's metadata with its computed hash.
	// FileMeta is embedded, so Size and ModTime are accessible directly
	// on a CachedFile value.
	CachedFile struct {
		FileMeta
		Hash []byte // hash bytes computed for the file
	}
)

// readerAtWrapper adapts an io.ReaderAt to the sequential io.Reader
// interface by remembering the position of the next byte to read.
type readerAtWrapper struct {
	readerAt io.ReaderAt // underlying random-access source
	offset   int64       // position of the next byte to read
	size     int64       // number of bytes to expose; reads never go past it
}

// Read implements io.Reader. It fills p with bytes starting at the current
// offset and advances the offset by the number of bytes read. Once offset
// reaches size it returns 0, io.EOF.
//
// Any error from the underlying ReadAt (including io.EOF on a short read)
// is returned alongside the bytes that were read.
func (r *readerAtWrapper) Read(p []byte) (n int, err error) {
	if r.offset >= r.size {
		return 0, io.EOF
	}

	// Clamp the buffer to the bytes still inside [offset, size). Without
	// this, the final chunk could return data beyond size whenever the
	// underlying source is longer than the size this wrapper was given.
	if remaining := r.size - r.offset; int64(len(p)) > remaining {
		p = p[:remaining]
	}

	n, err = r.readerAt.ReadAt(p, r.offset)
	r.offset += int64(n)
	return n, err
}

// isImageFile checks if the file is an image based on its extension.
func isImageFile(filePath string) bool {
lowerFilePath := strings.ToLower(filePath)
Expand All @@ -23,45 +51,70 @@ func isImageFile(filePath string) bool {

// calculateFileHash calculates the SHA-256 hash of the file at the given filePath.
func calculateFileHash(filePath string) ([]byte, error) {
file, err := os.Open(filePath)
readerAt, err := mmap.Open(filePath)
if err != nil {
return nil, fmt.Errorf("failed to memory-map file %s: %v", filePath, err)
}
defer readerAt.Close()

fileInfo, err := os.Stat(filePath)
if err != nil {
return nil, fmt.Errorf("failed to open file at %s: %v", filePath, err)
return nil, fmt.Errorf("failed to stat file %s: %v", filePath, err)
}
fileSize := fileInfo.Size()

reader := &readerAtWrapper{
readerAt: readerAt,
offset: 0,
size: fileSize,
}
defer file.Close()

hash := sha256.New()
if _, err := io.Copy(hash, file); err != nil {
return nil, fmt.Errorf("failed to calculate hash for file: %v", err)
if _, err := io.Copy(hash, reader); err != nil {
return nil, fmt.Errorf("failed to calculate hash for file %s: %v", filePath, err)
}

return hash.Sum(nil), nil
}

// GetFileHash retrieves or calculates the hash of the file at filePath.
func GetFileHash(filePath string, hashCache *sync.Map) ([]byte, error) {
if hash, found := hashCache.Load(filePath); found {
return hash.([]byte), nil
info, err := os.Stat(filePath)
if err != nil {
return nil, err
}
meta := FileMeta{Size: info.Size(), ModTime: info.ModTime()}

calculatedHash, err := calculateFileHash(filePath)
if cached, found := hashCache.Load(filePath); found {
cachedFile := cached.(CachedFile)
if cachedFile.Size == meta.Size && cachedFile.ModTime.Equal(meta.ModTime) {
return cachedFile.Hash, nil
}
}

hashValue, err := calculateFileHash(filePath)
if err != nil {
return nil, err
}

hashCache.Store(filePath, calculatedHash)
return calculatedHash, nil
cachedFile := CachedFile{
FileMeta: meta,
Hash: hashValue,
}
hashCache.Store(filePath, cachedFile)

return hashValue, nil
}

// HashImagesInPath hashes all images in the given path and updates the fileHashMap.
func HashImagesInPath(path string, hashCache *sync.Map, hashedFiles *int64) (*sync.Map, error) {
fileHashMap := &sync.Map{}
fileChan := make(chan string) // Channel to pass file paths to workers
errChan := make(chan error) // Channel to collect errors
var wg sync.WaitGroup // WaitGroup to track the worker goroutines
fileChan := make(chan string)
errChan := make(chan error)
var wg sync.WaitGroup

numWorkers := runtime.NumCPU() / 2
numWorkers := runtime.NumCPU() * 4

// Start worker goroutines
for i := 0; i < numWorkers; i++ {
wg.Add(1)
go func() {
Expand All @@ -84,35 +137,31 @@ func HashImagesInPath(path string, hashCache *sync.Map, hashedFiles *int64) (*sy
}()
}

// Walk the directory and send file paths to the channel
go func() {
defer close(fileChan) // Close the channel when done
defer close(fileChan)
err := filepath.Walk(path, func(filePath string, info os.FileInfo, err error) error {
if err != nil {
errChan <- fmt.Errorf("failed to walk path %s: %v", filePath, err)
return err
}

if !info.IsDir() {
fileChan <- filePath // Send file to channel for hashing
fileChan <- filePath
}

return nil
})

// If an error occurred during filepath walk, send it to the error channel
if err != nil {
errChan <- err
}
}()

// Wait for all workers to finish
go func() {
wg.Wait()
close(errChan) // Close error channel when all workers are done
close(errChan)
}()

// Check for errors during execution
for err := range errChan {
if err != nil {
return nil, err
Expand Down

0 comments on commit d4ad2c7

Please sign in to comment.