Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions ChecksumCalculator/ChecksumCalculator.sln
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@

Microsoft Visual Studio Solution File, Format Version 12.00
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ChecksumCalculator", "ChecksumCalculator\ChecksumCalculator.csproj", "{E5B72DED-8E4D-4570-B3F1-A94FB83B7EE1}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ChecksumCalculatorTests", "ChecksumCalculatorTests\ChecksumCalculatorTests.csproj", "{3B376E7C-E6A1-4A70-B5D4-79C68AB4A39B}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{E5B72DED-8E4D-4570-B3F1-A94FB83B7EE1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{E5B72DED-8E4D-4570-B3F1-A94FB83B7EE1}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E5B72DED-8E4D-4570-B3F1-A94FB83B7EE1}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E5B72DED-8E4D-4570-B3F1-A94FB83B7EE1}.Release|Any CPU.Build.0 = Release|Any CPU
{3B376E7C-E6A1-4A70-B5D4-79C68AB4A39B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{3B376E7C-E6A1-4A70-B5D4-79C68AB4A39B}.Debug|Any CPU.Build.0 = Debug|Any CPU
{3B376E7C-E6A1-4A70-B5D4-79C68AB4A39B}.Release|Any CPU.ActiveCfg = Release|Any CPU
{3B376E7C-E6A1-4A70-B5D4-79C68AB4A39B}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
EndGlobal
21 changes: 21 additions & 0 deletions ChecksumCalculator/ChecksumCalculator/ChecksumCalculator.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net9.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="StyleCop.Analyzers" Version="1.2.0-beta.556">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>

<ItemGroup>
<AdditionalFiles Include="stylecop.json" />
</ItemGroup>

</Project>
124 changes: 124 additions & 0 deletions ChecksumCalculator/ChecksumCalculator/ParallelChecksumCalc.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
// <copyright file="ParallelChecksumCalc.cs" company="khusainovilas">
// Copyright (c) khusainovilas. All rights reserved.
// </copyright>

namespace ChecksumCalculator;

using System.Security.Cryptography;
using System.Text;

/// <summary>
/// Calculates a deterministic MD5 checksum of a directory in multithreaded mode.
/// </summary>
public class ParallelChecksumCalc
{
// Upper bound on the number of files hashed at the same time; taken from Environment.ProcessorCount.
private static readonly int MaxDegreeOfParallelism = Environment.ProcessorCount;
// Static, hence shared by every instance: each file hash acquires one slot before reading and releases it afterwards.
private static readonly SemaphoreSlim FileReadSemaphore = new(MaxDegreeOfParallelism, MaxDegreeOfParallelism);
Comment on lines +15 to +16

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Я бы такими низкоуровневыми вещами не занимался, а оставил бы это штатному пулу потоков. Это не ошибка, но в целом пустая трата усилий.


/// <summary>
/// Validates the supplied path and computes the MD5 checksum of the directory tree rooted at it,
/// hashing the directory entries concurrently.
/// </summary>
/// <param name="directoryPath">Absolute or relative path of the directory to hash.</param>
/// <param name="cancellationToken">Token used to cancel the computation.</param>
/// <returns>A task whose result is the raw MD5 hash bytes of the directory.</returns>
/// <exception cref="ArgumentException">The path is empty or consists only of white space.</exception>
/// <exception cref="DirectoryNotFoundException">The resolved path does not refer to an existing directory.</exception>
public async Task<byte[]> ComputeChecksumAsync(string directoryPath, CancellationToken cancellationToken = default)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(directoryPath);

    // Resolve once so that the error message and the traversal use the same absolute path.
    var absolutePath = Path.GetFullPath(directoryPath);

    if (Directory.Exists(absolutePath))
    {
        var checksum = await this.ComputeDirectoryHashAsync(absolutePath, cancellationToken)
            .ConfigureAwait(false);
        return checksum;
    }

    throw new DirectoryNotFoundException($"Directory not found: {absolutePath}");
}

/// <summary>
/// Computes the directory checksum and encodes the resulting hash bytes as a Base64 string.
/// </summary>
/// <param name="directoryPath">Absolute or relative path of the directory to hash.</param>
/// <param name="cancellationToken">Token used to cancel the computation.</param>
/// <returns>A task whose result is the Base64 representation of the MD5 hash.</returns>
public async Task<string> ComputeChecksumBase64Async(string directoryPath, CancellationToken cancellationToken = default)
    => Convert.ToBase64String(
        await this.ComputeChecksumAsync(directoryPath, cancellationToken).ConfigureAwait(false));

/// <summary>
/// Asynchronously waits for a free slot in the shared file-read semaphore.
/// </summary>
/// <param name="cancellationToken">Token that aborts the wait.</param>
/// <returns>A task-like value that completes once a slot has been acquired.</returns>
private static async ValueTask SemaphoreSlimWaitAsync(CancellationToken cancellationToken)
    => await FileReadSemaphore.WaitAsync(cancellationToken).ConfigureAwait(false);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Используйте =>


/// <summary>
/// Recursively computes the hash of a directory, defined as
/// MD5(UTF-8 directory name + hashes of all children in ordinal name order).
/// Child entries are hashed concurrently; their results are combined in sorted order,
/// so the outcome does not depend on task completion order.
/// </summary>
/// <param name="directoryPath">Absolute path of the directory to hash.</param>
/// <param name="cancellationToken">Token used to cancel the computation.</param>
/// <returns>A task whose result is the MD5 hash bytes of the directory.</returns>
private async Task<byte[]> ComputeDirectoryHashAsync(
    string directoryPath,
    CancellationToken cancellationToken)
{
    // Mirrors the sequential implementation: stop descending as soon as cancellation is requested.
    cancellationToken.ThrowIfCancellationRequested();

    // Path.GetFileName yields an empty string for a root path (or one ending in a separator);
    // fall back to the path itself so that the name component is never empty.
    var directoryName = Path.GetFileName(directoryPath);
    if (string.IsNullOrEmpty(directoryName))
    {
        directoryName = directoryPath;
    }

    var nameBytes = Encoding.UTF8.GetBytes(directoryName);

    // Ordinal ordering by entry name keeps the checksum independent of culture and enumeration order.
    var entries = Directory.GetFileSystemEntries(directoryPath)
        .OrderBy(Path.GetFileName, StringComparer.Ordinal)
        .ToArray();

    var childHashTasks = new List<Task<byte[]>>(entries.Length);

    foreach (var entryPath in entries)
    {
        childHashTasks.Add(
            Directory.Exists(entryPath)
                ? this.ComputeDirectoryHashAsync(entryPath, cancellationToken)
                : this.ComputeFileHashAsync(entryPath, cancellationToken));
    }

    // Task.WhenAll returns results in the order the tasks were supplied, i.e. sorted entry order.
    var childHashes = await Task.WhenAll(childHashTasks).ConfigureAwait(false);

    var combinedBuffer = new byte[nameBytes.Length + childHashes.Sum(hash => hash.Length)];
    nameBytes.CopyTo(combinedBuffer, 0);

    var currentOffset = nameBytes.Length;
    foreach (var childHash in childHashes)
    {
        childHash.CopyTo(combinedBuffer, currentOffset);
        currentOffset += childHash.Length;
    }

    // The static one-shot API needs no disposable MD5 instance.
    return MD5.HashData(combinedBuffer);
}

/// <summary>
/// Computes the hash of a single file, defined as MD5(UTF-8 file name + file content).
/// The content is streamed through an incremental hash in fixed-size chunks, so memory use
/// does not grow with file size and files of 2 GiB or more are handled correctly.
/// </summary>
/// <param name="filePath">Path of the file to hash.</param>
/// <param name="cancellationToken">Token used to cancel waiting for a read slot and the reads themselves.</param>
/// <returns>A task whose result is the MD5 hash bytes of the file.</returns>
private async Task<byte[]> ComputeFileHashAsync(string filePath, CancellationToken cancellationToken)
{
    const int BufferSize = 81920;

    var nameBytes = Encoding.UTF8.GetBytes(Path.GetFileName(filePath));

    // Acquire a slot before opening the file so that the number of simultaneously open files stays bounded.
    await SemaphoreSlimWaitAsync(cancellationToken).ConfigureAwait(false);

    try
    {
        await using var fileStream = new FileStream(
            filePath,
            FileMode.Open,
            FileAccess.Read,
            FileShare.Read,
            BufferSize,
            FileOptions.Asynchronous | FileOptions.SequentialScan);

        using var hasher = IncrementalHash.CreateHash(HashAlgorithmName.MD5);

        // Feeding the name first and the content afterwards equals hashing their concatenation.
        hasher.AppendData(nameBytes);

        var buffer = new byte[BufferSize];
        int bytesRead;
        while ((bytesRead = await fileStream.ReadAsync(buffer, cancellationToken).ConfigureAwait(false)) > 0)
        {
            hasher.AppendData(buffer, 0, bytesRead);
        }

        return hasher.GetHashAndReset();
    }
    finally
    {
        // Always give the slot back, including on I/O failure or cancellation during reading.
        FileReadSemaphore.Release();
    }
}
}
41 changes: 41 additions & 0 deletions ChecksumCalculator/ChecksumCalculator/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// <copyright file="Program.cs" company="khusainovilas">
// Copyright (c) khusainovilas. All rights reserved.
// </copyright>

using System.Diagnostics;
using ChecksumCalculator;

// Entry point: hashes the directory given as the first argument with both the sequential and the
// parallel calculator, prints both results with timings, and reports whether they agree.
// Exit code: 0 when the hashes match, 1 on a usage error, a missing directory, or a mismatch.
if (args.Length == 0)
{
    // Previously the program exited silently here; tell the user what is expected.
    Console.Error.WriteLine("Usage: ChecksumCalculator <directory>");
    return 1;
}

var directoryPath = args[0];

if (!Directory.Exists(directoryPath))
{
    Console.Error.WriteLine($"Error: directory not found: {directoryPath}");
    return 1;
}

var sequential = new SequentialChecksumCalc();
var parallel = new ParallelChecksumCalc();

Console.WriteLine($"Calculating checksum for: {Path.GetFullPath(directoryPath)}");
Console.WriteLine();

var sw = Stopwatch.StartNew();
var hashSeq = await sequential.ComputeChecksumBase64Async(directoryPath);
sw.Stop();
var timeSeq = sw.Elapsed.TotalSeconds;

// Restart resets the elapsed time, so the second measurement covers the parallel run only.
sw.Restart();
var hashPar = await parallel.ComputeChecksumBase64Async(directoryPath);
sw.Stop();
var timePar = sw.Elapsed.TotalSeconds;

Console.WriteLine($"Sequential → {hashSeq} ({timeSeq:F3}s)");
Console.WriteLine($"Parallel → {hashPar} ({timePar:F3}s)");
Console.WriteLine();

var hashesMatch = hashSeq == hashPar;
Console.WriteLine(hashesMatch ? "Success: Hashes are identical" : "Failure: Hashes differ — something went wrong!");

// Surface a mismatch to scripts and CI through the exit code as well as the message.
return hashesMatch ? 0 : 1;
109 changes: 109 additions & 0 deletions ChecksumCalculator/ChecksumCalculator/SequentialChecksumCalc.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
// <copyright file="SequentialChecksumCalc.cs" company="khusainovilas">
// Copyright (c) khusainovilas. All rights reserved.
// </copyright>

namespace ChecksumCalculator;

using System.Security.Cryptography;
using System.Text;

/// <summary>
/// Calculates a deterministic MD5 checksum of a directory in sequential mode:
/// entries are processed one after another in ordinal name order.
/// </summary>
/// <remarks>
/// The hash of a file is MD5(UTF-8 file name + content); the hash of a directory is
/// MD5(UTF-8 directory name + hashes of its children in ordinal name order).
/// The class keeps no state, so one instance may be used for any number of calls.
/// </remarks>
public class SequentialChecksumCalc
{
    /// <summary>
    /// Asynchronously calculates a deterministic checksum of a directory, processing entries one at a time.
    /// </summary>
    /// <param name="directoryPath">
    /// Full or relative path to the directory for which the hash sum should be calculated.
    /// </param>
    /// <param name="cancellationToken">
    /// Operation cancellation token. Allows you to abort a lengthy calculation.
    /// </param>
    /// <returns>A task whose result is the raw MD5 hash bytes of the directory.</returns>
    /// <exception cref="ArgumentException">The path is empty or consists only of white space.</exception>
    /// <exception cref="DirectoryNotFoundException">The resolved path does not refer to an existing directory.</exception>
    public async Task<byte[]> ComputeChecksumAsync(string directoryPath, CancellationToken cancellationToken = default)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(directoryPath);

        var fullPath = Path.GetFullPath(directoryPath);

        if (!Directory.Exists(fullPath))
        {
            throw new DirectoryNotFoundException($"Directory not found: {fullPath}");
        }

        return await ComputeDirectoryHashAsync(fullPath, cancellationToken).ConfigureAwait(false);
    }

    /// <summary>
    /// Calculates the directory checksum and returns it encoded as a Base64 string.
    /// </summary>
    /// <param name="directoryPath">
    /// Full or relative path to the directory.
    /// </param>
    /// <param name="cancellationToken">
    /// Operation cancellation token.
    /// </param>
    /// <returns>A task whose result is the Base64 representation of the MD5 hash.</returns>
    public async Task<string> ComputeChecksumBase64Async(string directoryPath, CancellationToken cancellationToken = default)
    {
        var hash = await this.ComputeChecksumAsync(directoryPath, cancellationToken).ConfigureAwait(false);

        return Convert.ToBase64String(hash);
    }

    /// <summary>
    /// Recursively hashes a directory: its name followed by the hashes of its children in ordinal name order.
    /// </summary>
    private static async Task<byte[]> ComputeDirectoryHashAsync(string directoryPath, CancellationToken cancellationToken)
    {
        // Path.GetFileName yields an empty string for a root path (or one ending in a separator);
        // fall back to the path itself so that the name component is never empty.
        var name = Path.GetFileName(directoryPath);
        if (string.IsNullOrEmpty(name))
        {
            name = directoryPath;
        }

        var entries = Directory.GetFileSystemEntries(directoryPath)
            .OrderBy(Path.GetFileName, StringComparer.Ordinal)
            .ToArray();

        // Incremental hashing of name + child hashes equals hashing their concatenation,
        // without building an intermediate buffer.
        using var hasher = IncrementalHash.CreateHash(HashAlgorithmName.MD5);
        hasher.AppendData(Encoding.UTF8.GetBytes(name));

        foreach (var entry in entries)
        {
            cancellationToken.ThrowIfCancellationRequested();

            var childHash = Directory.Exists(entry)
                ? await ComputeDirectoryHashAsync(entry, cancellationToken).ConfigureAwait(false)
                : await ComputeFileHashAsync(entry, cancellationToken).ConfigureAwait(false);

            hasher.AppendData(childHash);
        }

        return hasher.GetHashAndReset();
    }

    /// <summary>
    /// Hashes a single file as MD5(UTF-8 file name + content), streaming the content in fixed-size
    /// chunks so that memory use does not depend on the file size.
    /// </summary>
    private static async Task<byte[]> ComputeFileHashAsync(string filePath, CancellationToken cancellationToken)
    {
        const int BufferSize = 81920;

        await using var stream = new FileStream(
            filePath,
            FileMode.Open,
            FileAccess.Read,
            FileShare.Read,
            BufferSize,
            FileOptions.Asynchronous | FileOptions.SequentialScan);

        using var hasher = IncrementalHash.CreateHash(HashAlgorithmName.MD5);
        hasher.AppendData(Encoding.UTF8.GetBytes(Path.GetFileName(filePath)));

        var buffer = new byte[BufferSize];
        int bytesRead;
        while ((bytesRead = await stream.ReadAsync(buffer, cancellationToken).ConfigureAwait(false)) > 0)
        {
            hasher.AppendData(buffer, 0, bytesRead);
        }

        return hasher.GetHashAndReset();
    }
}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Тут довольно много чего можно унифицировать с параллельной версией (в т.ч. по смыслу — хеш от одного файла всё равно считается последовательно всегда), так что можно было применить немного архитектурной магии (паттерны "Стратегия", "Шаблонный метод" и т.п.) и сократить размер кода в полтора раза. Архитектурную магию мы ещё не проходили, так что это не ошибка, но можно было хоть попытаться :)

9 changes: 9 additions & 0 deletions ChecksumCalculator/ChecksumCalculator/stylecop.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"$schema": "https://raw.githubusercontent.com/DotNetAnalyzers/StyleCopAnalyzers/master/StyleCop.Analyzers/StyleCop.Analyzers/Settings/stylecop.schema.json",
"settings": {
"documentationRules": {
"companyName": "khusainovilas",
"copyrightText": "Copyright (c) {companyName}. All rights reserved."
}
}
}
Loading