Skip to content

Commit

Permalink
In name generators, process Unicode characters as UTF-8. Update test_…
Browse files Browse the repository at this point in the history
…name_generator_md5, test_name_generator_sha1.
  • Loading branch information
pdimov committed May 5, 2024
1 parent 0029502 commit d240d0a
Show file tree
Hide file tree
Showing 3 changed files with 263 additions and 110 deletions.
139 changes: 110 additions & 29 deletions include/boost/uuid/detail/basic_name_generator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,31 +29,24 @@ class basic_name_generator

uuid namespace_uuid_;

private:

using digest_type = typename HashAlgo::digest_type;

public:

using result_type = uuid;
using digest_type = typename HashAlgo::digest_type;

// Constructs a generator bound to `namespace_uuid`. The value is copied into
// namespace_uuid_, whose bytes are fed to the hash ahead of the name by the
// operator() overloads.
explicit basic_name_generator( uuid const& namespace_uuid ) noexcept
: namespace_uuid_( namespace_uuid )
{}

uuid operator()( char const* name ) const noexcept
{
HashAlgo hash;

hash.process_bytes( namespace_uuid_.begin(), namespace_uuid_.size() );
process_characters( hash, name, std::strlen( name ) );

return hash_to_uuid( hash );
}

uuid operator()( wchar_t const* name ) const noexcept
template<class Ch> uuid operator()( Ch const* name ) const noexcept
{
HashAlgo hash;

hash.process_bytes( namespace_uuid_.begin(), namespace_uuid_.size() );
process_characters( hash, name, std::wcslen( name ) );
process_characters( hash, name, std::char_traits<Ch>().length( name ) );

return hash_to_uuid( hash );
}
Expand All @@ -80,32 +73,120 @@ class basic_name_generator
}

private:
// we convert all characters to uint32_t so that each
// character is 4 bytes regardless of sizeof(char) or
// sizeof(wchar_t). We want the name string on any
// platform / compiler to generate the same uuid
// except for char
template<class Ch>
void process_characters( HashAlgo& hash, Ch const* characters, std::size_t count ) const noexcept

// Narrow-character overload: the `n` chars starting at `p` are passed to the
// hash unchanged, one byte per character, with no transcoding.
void process_characters( HashAlgo& hash, char const* p, std::size_t n ) const noexcept
{
hash.process_bytes( p, n );
}

// For portability, we convert all wide characters to uint32_t so that each
// character is 4 bytes regardless of sizeof(wchar_t).

void process_characters( HashAlgo& hash, wchar_t const* p, std::size_t n ) const noexcept
{
BOOST_UUID_STATIC_ASSERT( sizeof(std::uint32_t) >= sizeof(Ch) );
BOOST_UUID_STATIC_ASSERT( sizeof( std::uint32_t ) >= sizeof( wchar_t ) );

for( std::size_t i = 0; i < count; ++i)
for( std::size_t i = 0; i < n; ++i)
{
std::size_t c = characters[ i ];
std::uint32_t ch = p[ i ];

unsigned char bytes[ 4 ] =
{
static_cast<unsigned char>( ( ch >> 0 ) & 0xFF ),
static_cast<unsigned char>( ( ch >> 8 ) & 0xFF ),
static_cast<unsigned char>( ( ch >> 16 ) & 0xFF ),
static_cast<unsigned char>( ( ch >> 24 ) & 0xFF )
};

hash.process_byte( static_cast<unsigned char>( (c >> 0) & 0xFF ) );
hash.process_byte( static_cast<unsigned char>( (c >> 8) & 0xFF ) );
hash.process_byte( static_cast<unsigned char>( (c >> 16) & 0xFF ) );
hash.process_byte( static_cast<unsigned char>( (c >> 24) & 0xFF ) );
hash.process_bytes( bytes, 4 );
}
}

void process_characters( HashAlgo& hash, char const* characters, std::size_t count ) const noexcept
void process_characters( HashAlgo& hash, char32_t const* p, std::size_t n ) const noexcept
{
hash.process_bytes( characters, count );
for( std::size_t i = 0; i < n; ++i)
{
process_utf32_codepoint( hash, p[ i ] );
}
}

// UTF-16 overload: walks the `n` code units at `p`, combining each
// well-formed high/low surrogate pair into a single code point, and hands
// every resulting value to process_utf32_codepoint. A surrogate that is not
// part of a valid pair is forwarded on its own, unmodified.
void process_characters( HashAlgo& hash, char16_t const* p, std::size_t n ) const noexcept
{
    std::size_t pos = 0;

    while( pos < n )
    {
        std::uint32_t const unit = p[ pos++ ];

        bool const is_high_surrogate = unit >= 0xD800 && unit <= 0xDBFF;

        // Only look ahead when there is a following code unit to pair with.
        if( is_high_surrogate && pos < n )
        {
            std::uint32_t const next = p[ pos ];

            if( next >= 0xDC00 && next <= 0xDFFF )
            {
                ++pos; // consume the low surrogate as well

                std::uint32_t const upper = unit - 0xD800;
                std::uint32_t const lower = next - 0xDC00;

                process_utf32_codepoint( hash, ( upper << 10 ) + lower + 0x10000 );
                continue;
            }
        }

        process_utf32_codepoint( hash, unit );
    }
}

// Feeds a single code point to `hash` as a UTF-8 byte sequence of one to
// four bytes. Values in the surrogate range 0xD800..0xDFFF and values above
// 0x10FFFF are first replaced by U+FFFD, the Unicode replacement character.
void process_utf32_codepoint( HashAlgo& hash, std::uint32_t cp ) const noexcept
{
    bool const is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
    bool const out_of_range = cp > 0x10FFFF;

    if( is_surrogate || out_of_range )
    {
        cp = 0xFFFD;
    }

    // Single-byte (ASCII) case goes through process_byte.
    if( cp < 0x80 )
    {
        hash.process_byte( static_cast<unsigned char>( cp ) );
        return;
    }

    // Multi-byte case: emit the lead byte and any middle continuation bytes
    // here; the final continuation byte is common to all lengths.
    unsigned char utf8[ 4 ];
    std::size_t len = 0;

    if( cp < 0x800 )
    {
        utf8[ len++ ] = static_cast<unsigned char>( 0xC0 | ( cp >> 6 ) );
    }
    else if( cp < 0x10000 )
    {
        utf8[ len++ ] = static_cast<unsigned char>( 0xE0 | ( cp >> 12 ) );
        utf8[ len++ ] = static_cast<unsigned char>( 0x80 | ( ( cp >> 6 ) & 0x3F ) );
    }
    else
    {
        utf8[ len++ ] = static_cast<unsigned char>( 0xF0 | ( cp >> 18 ) );
        utf8[ len++ ] = static_cast<unsigned char>( 0x80 | ( ( cp >> 12 ) & 0x3F ) );
        utf8[ len++ ] = static_cast<unsigned char>( 0x80 | ( ( cp >> 6 ) & 0x3F ) );
    }

    utf8[ len++ ] = static_cast<unsigned char>( 0x80 | ( cp & 0x3F ) );

    hash.process_bytes( utf8, len );
}

#if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L

// char8_t overload: the `n` code units starting at `p` are passed to the
// hash as-is, with no transcoding or validation.
void process_characters( HashAlgo& hash, char8_t const* p, std::size_t n ) const noexcept
{
hash.process_bytes( p, n );
}

#endif

uuid hash_to_uuid( HashAlgo& hash ) const noexcept
{
digest_type digest;
Expand Down
116 changes: 76 additions & 40 deletions test/test_name_generator_md5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,67 +7,70 @@
#include <boost/uuid/uuid.hpp>
#include <boost/uuid/uuid_io.hpp>
#include <boost/core/lightweight_test.hpp>
#include <string>

int main()
{
using namespace boost::uuids;
using namespace boost::uuids;

// RFC 4122 Appendix B + Errata 1352
void test( uuid const& nmsp, char const* name, char const* expected )
{
name_generator_md5 gen( nmsp );

{
name_generator_md5 gen( ns::dns() );
uuid u0 = string_generator()( expected );

char const* name = "www.widgets.com";
uuid u1 = gen( name );
BOOST_TEST_EQ( u1, u0 );

uuid u0 = string_generator()( "3d813cbb-47fb-32ba-91df-831e1593ac29" );
uuid u2 = gen( std::string( name ) );
BOOST_TEST_EQ( u2, u0 );

uuid u1 = gen( name );
BOOST_TEST_EQ( u1, u0 );
uuid u3 = gen( name, std::strlen( name ) );
BOOST_TEST_EQ( u3, u0 );
}

uuid u2 = gen( std::string( name ) );
BOOST_TEST_EQ( u2, u0 );
template<class Ch> void test( uuid const& nmsp, Ch const* name, char const* expected )
{
name_generator_md5 gen( nmsp );

uuid u3 = gen( name, std::strlen( name ) );
BOOST_TEST_EQ( u3, u0 );
}
uuid u0 = string_generator()( expected );

// RFC 4122bis Section A.2
uuid u1 = gen( name );
BOOST_TEST_EQ( u1, u0 );

{
name_generator_md5 gen( ns::dns() );
uuid u2 = gen( std::basic_string<Ch>( name ) );
BOOST_TEST_EQ( u2, u0 );
}

char const* name = "www.example.com";
int main()
{
// RFC 4122 Appendix B + Errata 1352

uuid u0 = string_generator()( "5df41881-3aed-3515-88a7-2f4a814cf09e" );
test( ns::dns(), "www.widgets.com", "3d813cbb-47fb-32ba-91df-831e1593ac29" );
test( ns::dns(), u"www.widgets.com", "3d813cbb-47fb-32ba-91df-831e1593ac29" );
test( ns::dns(), U"www.widgets.com", "3d813cbb-47fb-32ba-91df-831e1593ac29" );
test( ns::dns(), u8"www.widgets.com", "3d813cbb-47fb-32ba-91df-831e1593ac29" );

uuid u1 = gen( name );
BOOST_TEST_EQ( u1, u0 );

uuid u2 = gen( std::string( name ) );
BOOST_TEST_EQ( u2, u0 );
// RFC 4122bis Section A.2

uuid u3 = gen( name, std::strlen( name ) );
BOOST_TEST_EQ( u3, u0 );
}
test( ns::dns(), "www.example.com", "5df41881-3aed-3515-88a7-2f4a814cf09e" );
test( ns::dns(), u"www.example.com", "5df41881-3aed-3515-88a7-2f4a814cf09e" );
test( ns::dns(), U"www.example.com", "5df41881-3aed-3515-88a7-2f4a814cf09e" );
test( ns::dns(), u8"www.example.com", "5df41881-3aed-3515-88a7-2f4a814cf09e" );

// https://uuid.ramsey.dev/en/stable/rfc4122/version3.html

{
name_generator_md5 gen( ns::url() );

char const* name = "https://www.php.net";
test( ns::url(), "https://www.php.net", "3f703955-aaba-3e70-a3cb-baff6aa3b28f" );
test( ns::url(), u"https://www.php.net", "3f703955-aaba-3e70-a3cb-baff6aa3b28f" );
test( ns::url(), U"https://www.php.net", "3f703955-aaba-3e70-a3cb-baff6aa3b28f" );
test( ns::url(), u8"https://www.php.net", "3f703955-aaba-3e70-a3cb-baff6aa3b28f" );

uuid u0 = string_generator()( "3f703955-aaba-3e70-a3cb-baff6aa3b28f" );
// test case from test_name_generator.cpp

uuid u1 = gen( name );
BOOST_TEST_EQ( u1, u0 );
test( ns::url(), "www.widgets.com", "06205cec-255b-300e-a8bc-a8605ab8244e" );

uuid u2 = gen( std::string( name ) );
BOOST_TEST_EQ( u2, u0 );
// examples from documentation

uuid u3 = gen( name, std::strlen( name ) );
BOOST_TEST_EQ( u3, u0 );
}
test( ns::dns(), "boost.org", "888eca9c-e655-31a2-a46b-a2a821f6b150" );
test( ns::dns(), L"boost.org", "48149232-8cda-361b-b355-0bdb71d2cab3" );

// test wide strings

Expand All @@ -85,5 +88,38 @@ int main()
BOOST_TEST_EQ( u1, u2 );
}

// test unicode strings

{
uuid nmsp = string_generator()( "70a4abc5-80ab-4176-8e11-bc5836b6fef9" );

name_generator_md5 gen( nmsp );

char32_t const name32[] = { 0x0024, 0x00A3, 0x0418, 0x0939, 0x20AC, 0xD55C, 0xDC12, 0xD834, 0x10348, 0x1096B3, 0xD956, 0 };
char16_t const name16[] = { 0x0024, 0x00A3, 0x0418, 0x0939, 0x20AC, 0xD55C, 0xDC12, 0xD834, 0xD800, 0xDF48, 0xDBE5, 0xDEB3, 0xD956, 0 };

unsigned char name8[] =
{
/*U+0024*/ 0x24,
/*U+00A3*/ 0xC2, 0xA3,
/*U+0418*/ 0xD0, 0x98,
/*U+0939*/ 0xE0, 0xA4, 0xB9,
/*U+20AC*/ 0xE2, 0x82, 0xAC,
/*U+D55C*/ 0xED, 0x95, 0x9C,
/*U+DC12*/ 0xEF, 0xBF, 0xBD, // U+FFFD
/*U+D834*/ 0xEF, 0xBF, 0xBD, // U+FFFD
/*U+10348*/ 0xF0, 0x90, 0x8D, 0x88,
/*U+1096B3*/ 0xF4, 0x89, 0x9A, 0xB3,
/*U+D956*/ 0xEF, 0xBF, 0xBD, // U+FFFD
};

uuid u1 = gen( name32 );
uuid u2 = gen( name16 );
uuid u3 = gen( name8, sizeof( name8 ) );

BOOST_TEST_EQ( u1, u3 );
BOOST_TEST_EQ( u2, u3 );
}

return boost::report_errors();
}
Loading

0 comments on commit d240d0a

Please sign in to comment.