diff --git a/include/natalie/encoding_object.hpp b/include/natalie/encoding_object.hpp index e6fc6d52f..97b8ed0e3 100644 --- a/include/natalie/encoding_object.hpp +++ b/include/natalie/encoding_object.hpp @@ -84,6 +84,11 @@ class EncodingObject : public Object { Replace, }; + enum class EncodeUndefOption { + Raise, + Replace, + }; + enum class EncodeNewlineOption { None, Cr, @@ -93,8 +98,10 @@ class EncodingObject : public Object { struct EncodeOptions { EncodeInvalidOption invalid_option = EncodeInvalidOption::Raise; + EncodeUndefOption undef_option = EncodeUndefOption::Raise; EncodeNewlineOption newline_option = EncodeNewlineOption::None; StringObject *replace_option = nullptr; + Value fallback_option = nullptr; }; virtual Value encode(Env *, EncodingObject *, StringObject *, EncodeOptions) const; diff --git a/include/natalie/string_object.hpp b/include/natalie/string_object.hpp index 641d4e6c3..7dd434524 100644 --- a/include/natalie/string_object.hpp +++ b/include/natalie/string_object.hpp @@ -511,6 +511,7 @@ class StringObject : public Object { using EncodeOptions = EncodingObject::EncodeOptions; using EncodeInvalidOption = EncodingObject::EncodeInvalidOption; using EncodeNewlineOption = EncodingObject::EncodeNewlineOption; + using EncodeUndefOption = EncodingObject::EncodeUndefOption; String m_string {}; EncodingObject *m_encoding { nullptr }; diff --git a/spec/core/string/encode_spec.rb b/spec/core/string/encode_spec.rb index e141427de..66e66f107 100644 --- a/spec/core/string/encode_spec.rb +++ b/spec/core/string/encode_spec.rb @@ -92,63 +92,51 @@ end it "replace multiple invalid bytes at the end with a single replacement character" do - NATFIXME 'encode options' do + NATFIXME 'same encoding with invalid chars', exception: SpecFailedException, message: /should be ==/ do "\xE3\x81\x93\xE3\x81".encode("UTF-8", invalid: :replace).should == "\u3053\ufffd" end end it "replaces invalid encoding in source using a specified replacement even when a fallback is given" do - NATFIXME 'encode options' do - encoded = "ち\xE3\x81\xFF".encode("UTF-16LE", invalid: :replace, replace: "foo", fallback: -> c { "bar" }) - encoded.should == "\u3061foofoo".encode("UTF-16LE") - encoded.encode("UTF-8").should == "ちfoofoo" - end + encoded = "ち\xE3\x81\xFF".encode("UTF-16LE", invalid: :replace, replace: "foo", fallback: -> c { "bar" }) + encoded.should == "\u3061foofoo".encode("UTF-16LE") + encoded.encode("UTF-8").should == "ちfoofoo" end it "replaces undefined encoding in destination with default replacement" do - NATFIXME 'encode options' do - encoded = "B\ufffd".encode(Encoding::US_ASCII, undef: :replace) - encoded.should == "B?".encode(Encoding::US_ASCII) - encoded.encode("UTF-8").should == "B?" - end + encoded = "B\ufffd".encode(Encoding::US_ASCII, undef: :replace) + encoded.should == "B?".encode(Encoding::US_ASCII) + encoded.encode("UTF-8").should == "B?" end it "replaces undefined encoding in destination with a specified replacement" do - NATFIXME 'encode options' do - encoded = "B\ufffd".encode(Encoding::US_ASCII, undef: :replace, replace: "foo") - encoded.should == "Bfoo".encode(Encoding::US_ASCII) - encoded.encode("UTF-8").should == "Bfoo" - end + encoded = "B\ufffd".encode(Encoding::US_ASCII, undef: :replace, replace: "foo") + encoded.should == "Bfoo".encode(Encoding::US_ASCII) + encoded.encode("UTF-8").should == "Bfoo" end it "replaces undefined encoding in destination with a specified replacement even if a fallback is given" do - NATFIXME 'encode options' do - encoded = "B\ufffd".encode(Encoding::US_ASCII, undef: :replace, replace: "foo", fallback: proc {|x| "bar"}) - encoded.should == "Bfoo".encode(Encoding::US_ASCII) - encoded.encode("UTF-8").should == "Bfoo" - end + encoded = "B\ufffd".encode(Encoding::US_ASCII, undef: :replace, replace: "foo", fallback: proc {|x| "bar"}) + encoded.should == "Bfoo".encode(Encoding::US_ASCII) + encoded.encode("UTF-8").should == "Bfoo" end it "replaces undefined encoding in destination using a fallback proc" do - NATFIXME 'encode fallback' do - encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: proc {|x| "bar"}) - encoded.should == "Bbar".encode(Encoding::US_ASCII) - encoded.encode("UTF-8").should == "Bbar" - end + encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: proc {|x| "bar"}) + encoded.should == "Bbar".encode(Encoding::US_ASCII) + encoded.encode("UTF-8").should == "Bbar" end it "replaces invalid encoding in source using replace even when fallback is given as proc" do - NATFIXME 'encode options' do - encoded = "ち\xE3\x81\xFF".encode("UTF-16LE", invalid: :replace, replace: "foo", fallback: proc {|x| "bar"}) - encoded.should == "\u3061foofoo".encode("UTF-16LE") - encoded.encode("UTF-8").should == "ちfoofoo" - end + encoded = "ち\xE3\x81\xFF".encode("UTF-16LE", invalid: :replace, replace: "foo", fallback: proc {|x| "bar"}) + encoded.should == "\u3061foofoo".encode("UTF-16LE") + encoded.encode("UTF-8").should == "ちfoofoo" end end describe "when passed to, from" do it "returns a copy in the destination encoding when both encodings are the same" do - NATFIXME 'src encoding' do + NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /from ASCII-8BIT to UTF-8/ do str = "あ".dup.force_encoding("binary") encoded = str.encode("utf-8", "utf-8") @@ -159,7 +147,7 @@ end it "returns the transcoded string" do - NATFIXME 'not sure' do + NATFIXME 'honor source encoding', exception: SpecFailedException, message: /should be ==/ do str = "\x00\x00\x00\x1F" str.encode(Encoding::UTF_8, Encoding::UTF_16BE).should == "\u0000\u001f" end @@ -168,12 +156,10 @@ describe "when passed to, options" do it "returns a copy when the destination encoding is the same as the String encoding" do - NATFIXME 'encode options' do - str = "あ" - encoded = str.encode(Encoding::UTF_8, undef: :replace) - encoded.should_not equal(str) - encoded.should == str - end + str = "あ" + encoded = str.encode(Encoding::UTF_8, undef: :replace) + encoded.should_not equal(str) + encoded.should == str end end @@ -186,7 +172,7 @@ end it "returns a copy in the destination encoding when both encodings are the same" do - NATFIXME 'encode options' do + NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /from ASCII-8BIT to UTF-8/ do str = "あ".dup.force_encoding("binary") encoded = str.encode("utf-8", "utf-8", invalid: :replace) diff --git a/spec/core/string/shared/encode.rb b/spec/core/string/shared/encode.rb index fce411b02..096e5a30e 100644 --- a/spec/core/string/shared/encode.rb +++ b/spec/core/string/shared/encode.rb @@ -9,7 +9,7 @@ end it "transcodes a 7-bit String despite no generic converting being available" do - NATFIXME 'converter not found' do + NATFIXME 'need Emacs_Mule encoding', exception: SpecFailedException, message: /uninitialized constant Encoding::Emacs_Mule/ do -> do Encoding::Converter.new Encoding::Emacs_Mule, Encoding::BINARY end.should raise_error(Encoding::ConverterNotFoundError) @@ -22,7 +22,7 @@ end it "raises an Encoding::ConverterNotFoundError when no conversion is possible" do - NATFIXME 'no conversion' do + NATFIXME 'need Emacs_Mule encoding', exception: NameError, message: /uninitialized constant Encoding::Emacs_Mule/ do Encoding.default_internal = Encoding::Emacs_Mule str = [0x80].pack('C').force_encoding Encoding::BINARY -> { str.send(@method) }.should raise_error(Encoding::ConverterNotFoundError) @@ -50,7 +50,7 @@ end it "transcodes Japanese multibyte characters" do - NATFIXME 'not sure' do + NATFIXME 'need ISO_2022_JP encoding', exception: NameError, message: /uninitialized constant Encoding::ISO_2022_JP/ do str = "あいうえお" str.send(@method, Encoding::ISO_2022_JP).should == "\e\x24\x42\x24\x22\x24\x24\x24\x26\x24\x28\x24\x2A\e\x28\x42".force_encoding(Encoding::ISO_2022_JP) @@ -58,7 +58,7 @@ end it "transcodes a 7-bit String despite no generic converting being available" do - NATFIXME 'converter not found' do + NATFIXME 'need Emacs_Mule encoding', exception: SpecFailedException, message: /uninitialized constant Encoding::Emacs_Mule/ do -> do Encoding::Converter.new Encoding::Emacs_Mule, Encoding::BINARY end.should raise_error(Encoding::ConverterNotFoundError) @@ -69,7 +69,7 @@ end it "raises an Encoding::ConverterNotFoundError when no conversion is possible" do - NATFIXME 'converter not found' do + NATFIXME 'need Emacs_Mule encoding', exception: SpecFailedException, message: /uninitialized constant Encoding::Emacs_Mule/ do str = [0x80].pack('C').force_encoding Encoding::BINARY -> do str.send(@method, Encoding::Emacs_Mule) @@ -78,7 +78,7 @@ end it "raises an Encoding::ConverterNotFoundError for an invalid encoding" do - NATFIXME 'converter not found' do + NATFIXME 'wrong error', exception: SpecFailedException, message: /ConverterNotFoundError.*but instead.*ArgumentError/ do -> do "abc".send(@method, "xyz") end.should raise_error(Encoding::ConverterNotFoundError) @@ -88,20 +88,16 @@ describe "when passed options" do it "does not process transcoding options if not transcoding" do - NATFIXME 'encode options' do - result = "あ\ufffdあ".send(@method, undef: :replace) - result.should == "あ\ufffdあ" - end + result = "あ\ufffdあ".send(@method, undef: :replace) + result.should == "あ\ufffdあ" end it "calls #to_hash to convert the object" do - NATFIXME 'encode options' do - options = mock("string encode options") - options.should_receive(:to_hash).and_return({ undef: :replace }) + options = mock("string encode options") + options.should_receive(:to_hash).and_return({ undef: :replace }) - result = "あ\ufffdあ".send(@method, **options) - result.should == "あ\ufffdあ" - end + result = "あ\ufffdあ".send(@method, **options) + result.should == "あ\ufffdあ" end it "transcodes to Encoding.default_internal when set" do @@ -111,7 +107,7 @@ end it "raises an Encoding::ConverterNotFoundError when no conversion is possible despite 'invalid: :replace, undef: :replace'" do - NATFIXME 'encode options' do + NATFIXME 'need Emacs_Mule encoding', exception: NameError, message: /uninitialized constant Encoding::Emacs_Mule/ do Encoding.default_internal = Encoding::Emacs_Mule str = [0x80].pack('C').force_encoding Encoding::BINARY -> do @@ -121,7 +117,7 @@ end it "replaces invalid characters when replacing Emacs-Mule encoded strings" do - NATFIXME 'encode options' do + NATFIXME 'need Emacs_Mule encoding', exception: ArgumentError, message: 'unknown encoding name - "Emacs-Mule"' do got = [0x80].pack('C').force_encoding('Emacs-Mule').send(@method, invalid: :replace) got.should == "?".encode('Emacs-Mule') @@ -131,7 +127,7 @@ describe "when passed to, from" do it "transcodes between the encodings ignoring the String encoding" do - NATFIXME 'src encoding' do + NATFIXME 'honor source encoding', exception: SpecFailedException, message: /should be ==/ do str = "あ" result = [0xA6, 0xD0, 0x8F, 0xAB, 0xE4, 0x8F, 0xAB, 0xB1].pack('C8') result.force_encoding Encoding::EUC_JP @@ -140,7 +136,7 @@ end it "calls #to_str to convert the from object to an Encoding" do - NATFIXME 'src encoding' do + NATFIXME 'honor source encoding', exception: SpecFailedException, message: /should be ==/ do enc = mock("string encode encoding") enc.should_receive(:to_str).and_return("ibm437") @@ -155,36 +151,30 @@ describe "when passed to, options" do it "replaces undefined characters in the destination encoding" do - NATFIXME 'encode options' do - result = "あ?あ".send(@method, Encoding::EUC_JP, undef: :replace) - # testing for: "\xA4\xA2?\xA4\xA2" - xA4xA2 = [0xA4, 0xA2].pack('CC') - result.should == "#{xA4xA2}?#{xA4xA2}".force_encoding("euc-jp") - end + result = "あ?あ".send(@method, Encoding::EUC_JP, undef: :replace) + # testing for: "\xA4\xA2?\xA4\xA2" + xA4xA2 = [0xA4, 0xA2].pack('CC') + result.should == "#{xA4xA2}?#{xA4xA2}".force_encoding("euc-jp") end it "replaces invalid characters in the destination encoding" do - NATFIXME 'encode options' do - xFF = [0xFF].pack('C').force_encoding('utf-8') - "ab#{xFF}c".send(@method, Encoding::ISO_8859_1, invalid: :replace).should == "ab?c" - end + xFF = [0xFF].pack('C').force_encoding('utf-8') + "ab#{xFF}c".send(@method, Encoding::ISO_8859_1, invalid: :replace).should == "ab?c" end it "calls #to_hash to convert the options object" do - NATFIXME 'encode options' do - options = mock("string encode options") - options.should_receive(:to_hash).and_return({ undef: :replace }) + options = mock("string encode options") + options.should_receive(:to_hash).and_return({ undef: :replace }) - result = "あ?あ".send(@method, Encoding::EUC_JP, **options) - xA4xA2 = [0xA4, 0xA2].pack('CC').force_encoding('utf-8') - result.should == "#{xA4xA2}?#{xA4xA2}".force_encoding("euc-jp") - end + result = "あ?あ".send(@method, Encoding::EUC_JP, **options) + xA4xA2 = [0xA4, 0xA2].pack('CC').force_encoding('utf-8') + result.should == "#{xA4xA2}?#{xA4xA2}".force_encoding("euc-jp") end end describe "when passed to, from, options" do it "replaces undefined characters in the destination encoding" do - NATFIXME 'encode options' do + NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to EUC-JP/ do str = "あ?あ".force_encoding Encoding::BINARY result = str.send(@method, "euc-jp", "utf-8", undef: :replace) xA4xA2 = [0xA4, 0xA2].pack('CC').force_encoding('utf-8') @@ -193,7 +183,7 @@ end it "replaces invalid characters in the destination encoding" do - NATFIXME 'encode options' do + NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to ISO-8859-1/ do xFF = [0xFF].pack('C').force_encoding('utf-8') str = "ab#{xFF}c".force_encoding Encoding::BINARY str.send(@method, "iso-8859-1", "utf-8", invalid: :replace).should == "ab?c" @@ -201,7 +191,7 @@ end it "calls #to_str to convert the to object to an encoding" do - NATFIXME 'encode options' do + NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to ISO-8859-1/ do to = mock("string encode to encoding") to.should_receive(:to_str).and_return("iso-8859-1") @@ -212,7 +202,7 @@ end it "calls #to_str to convert the from object to an encoding" do - NATFIXME 'encode options' do + NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to ISO-8859-1/ do from = mock("string encode to encoding") from.should_receive(:to_str).and_return("utf-8") @@ -223,7 +213,7 @@ end it "calls #to_hash to convert the options object" do - NATFIXME 'encode options' do + NATFIXME 'keyword splat should call to_hash?' do options = mock("string encode options") options.should_receive(:to_hash).and_return({ invalid: :replace }) @@ -236,31 +226,31 @@ describe "given the xml: :text option" do it "replaces all instances of '&' with '&'" do - NATFIXME 'encode options' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '& and &'.send(@method, "UTF-8", xml: :text).should == '& and &' end end it "replaces all instances of '<' with '<'" do - NATFIXME 'encode options' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '< and <'.send(@method, "UTF-8", xml: :text).should == '< and <' end end it "replaces all instances of '>' with '>'" do - NATFIXME 'encode options' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '> and >'.send(@method, "UTF-8", xml: :text).should == '> and >' end end it "does not replace '\"'" do - NATFIXME 'encode options' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '" and "'.send(@method, "UTF-8", xml: :text).should == '" and "' end end it "replaces undefined characters with their upper-case hexadecimal numeric character references" do - NATFIXME 'encode options' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do 'ürst'.send(@method, Encoding::US_ASCII, xml: :text).should == 'ürst' end end @@ -268,37 +258,37 @@ describe "given the xml: :attr option" do it "surrounds the encoded text with double-quotes" do - NATFIXME 'encode options' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do 'abc'.send(@method, "UTF-8", xml: :attr).should == '"abc"' end end it "replaces all instances of '&' with '&'" do - NATFIXME 'encode options' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '& and &'.send(@method, "UTF-8", xml: :attr).should == '"& and &"' end end it "replaces all instances of '<' with '<'" do - NATFIXME 'encode options' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '< and <'.send(@method, "UTF-8", xml: :attr).should == '"< and <"' end end it "replaces all instances of '>' with '>'" do - NATFIXME 'encode options' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '> and >'.send(@method, "UTF-8", xml: :attr).should == '"> and >"' end end it "replaces all instances of '\"' with '"'" do - NATFIXME 'encode options' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '" and "'.send(@method, "UTF-8", xml: :attr).should == '"" and ""' end end it "replaces undefined characters with their upper-case hexadecimal numeric character references" do - NATFIXME 'encode options' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do 'ürst'.send(@method, Encoding::US_ASCII, xml: :attr).should == '"ürst"' end end diff --git a/src/encoding_object.cpp b/src/encoding_object.cpp index 8faa03b52..9c2439649 100644 --- a/src/encoding_object.cpp +++ b/src/encoding_object.cpp @@ -52,19 +52,36 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje break; } + auto handle_fallback = [&](nat_int_t cpt) { + Value result; + if (options.fallback_option->respond_to(env, "[]"_s)) { + result = options.fallback_option->send(env, "[]"_s, { Value::integer(cpt) }); + } else if (options.fallback_option->respond_to(env, "call"_s)) { + result = options.fallback_option->send(env, "call"_s, { Value::integer(cpt) }); + } + temp_string.append(result->to_s(env)); + }; + auto source_codepoint = valid ? c : -1; nat_int_t unicode_codepoint; if (source_codepoint == -1) { switch (options.invalid_option) { case EncodeInvalidOption::Raise: + if (options.fallback_option) { + handle_fallback(c); + continue; + } env->raise_invalid_byte_sequence_error(this); case EncodeInvalidOption::Replace: if (options.replace_option) { temp_string.append(options.replace_option); continue; } - unicode_codepoint = 0xFFFD; + if (is_single_byte_encoding()) + unicode_codepoint = '?'; + else + unicode_codepoint = 0xFFFD; } } else { unicode_codepoint = orig_encoding->to_unicode_codepoint(source_codepoint); @@ -73,7 +90,6 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje // handle error if (unicode_codepoint < 0) { StringObject *message; - if (num() != Encoding::UTF_8) { // Example: "\x8F" to UTF-8 in conversion from ASCII-8BIT to UTF-8 to US-ASCII message = StringObject::format( @@ -87,7 +103,6 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje String::hex(source_codepoint, String::HexFormat::Uppercase), orig_encoding->name()); } - env->raise(EncodingClass->const_find(env, "UndefinedConversionError"_s)->as_class(), message); } @@ -95,24 +110,38 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje // handle error if (destination_codepoint < 0) { - StringObject *message; - - if (orig_encoding->num() != Encoding::UTF_8) - message = StringObject::format( - // Example: "U+043F to WINDOWS-1252 in conversion from Windows-1251 to UTF-8 to WINDOWS-1252", - "U+{} to {} in conversion from {} to UTF-8 to {}", - String::hex(source_codepoint, String::HexFormat::Uppercase), - name(), - orig_encoding->name(), - name()); - else { - // Example: U+0439 from UTF-8 to ASCII-8BIT - auto hex = String(); - hex.append_sprintf("%04X", source_codepoint); - message = StringObject::format("U+{} from UTF-8 to {}", hex, name()); + switch (options.undef_option) { + case EncodeUndefOption::Raise: + if (options.fallback_option) { + handle_fallback(unicode_codepoint); + continue; + } + StringObject *message; + if (orig_encoding->num() != Encoding::UTF_8) + message = StringObject::format( + // Example: "U+043F to WINDOWS-1252 in conversion from Windows-1251 to UTF-8 to WINDOWS-1252", + "U+{} to {} in conversion from {} to UTF-8 to {}", + String::hex(source_codepoint, String::HexFormat::Uppercase), + name(), + orig_encoding->name(), + name()); + else { + // Example: U+0439 from UTF-8 to ASCII-8BIT + auto hex = String(); + hex.append_sprintf("%04X", source_codepoint); + message = StringObject::format("U+{} from UTF-8 to {}", hex, name()); + } + env->raise(EncodingClass->const_find(env, "UndefinedConversionError"_s)->as_class(), message); + case EncodeUndefOption::Replace: + if (options.replace_option) { + temp_string.append(options.replace_option); + continue; + } + if (is_single_byte_encoding()) + destination_codepoint = '?'; + else + destination_codepoint = 0xFFFD; } - - env->raise(EncodingClass->const_find(env, "UndefinedConversionError"_s)->as_class(), message); } auto destination_char_obj = encode_codepoint(destination_codepoint); diff --git a/src/string_object.cpp b/src/string_object.cpp index 093184788..971c7b58b 100644 --- a/src/string_object.cpp +++ b/src/string_object.cpp @@ -1154,9 +1154,21 @@ Value StringObject::encode_in_place(Env *env, Value dst_encoding, Value src_enco options.invalid_option = EncodeInvalidOption::Replace; } + auto undef = kwargs->remove(env, "undef"_s); + if (undef) { + if (undef->is_nil()) + options.undef_option = EncodeUndefOption::Raise; + else if (undef == "replace"_s) + options.undef_option = EncodeUndefOption::Replace; + } + auto replace = kwargs->remove(env, "replace"_s); if (replace && !replace->is_nil()) options.replace_option = replace->as_string_or_raise(env)->encode(env, dst_encoding)->as_string_or_raise(env); + + auto fallback = kwargs->remove(env, "fallback"_s); + if (fallback && !fallback->is_nil()) + options.fallback_option = fallback; } env->ensure_no_extra_keywords(kwargs);