Skip to content

Commit

Permalink
Merge pull request #2139 from natalie-lang/string-encode-fallback
Browse files Browse the repository at this point in the history
Improve String#encode :fallback option
  • Loading branch information
seven1m committed Jun 25, 2024
2 parents 9a96003 + f2dd21a commit d8da0dd
Show file tree
Hide file tree
Showing 2 changed files with 209 additions and 9 deletions.
184 changes: 184 additions & 0 deletions spec/core/string/shared/encode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,190 @@
end
end

describe "given the fallback option" do
context "given a hash" do
it "looks up the replacement value from the hash" do
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: { "\ufffd" => "bar" })
encoded.should == "Bbar"
end

it "calls to_str on the returned value" do
obj = Object.new
obj.should_receive(:to_str).and_return("bar")
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: { "\ufffd" => obj })
encoded.should == "Bbar"
end

it "does not call to_s on the returned value" do
obj = Object.new
obj.should_not_receive(:to_s)
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: { "\ufffd" => obj })
}.should raise_error(TypeError, "no implicit conversion of Object into String")
end

it "raises an error if the key is not present in the hash" do
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: { "foo" => "bar" })
}.should raise_error(Encoding::UndefinedConversionError, "U+FFFD from UTF-8 to US-ASCII")
end

it "raises an error if the value is itself invalid" do
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: { "\ufffd" => "\uffee" })
}.should raise_error(ArgumentError, "too big fallback string")
end

it "uses the hash's default value if set" do
hash = {}
hash.default = "bar"
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: hash)
encoded.should == "Bbar"
end

it "uses the result of calling default_proc if set" do
hash = {}
hash.default_proc = -> _, _ { "bar" }
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: hash)
encoded.should == "Bbar"
end
end

context "given an object inheriting from Hash" do
before do
klass = Class.new(Hash)
@hash_like = klass.new
@hash_like["\ufffd"] = "bar"
end

it "looks up the replacement value from the object" do
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: @hash_like)
encoded.should == "Bbar"
end
end

context "given an object responding to []" do
before do
klass = Class.new do
def [](c) = c.bytes.inspect
end
@hash_like = klass.new
end

it "calls [] on the object, passing the invalid character" do
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: @hash_like)
encoded.should == "B[239, 191, 189]"
end
end

context "given an object not responding to []" do
before do
@non_hash_like = Object.new
end

it "raises an error" do
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: @non_hash_like)
}.should raise_error(Encoding::UndefinedConversionError, "U+FFFD from UTF-8 to US-ASCII")
end
end

context "given a proc" do
it "calls the proc to get the replacement value, passing in the invalid character" do
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: proc { |c| c.bytes.inspect })
encoded.should == "B[239, 191, 189]"
end

it "calls to_str on the returned value" do
obj = Object.new
obj.should_receive(:to_str).and_return("bar")
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: proc { |c| obj })
encoded.should == "Bbar"
end

it "does not call to_s on the returned value" do
obj = Object.new
obj.should_not_receive(:to_s)
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: proc { |c| obj })
}.should raise_error(TypeError, "no implicit conversion of Object into String")
end

it "raises an error if the returned value is itself invalid" do
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: -> c { "\uffee" })
}.should raise_error(ArgumentError, "too big fallback string")
end
end

context "given a lambda" do
it "calls the lambda to get the replacement value, passing in the invalid character" do
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: -> c { c.bytes.inspect })
encoded.should == "B[239, 191, 189]"
end

it "calls to_str on the returned value" do
obj = Object.new
obj.should_receive(:to_str).and_return("bar")
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: -> c { obj })
encoded.should == "Bbar"
end

it "does not call to_s on the returned value" do
obj = Object.new
obj.should_not_receive(:to_s)
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: -> c { obj })
}.should raise_error(TypeError, "no implicit conversion of Object into String")
end

it "raises an error if the returned value is itself invalid" do
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: -> c { "\uffee" })
}.should raise_error(ArgumentError, "too big fallback string")
end
end

context "given a method" do
def replace(c) = c.bytes.inspect
def replace_bad(c) = "\uffee"

def replace_to_str(c)
obj = Object.new
obj.should_receive(:to_str).and_return("bar")
obj
end

def replace_to_s(c)
obj = Object.new
obj.should_not_receive(:to_s)
obj
end

it "calls the method to get the replacement value, passing in the invalid character" do
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: method(:replace))
encoded.should == "B[239, 191, 189]"
end

it "calls to_str on the returned value" do
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: method(:replace_to_str))
encoded.should == "Bbar"
end

it "does not call to_s on the returned value" do
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: method(:replace_to_s))
}.should raise_error(TypeError, "no implicit conversion of Object into String")
end

it "raises an error if the returned value is itself invalid" do
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: method(:replace_bad))
}.should raise_error(ArgumentError, "too big fallback string")
end
end
end

describe "given the xml: :text option" do
it "replaces all instances of '&' with '&'" do
NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do
Expand Down
34 changes: 25 additions & 9 deletions src/encoding_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,34 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje
}

auto handle_fallback = [&](nat_int_t cpt) {
Value result;
auto ch = new StringObject { orig_encoding->encode_codepoint(cpt) };
Value result = NilObject::the();
if (options.fallback_option->respond_to(env, "[]"_s)) {
result = options.fallback_option->send(env, "[]"_s, { Value::integer(cpt) });
result = options.fallback_option->send(env, "[]"_s, { ch });
} else if (options.fallback_option->respond_to(env, "call"_s)) {
result = options.fallback_option->send(env, "call"_s, { Value::integer(cpt) });
result = options.fallback_option->send(env, "call"_s, { ch });
}
temp_string.append(result->to_s(env));

if (result->is_nil()) {
auto message = StringObject::format(
"U+{} from {} to {}",
String::hex(cpt, String::HexFormat::Uppercase),
orig_encoding->name(),
name());
env->raise(EncodingClass->const_find(env, "UndefinedConversionError"_s)->as_class(), message);
}

auto result_str = result->to_str(env);
if (result_str->encoding()->num() != num()) {
try {
result_str = result_str->encode(env, EncodingObject::get(num()))->to_str(env);
} catch (ExceptionObject *e) {
// FIXME: I'm not sure how Ruby knows the character is "too big" for the target encoding.
// We don't have that kind of information yet.
env->raise("ArgumentError", "too big fallback string");
}
}
temp_string.append(result_str);
};

auto source_codepoint = valid ? c : -1;
Expand All @@ -68,10 +89,6 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje
if (source_codepoint == -1) {
switch (options.invalid_option) {
case EncodeInvalidOption::Raise:
if (options.fallback_option) {
handle_fallback(c);
continue;
}
env->raise_invalid_byte_sequence_error(this);
case EncodeInvalidOption::Replace:
if (options.replace_option) {
Expand All @@ -87,7 +104,6 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje
unicode_codepoint = orig_encoding->to_unicode_codepoint(source_codepoint);
}

// handle error
if (unicode_codepoint < 0) {
StringObject *message;
if (num() != Encoding::UTF_8) {
Expand Down

0 comments on commit d8da0dd

Please sign in to comment.