[GR-18163] Fix rb_enc_left_char_head() (#3267)

PullRequest: truffleruby/4013
oracle · Sep 20, 2023 · c77f8bb · c77f8bb
2 parents 9f8ff77 + 4d17ba4
commit c77f8bb
Show file tree

Hide file tree

Showing 5 changed files with 29 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,7 @@ New features:
 
 Bug fixes:
 
+* Fix `rb_enc_left_char_head()` so it no longer always raises `ArgumentError` (#3267, @eregon).
 
 Compatibility:
 

diff --git a/lib/cext/ABI_check.txt b/lib/cext/ABI_check.txt
@@ -1 +1 @@
-4
+5
diff --git a/spec/ruby/optional/capi/encoding_spec.rb b/spec/ruby/optional/capi/encoding_spec.rb
@@ -674,6 +674,22 @@
     end
   end
 
+  describe "rb_enc_left_char_head" do
+    it 'returns the head position of a character' do
+      @s.rb_enc_left_char_head("é", 1).should == 0
+      @s.rb_enc_left_char_head("éééé", 7).should == 6
+
+      @s.rb_enc_left_char_head("a", 0).should == 0
+
+      # offset equals the string's byte length (p == end); unclear if this is intended to work
+      @s.rb_enc_left_char_head("a", 1).should == 1
+
+      # Works because for single-byte encodings rb_enc_left_char_head() just returns the pointer
+      @s.rb_enc_left_char_head("a".force_encoding(Encoding::US_ASCII), 88).should == 88
+      @s.rb_enc_left_char_head("a".b, 88).should == 88
+    end
+  end
+
   describe "ONIGENC_MBC_CASE_FOLD" do
     it "returns the correct case fold for the given string" do
       @s.ONIGENC_MBC_CASE_FOLD("lower").should == ["l", 1]

diff --git a/spec/ruby/optional/capi/ext/encoding_spec.c b/spec/ruby/optional/capi/ext/encoding_spec.c
@@ -307,6 +307,12 @@ static VALUE encoding_spec_rb_enc_strlen(VALUE self, VALUE str, VALUE length, VA
   return LONG2FIX(rb_enc_strlen(p, e, rb_to_encoding(encoding)));
 }
 
+static VALUE encoding_spec_rb_enc_left_char_head(VALUE self, VALUE str, VALUE offset) {
+  char *ptr = RSTRING_PTR(str);
+  char *result = rb_enc_left_char_head(ptr, ptr + NUM2INT(offset), RSTRING_END(str), rb_enc_get(str));
+  return LONG2NUM(result - ptr);
+}
+
 void Init_encoding_spec(void) {
   VALUE cls;
   native_rb_encoding_pointer = (rb_encoding**) malloc(sizeof(rb_encoding*));
@@ -364,6 +370,7 @@ void Init_encoding_spec(void) {
   rb_define_method(cls, "rb_enc_str_asciionly_p", encoding_spec_rb_enc_str_asciionly_p, 1);
   rb_define_method(cls, "rb_uv_to_utf8", encoding_spec_rb_uv_to_utf8, 2);
   rb_define_method(cls, "ONIGENC_MBC_CASE_FOLD", encoding_spec_ONIGENC_MBC_CASE_FOLD, 1);
+  rb_define_method(cls, "rb_enc_left_char_head", encoding_spec_rb_enc_left_char_head, 2);
 }
 
 #ifdef __cplusplus

diff --git a/src/main/c/cext/encoding.c b/src/main/c/cext/encoding.c
@@ -226,7 +226,10 @@ int rb_enc_get_index(VALUE obj) {
 }
 
 char* rb_enc_left_char_head(const char *start, const char *p, const char *end, rb_encoding *enc) {
-  int length = start - end;
+  if (p <= start || p >= end) {
+    return p;
+  }
+  int length = end - start;
   int position = polyglot_as_i32(polyglot_invoke(RUBY_CEXT, "rb_enc_left_char_head",
       rb_tr_unwrap(rb_enc_from_encoding(enc)),
       rb_tr_unwrap(rb_str_new(start, length)),
Original file line number	Diff line number	Diff line change
Expand Up		@@ -5,6 +5,7 @@ New features:

		Bug fixes:

		* Fix `rb_enc_left_char_head()` so it no longer always raises `ArgumentError` (#3267, @eregon).

		Compatibility:

Expand Down