Skip to content

Commit ea0786b

Browse files
authored
Re-port new integer-scanning utility methods (#127)
GitHub: fix GH-125 This is a re-port of the C code from @byroot for fast base 10 and base 16 integer scanning. In #125, @kou pointed out that there is an intermittent failure in the JRuby extension. We were unable to confirm the exact circumstances that cause that failure, but this re-port should at least help reduce the chance that it is caused by a bug in the original Java code.
1 parent 9bee37e commit ea0786b

2 files changed

Lines changed: 57 additions & 46 deletions

File tree

.github/workflows/ci.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@ on:
44
- push
55
- pull_request
66

7+
env:
8+
JRUBY_OPTS: "-X+C" # temporarily force JRuby to compile, so Java exception trace will contain .rb lines
9+
710
jobs:
811
ruby-versions:
912
uses: ruby/actions/.github/workflows/ruby_versions.yml@master

ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java

Lines changed: 54 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
import org.jruby.RubyClass;
4040
import org.jruby.RubyFixnum;
4141
import org.jruby.RubyHash;
42+
import org.jruby.RubyInteger;
4243
import org.jruby.RubyMatchData;
4344
import org.jruby.RubyNumeric;
4445
import org.jruby.RubyObject;
@@ -563,39 +564,34 @@ public IRubyObject scan_base10_integer(ThreadContext context) {
563564
check(context);
564565
clearMatched();
565566

566-
if (!str.getEncoding().isAsciiCompatible()) {
567-
throw runtime.newEncodingCompatibilityError("ASCII incompatible encoding: " + str.getEncoding());
568-
}
569-
567+
strscanMustAsciiCompat(runtime);
570568

571569
ByteList bytes = str.getByteList();
572-
int curr = this.curr;
570+
int ptr = curr;
571+
int len = 0;
573572

574-
int bite = bytes.get(curr);
575-
if (bite == '-' || bite == '+') {
576-
curr++;
577-
bite = bytes.get(curr);
578-
}
573+
int remaining_len = bytes.realSize() - curr;
579574

580-
if (!(bite >= '0' && bite <= '9')) {
575+
if (remaining_len <= 0) {
581576
return context.nil;
582577
}
583578

584-
while (bite >= '0' && bite <= '9') {
585-
curr++;
586-
if (curr >= bytes.getRealSize()) {
587-
break;
588-
}
589-
bite = bytes.get(curr);
579+
if (bytes.get(ptr + len) == '-' || bytes.get(ptr + len) == '+') {
580+
len++;
581+
}
582+
583+
if (!Character.isDigit(bytes.get(ptr + len))) {
584+
return context.nil;
590585
}
591586

592-
int length = curr - this.curr;
593-
prev = this.curr;
594-
this.curr = curr;
595587
setMatched();
596-
adjustRegisters();
588+
prev = ptr;
597589

598-
return ConvertBytes.byteListToInum(runtime, bytes, prev, curr, 10, true);
590+
while (len < remaining_len && Character.isDigit(bytes.get(ptr + len))) {
591+
len++;
592+
}
593+
594+
return strscanParseInteger(runtime, bytes, ptr, len, 10);
599595
}
600596

601597
@JRubyMethod(name = "scan_base16_integer", visibility = PRIVATE)
@@ -604,44 +600,56 @@ public IRubyObject scan_base16_integer(ThreadContext context) {
604600
check(context);
605601
clearMatched();
606602

607-
if (!str.getEncoding().isAsciiCompatible()) {
608-
throw runtime.newEncodingCompatibilityError("ASCII incompatible encoding: " + str.getEncoding());
609-
}
610-
603+
strscanMustAsciiCompat(runtime);
611604

612605
ByteList bytes = str.getByteList();
613-
int curr = this.curr;
606+
int ptr = this.curr;
614607

615-
int bite = bytes.get(curr);
616-
if (bite == '-' || bite == '+') {
617-
curr++;
618-
bite = bytes.get(curr);
608+
int remaining_len = bytes.realSize() - ptr;
609+
610+
if (remaining_len <= 0) {
611+
return context.nil;
619612
}
620613

621-
if (bite == '0' && bytes.get(curr + 1) == 'x') {
622-
curr += 2;
623-
bite = bytes.get(curr);
614+
int len = 0;
615+
616+
if (bytes.get(ptr + len) == '-' || bytes.get(ptr + len) == '+') {
617+
len++;
624618
}
625619

626-
if (!((bite >= '0' && bite <= '9') || (bite >= 'a' && bite <= 'f') || (bite >= 'A' && bite <= 'F'))) {
627-
return context.nil;
620+
if ((remaining_len >= (len + 2)) && bytes.get(ptr + len) == '0' && bytes.get(ptr + len + 1) == 'x') {
621+
len += 2;
628622
}
629623

630-
while ((bite >= '0' && bite <= '9') || (bite >= 'a' && bite <= 'f') || (bite >= 'A' && bite <= 'F')) {
631-
curr++;
632-
if (curr >= bytes.getRealSize()) {
633-
break;
634-
}
635-
bite = bytes.get(curr);
624+
if (len >= remaining_len || !isHexChar(bytes.get(ptr + len))) {
625+
return context.nil;
636626
}
637627

638-
int length = curr - this.curr;
639-
prev = this.curr;
640-
this.curr = curr;
641628
setMatched();
642629
adjustRegisters();
630+
prev = ptr;
631+
632+
while (len < remaining_len && isHexChar(bytes.get(ptr + len))) {
633+
len++;
634+
}
635+
636+
return strscanParseInteger(runtime, bytes, ptr, len, 16);
637+
}
638+
639+
private RubyInteger strscanParseInteger(Ruby runtime, ByteList bytes, int ptr, int len, int base) {
640+
this.curr = ptr + len;
641+
642+
return ConvertBytes.byteListToInum(runtime, bytes, ptr, len, base, true);
643+
}
644+
645+
private void strscanMustAsciiCompat(Ruby runtime) {
646+
if (!str.getEncoding().isAsciiCompatible()) {
647+
throw runtime.newEncodingCompatibilityError("ASCII incompatible encoding: " + str.getEncoding());
648+
}
649+
}
643650

644-
return ConvertBytes.byteListToInum(runtime, bytes, prev, curr, 16, true);
651+
private static boolean isHexChar(int c) {
652+
return Character.isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
645653
}
646654

647655
@JRubyMethod(name = "unscan")

0 commit comments

Comments
 (0)