Index: src/org/jruby/evaluator/EvaluationState.java =================================================================== --- src/org/jruby/evaluator/EvaluationState.java (revision 3765) +++ src/org/jruby/evaluator/EvaluationState.java (working copy) @@ -1496,10 +1496,8 @@ lang = "u"; } try { - return RubyRegexp.newRegexp(runtime, iVisited.getValue().toString(), iVisited.getPattern(), iVisited.getFlags(), lang); + return RubyRegexp.newRegexp(runtime, iVisited.getValue(), iVisited.getPattern(), iVisited.getFlags(), lang); } catch(jregex.PatternSyntaxException e) { - // System.err.println(iVisited.getValue().toString()); - // e.printStackTrace(); throw runtime.newRegexpError(e.getMessage()); } } Index: src/org/jruby/RubyRegexp.java =================================================================== --- src/org/jruby/RubyRegexp.java (revision 3765) +++ src/org/jruby/RubyRegexp.java (working copy) @@ -72,7 +72,7 @@ * Warning: THIS IS NOT REALLY SUPPORTED BY JRUBY. */ - private String source; + private ByteList source; private Pattern pattern; private KCode code; private int flags; @@ -177,16 +177,22 @@ return super.callMethod(context, rubyclass, name, args, callType, block); } } + + public void initialize(ByteList regex, int options) { + try { + source = regex; + pattern = REGEXP_TRANSLATOR.translate(regex, options, code.flags()); + flags = REGEXP_TRANSLATOR.flagsFor(options, code.flags()); + } catch(jregex.PatternSyntaxException e) { + // System.err.println(regex); + // e.printStackTrace(); + throw getRuntime().newRegexpError(e.getMessage()); + } + } public void initialize(String regex, int options) { try { - if(getCode() == KCode.UTF8) { - try { - regex = new String(ByteList.plain(regex),"UTF8"); - } catch(Exception e) { - } - } - source = regex; + source = ByteList.create(regex); pattern = REGEXP_TRANSLATOR.translate(regex, options, code.flags()); flags = REGEXP_TRANSLATOR.flagsFor(options, code.flags()); } catch(jregex.PatternSyntaxException e) { @@ -223,12 +229,21 @@ // Methods of the Regexp class (rb_reg_*): public static RubyRegexp newRegexp(RubyString str, int options, String lang) { - return newRegexp(str.getRuntime(), str.toString(), options, lang); + return newRegexp(str.getRuntime(), str.getByteList(), options, lang); } public static RubyRegexp newRegexp(Ruby runtime, String source, Pattern pattern, int flags, String lang) { RubyRegexp re = new RubyRegexp(runtime); re.code = KCode.create(runtime, lang); + re.source = ByteList.create(source); + re.pattern = pattern; + re.flags = flags; + return re; + } + + public static RubyRegexp newRegexp(Ruby runtime, ByteList source, Pattern pattern, int flags, String lang) { + RubyRegexp re = new RubyRegexp(runtime); + re.code = KCode.create(runtime, lang); re.source = source; re.pattern = pattern; re.flags = flags; @@ -242,6 +257,13 @@ return re; } + public static RubyRegexp newRegexp(Ruby runtime, ByteList str, int options, String kcode) { + RubyRegexp re = new RubyRegexp(runtime); + re.code = KCode.create(runtime, kcode); + re.initialize(str, options); + return re; + } + public static RubyRegexp newInstance(IRubyObject recv, IRubyObject[] args) { RubyClass klass = (RubyClass)recv; @@ -253,10 +275,10 @@ } public IRubyObject initialize(IRubyObject[] args) { - String pat = + ByteList pat = (args[0] instanceof RubyRegexp) - ? ((RubyRegexp) args[0]).source().toString() - : RubyString.stringValue(args[0]).toString(); + ? ((RubyRegexp) args[0]).source().getByteList() + : RubyString.stringValue(args[0]).getByteList(); int opts = 0; if (args.length > 1) { if (args[1] instanceof RubyFixnum) { @@ -392,6 +414,7 @@ if (target instanceof RubySymbol || target instanceof RubyHash || target instanceof RubyArray) { return getRuntime().getFalse(); } + // FIXME: make Unicode-aware RubyString ss = RubyString.stringValue(target); String string = ss.toString(); if (string.length() == 0 && "^$".equals(pattern.toString())) { @@ -552,6 +575,9 @@ String t = target; if(utf8) { try { + byte[] bs = ByteList.plain(target); + String string = new String(bs, 0, startPos, "UTF8"); + startPos = string.length(); t = new String(ByteList.plain(target),"UTF8"); } catch(Exception e) { } @@ -691,8 +717,8 @@ * */ public IRubyObject inspect() { - final String regex = source; - final int length = regex.length(); + final ByteList regex = source; + final int length = regex.length(); StringBuffer sb = new StringBuffer(length + 2); sb.append('/'); @@ -827,7 +853,7 @@ public static RubyRegexp unmarshalFrom(UnmarshalStream input) throws java.io.IOException { RubyRegexp result = newRegexp(input.getRuntime(), - RubyString.byteListToString(input.unmarshalString()), input.unmarshalInt(), null); + input.unmarshalString(), input.unmarshalInt(), null); input.registerLinkTarget(result); return result; } Index: src/org/jruby/RegexpTranslator.java =================================================================== --- src/org/jruby/RegexpTranslator.java (revision 3765) +++ src/org/jruby/RegexpTranslator.java (working copy) @@ -1,5 +1,5 @@ /** - * + * */ package org.jruby; @@ -11,91 +11,110 @@ import jregex.TextBuffer; import org.jruby.parser.ReOptions; +import org.jruby.util.ByteList; public class RegexpTranslator { - + private static final Pattern SHARP_IN_CHARACTER_CLASS_PATTERN = new Pattern("(\\[[^]]*)#(.*?])"); - private static final Pattern SPACE_IN_CHARACTER_CLASS_PATTERN = new Pattern("(\\[[^]]*) (.*?])"); - private static final Pattern COMMENT_PATTERN = new Pattern("\\(\\?#[^)]*\\)"); - private static final Pattern COMMENT2_PATTERN = new Pattern("(? 0) { flags |= REFlags.IGNORE_CASE; } if ((options & ReOptions.RE_OPTION_EXTENDED) > 0) { - flags |= REFlags.IGNORE_SPACES; + flags |= REFlags.IGNORE_SPACES; } if ((options & ReOptions.RE_OPTION_MULTILINE) > 0) { - flags |= REFlags.DOTALL; + flags |= REFlags.DOTALL; } - return flags; - } + // FIXME: This may be useful for something, but doesn't appear to be right + // for Ruby. It turns \w, \s, etc into Unicode forms, but that appears to + // break some test cases for us + //if ((options & ReOptions.RE_UNICODE) > 0) { + // flags |= REFlags.UNICODE; + //} + return flags; + } } Index: src/org/jruby/RubyString.java =================================================================== --- src/org/jruby/RubyString.java (revision 3765) +++ src/org/jruby/RubyString.java (working copy) @@ -1796,9 +1796,20 @@ } if (repl.isTaint()) tainted = true; - int startZ = mat.start(0); + try { + startZ = str.substring(0, startZ).getBytes("UTF8").length; + } catch (UnsupportedEncodingException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } int plen = mat.end(0) - startZ; + try { + plen = mat.group(0).getBytes("UTF8").length; + } catch (UnsupportedEncodingException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } ByteList replValue = ((RubyString)repl).value; if (replValue.realSize > plen) { // this might be smarter by being real bytes length aware @@ -1977,22 +1988,33 @@ //FIXME may be a problem with pos when doing reverse searches int pos = !reverse ? 0 : value.length(); + boolean offset = false; if (Arity.checkArgumentCount(getRuntime(), args, 1, 2) == 2) { pos = RubyNumeric.fix2int(args[1]); + offset = true; } if (pos < 0) { pos += value.length(); if (pos < 0) return getRuntime().getNil(); } + if (args[0] instanceof RubyRegexp) { + // save position we shouldn't look past int doNotLookPastIfReverse = pos; // RubyRegexp doesn't (yet?) support reverse searches, so we // find all matches and use the last one--very inefficient. - // XXX - find a better way + // FIXME: - find a better way pos = ((RubyRegexp) args[0]).search(toString(), this, reverse ? 0 : pos); - + if (pos == -1) return getRuntime().getNil(); int dummy = pos; + if (offset) { + pos = doNotLookPastIfReverse; + if (dummy > pos) { + pos = -1; + dummy = -1; + } + } while (reverse && dummy > -1 && dummy <= doNotLookPastIfReverse) { pos = dummy; dummy = ((RubyRegexp) args[0]).search(toString(), this, pos + 1); @@ -2369,6 +2391,7 @@ if (((RubyString)spat).value.get(0) == ' ') { awkSplit = true; } else { + // FIXME: Shouldn't this be unicode-aware? String stringPattern = RubyString.stringValue(spat).toString(); spat = RubyRegexp.newRegexp(runtime, RubyRegexp.escapeSpecialChars(stringPattern), 0, null); } @@ -2417,6 +2440,7 @@ boolean utf8 = false; String str; + RubyRegexp rr =(RubyRegexp)spat; if (runtime.getKCode() == KCode.UTF8) { // We're in UTF8 mode; try to convert the string to UTF8, but fall back on raw bytes if we can't decode // TODO: all this decoder and charset stuff could be centralized...in KCode perhaps? @@ -2431,11 +2455,12 @@ // ignore, just use the unencoded string str = toString(); } - } else { - str = toString(); + } else { + utf8 = rr.getCode() == KCode.UTF8; + str = toString(utf8); } - Pattern pat = ((RubyRegexp)spat).getPattern(); + Pattern pat = rr.getPattern(); Matcher mat = pat.matcher(str); beg = 0; boolean lastNull = false; Index: src/org/jruby/parser/ReOptions.java =================================================================== --- src/org/jruby/parser/ReOptions.java (revision 3765) +++ src/org/jruby/parser/ReOptions.java (working copy) @@ -31,11 +31,12 @@ public interface ReOptions { int RE_OPTION_IGNORECASE = 1; - int RE_OPTION_EXTENDED = (RE_OPTION_IGNORECASE << 1); - int RE_OPTION_MULTILINE = (RE_OPTION_EXTENDED << 1); - int RE_OPTION_SINGLELINE = (RE_OPTION_MULTILINE << 1); + int RE_OPTION_EXTENDED = 2; + int RE_OPTION_MULTILINE = 4; + int RE_OPTION_SINGLELINE = 8; int RE_OPTION_POSIXLINE = (RE_OPTION_MULTILINE | RE_OPTION_SINGLELINE); - int RE_OPTION_LONGEST = (RE_OPTION_SINGLELINE << 1); - int RE_MAY_IGNORECASE = (RE_OPTION_LONGEST << 1); + int RE_OPTION_LONGEST = 16; + int RE_MAY_IGNORECASE = 32; + int RE_UNICODE = 64; int RE_OPTION_ONCE = 0x80; // odd...but it is odd in ruby too. } Index: src/org/jruby/RubyMatchData.java =================================================================== --- src/org/jruby/RubyMatchData.java (revision 3765) +++ src/org/jruby/RubyMatchData.java (working copy) @@ -33,6 +33,8 @@ ***** END LICENSE BLOCK *****/ package org.jruby; +import java.io.UnsupportedEncodingException; + import jregex.Matcher; import org.jruby.runtime.Arity; import org.jruby.runtime.CallbackFactory; @@ -282,7 +284,9 @@ // JRUBY-97, but at the same time the testcase remained very slow // The additional minor optimizations to RubyString as part of the fix // dramatically improve the performance. - return getRuntime().newString(matcher.group((int)n)); + + return RubyString.newUnicodeString(getRuntime(), matcher.group((int)n)); +// return getRuntime().newString(matcher.group((int)n)); } public RubyString pre_match() { @@ -319,6 +323,17 @@ public IRubyObject doClone() { return new JavaString(getRuntime(), original, matcher); } + + public int matchStartPosition() { + int position = 0; + try { + position = matcher.prefix().getBytes("UTF8").length; + } catch (UnsupportedEncodingException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return position; + } } public static final class RString extends RubyMatchData { Index: src/org/jruby/util/ByteList.java =================================================================== --- src/org/jruby/util/ByteList.java (revision 3765) +++ src/org/jruby/util/ByteList.java (working copy) @@ -31,6 +31,7 @@ package org.jruby.util; import java.io.Serializable; +import java.io.UnsupportedEncodingException; /** @@ -515,11 +516,30 @@ /** * Remembers toString value, which is expensive for StringBuffer. + * + * @return an ISO-8859-1 representation of the byte list */ public String toString() { - if (stringValue == null) stringValue = new String(plain(bytes, begin, realSize)); - return stringValue; + try { + if (stringValue == null) stringValue = toString("ISO-8859-1"); + return stringValue; + } catch (UnsupportedEncodingException uee) { + throw new RuntimeException("ISO-8859-1 encoding should never fail; report this at www.jruby.org"); + } + } + + public String toUtf8String() { + // TODO: no caching? :( + try { + return toString("UTF-8"); + } catch (UnsupportedEncodingException uee) { + throw new RuntimeException("UTF-8 encoding should never fail; report this at www.jruby.org"); + } } + + public String toString(String encoding) throws UnsupportedEncodingException { + return new String(this.bytes, begin, realSize, encoding); + } public static ByteList create(CharSequence s) { return new ByteList(plain(s),false); Index: src/org/jruby/compiler/impl/StandardASMCompiler.java =================================================================== --- src/org/jruby/compiler/impl/StandardASMCompiler.java (revision 3765) +++ src/org/jruby/compiler/impl/StandardASMCompiler.java (working copy) @@ -65,6 +65,7 @@ import org.jruby.internal.runtime.GlobalVariables; import org.jruby.javasupport.util.CompilerHelpers; import org.jruby.lexer.yacc.ISourcePosition; +import org.jruby.parser.ReOptions; import org.jruby.parser.StaticScope; import org.jruby.runtime.Arity; import org.jruby.runtime.Block; @@ -1225,23 +1226,6 @@ invokeIRubyObject("getMetaClass", cg.sig(RubyClass.class)); } - private void getCRef() { - loadThreadContext(); - // FIXME: This doesn't seem *quite* right. If actually within a class...end, is self.getMetaClass the correct class? should be self, no? - invokeThreadContext("peekCRef", cg.sig(SinglyLinkedList.class)); - } - - private void newTypeError(String error) { - loadRuntime(); - getMethodAdapter().ldc(error); - invokeIRuby("newTypeError", cg.sig(RaiseException.class, cg.params(String.class))); - } - - private void getCurrentVisibility() { - loadThreadContext(); - invokeThreadContext("getCurrentVisibility", cg.sig(Visibility.class)); - } - private void println() { SkinnyMethodAdapter mv = getMethodAdapter(); @@ -1627,7 +1611,13 @@ loadRuntime(); // load string, for Regexp#source and Regexp#inspect - mv.ldc(value.toString()); + String regexpString = null; + if ((options & ReOptions.RE_UNICODE) > 0) { + regexpString = value.toUtf8String(); + } else { + regexpString = value.toString(); + } + mv.ldc(regexpString); // in current method, load the field to see if we've created a Pattern yet @@ -1642,7 +1632,7 @@ mv.visitFieldInsn(PUTSTATIC, classname, name_flags, cg.ci(Integer.TYPE)); loadRuntime(); - mv.ldc(value.toString()); + mv.ldc(regexpString); mv.ldc(new Integer(options)); invokeUtilityMethod("regexpLiteral",cg.sig(Pattern.class,cg.params(Ruby.class,String.class,Integer.TYPE))); mv.dup(); Index: src/org/jruby/ast/RegexpNode.java =================================================================== --- src/org/jruby/ast/RegexpNode.java (revision 3765) +++ src/org/jruby/ast/RegexpNode.java (working copy) @@ -83,16 +83,16 @@ public int getFlags() { if (pattern == null) { - pattern = translator.translate(value.toString(), options, 0); - flags = translator.flagsFor(options,0); + flags = RegexpTranslator.translateFlags(options); + pattern = translator.translate(value, options, flags); } return flags; } public Pattern getPattern() { if (pattern == null) { - pattern = translator.translate(value.toString(), options, 0); - flags = translator.flagsFor(options,0); + flags = RegexpTranslator.translateFlags(options); + pattern = translator.translate(value, options, flags); } return pattern; } Index: test/test_index =================================================================== --- test/test_index (revision 3765) +++ test/test_index (working copy) @@ -99,6 +99,7 @@ testSymbol.rb testTime.rb testUnboundMethod.rb +testUTF8Regex.rb testVariableAndMethod.rb testVisibility.rb testXML.rb