Index: src/org/jruby/evaluator/EvaluationState.java =================================================================== --- src/org/jruby/evaluator/EvaluationState.java (revision 3480) +++ src/org/jruby/evaluator/EvaluationState.java (working copy) @@ -897,7 +897,7 @@ } try { - return RubyRegexp.newRegexp(runtime, string.toString(), iVisited.getOptions(), lang); + return RubyRegexp.newRegexp(runtime, string.getByteList(), iVisited.getOptions(), lang); } catch(jregex.PatternSyntaxException e) { // System.err.println(iVisited.getValue().toString()); // e.printStackTrace(); Index: src/org/jruby/RubyRegexp.java =================================================================== --- src/org/jruby/RubyRegexp.java (revision 3480) +++ src/org/jruby/RubyRegexp.java (working copy) @@ -174,14 +174,19 @@ } } + public void initialize(ByteList regex, int options) { + try { + pattern = REGEXP_TRANSLATOR.translate(regex, options, code.flags()); + flags = REGEXP_TRANSLATOR.flagsFor(options, code.flags()); + } catch(jregex.PatternSyntaxException e) { + // System.err.println(regex); + // e.printStackTrace(); + throw getRuntime().newRegexpError(e.getMessage()); + } + } + public void initialize(String regex, int options) { try { - if(getCode() == KCode.UTF8) { - try { - regex = new String(ByteList.plain(regex),"UTF8"); - } catch(Exception e) { - } - } pattern = REGEXP_TRANSLATOR.translate(regex, options, code.flags()); flags = REGEXP_TRANSLATOR.flagsFor(options, code.flags()); } catch(jregex.PatternSyntaxException e) { @@ -218,7 +223,7 @@ // Methods of the Regexp class (rb_reg_*): public static RubyRegexp newRegexp(RubyString str, int options, String lang) { - return newRegexp(str.getRuntime(), str.toString(), options, lang); + return newRegexp(str.getRuntime(), str.getByteList(), options, lang); } public static RubyRegexp newRegexp(Ruby runtime, Pattern pattern, int flags, String lang) { @@ -236,6 +241,13 @@ return re; } + public static RubyRegexp newRegexp(Ruby runtime, ByteList str, int options, String kcode) { + RubyRegexp re = new RubyRegexp(runtime); + re.code = KCode.create(runtime, kcode); + re.initialize(str, options); + return re; + } + public static RubyRegexp newInstance(IRubyObject recv, IRubyObject[] args) { RubyClass klass = (RubyClass)recv; @@ -247,10 +259,10 @@ } public IRubyObject initialize(IRubyObject[] args) { - String pat = + ByteList pat = (args[0] instanceof RubyRegexp) - ? ((RubyRegexp) args[0]).source().toString() - : RubyString.stringValue(args[0]).toString(); + ? ((RubyRegexp) args[0]).source().getByteList() + : RubyString.stringValue(args[0]).getByteList(); int opts = 0; if (args.length > 1) { if (args[1] instanceof RubyFixnum) { @@ -782,7 +794,7 @@ public static RubyRegexp unmarshalFrom(UnmarshalStream input) throws java.io.IOException { RubyRegexp result = newRegexp(input.getRuntime(), - RubyString.byteListToString(input.unmarshalString()), input.unmarshalInt(), null); + input.unmarshalString(), input.unmarshalInt(), null); input.registerLinkTarget(result); return result; } Index: src/org/jruby/RegexpTranslator.java =================================================================== --- src/org/jruby/RegexpTranslator.java (revision 3480) +++ src/org/jruby/RegexpTranslator.java (working copy) @@ -1,5 +1,5 @@ /** - * + * */ package org.jruby; @@ -11,91 +11,104 @@ import jregex.TextBuffer; import org.jruby.parser.ReOptions; +import org.jruby.util.ByteList; public class RegexpTranslator { - + private static final Pattern SHARP_IN_CHARACTER_CLASS_PATTERN = new Pattern("(\\[[^]]*)#(.*?])"); - private static final Pattern SPACE_IN_CHARACTER_CLASS_PATTERN = new Pattern("(\\[[^]]*) (.*?])"); - private static final Pattern COMMENT_PATTERN = new Pattern("\\(\\?#[^)]*\\)"); - private static final Pattern COMMENT2_PATTERN = new Pattern("(? 0) { flags |= REFlags.IGNORE_CASE; } if ((options & ReOptions.RE_OPTION_EXTENDED) > 0) { - flags |= REFlags.IGNORE_SPACES; + flags |= REFlags.IGNORE_SPACES; } if ((options & ReOptions.RE_OPTION_MULTILINE) > 0) { - flags |= REFlags.DOTALL; + flags |= REFlags.DOTALL; } - return flags; - } + return flags; + } } Index: src/org/jruby/RubyString.java =================================================================== --- src/org/jruby/RubyString.java (revision 3480) +++ src/org/jruby/RubyString.java (working copy) @@ -2283,13 +2283,14 @@ // get the pattern based on args if (args.length == 0 || args[0].isNil()) { isWhitespace = true; - IRubyObject defaultPattern = runtime.getGlobalVariables().get("$;"); + // FIXME: Is this cast safe? + RubyString defaultPattern = (RubyString)runtime.getGlobalVariables().get("$;"); if (defaultPattern.isNil()) { pattern = RubyRegexp.newRegexp(runtime, "\\s+", 0, null); } else { // FIXME: Is toString correct here? - pattern = RubyRegexp.newRegexp(runtime, defaultPattern.toString(), 0, null); + pattern = RubyRegexp.newRegexp(runtime, defaultPattern.getByteList(), 0, null); } } else if (args[0] instanceof RubyRegexp) { // Even if we have whitespace-only explicit regexp we do not Index: src/org/jruby/parser/ReOptions.java =================================================================== --- src/org/jruby/parser/ReOptions.java (revision 3480) +++ src/org/jruby/parser/ReOptions.java (working copy) @@ -30,12 +30,14 @@ package org.jruby.parser; public interface ReOptions { + // FIXME: Is there a practical reason these weren't just literal values? int RE_OPTION_IGNORECASE = 1; - int RE_OPTION_EXTENDED = (RE_OPTION_IGNORECASE << 1); - int RE_OPTION_MULTILINE = (RE_OPTION_EXTENDED << 1); - int RE_OPTION_SINGLELINE = (RE_OPTION_MULTILINE << 1); + int RE_OPTION_EXTENDED = 2; + int RE_OPTION_MULTILINE = 4; + int RE_OPTION_SINGLELINE = 8; int RE_OPTION_POSIXLINE = (RE_OPTION_MULTILINE | RE_OPTION_SINGLELINE); - int RE_OPTION_LONGEST = (RE_OPTION_SINGLELINE << 1); - int RE_MAY_IGNORECASE = (RE_OPTION_LONGEST << 1); + int RE_OPTION_LONGEST = 16; + int RE_MAY_IGNORECASE = 32; + int RE_UNICODE = 64; int RE_OPTION_ONCE = 0x80; // odd...but it is odd in ruby too. } Index: src/org/jruby/util/ByteList.java =================================================================== --- src/org/jruby/util/ByteList.java (revision 3480) +++ src/org/jruby/util/ByteList.java (working copy) @@ -31,6 +31,7 @@ package org.jruby.util; import java.io.Serializable; +import java.io.UnsupportedEncodingException; /** @@ -583,8 +584,24 @@ } public String toString() { - return new String(plain(this.bytes, begin, realSize)); + try { + return toString("ISO-8859-1"); + } catch (UnsupportedEncodingException uee) { + throw new RuntimeException("ISO-8859-1 encoding should never fail; report this at www.jruby.org"); + } } + + public String toUtf8String() { + try { + return toString("UTF-8"); + } catch (UnsupportedEncodingException uee) { + throw new RuntimeException("UTF-8 encoding should never fail; report this at www.jruby.org"); + } + } + + public String toString(String encoding) throws UnsupportedEncodingException { + return new String(this.bytes, begin, realSize, encoding); + } public static ByteList create(CharSequence s) { return new ByteList(plain(s),false); Index: src/org/jruby/ast/RegexpNode.java =================================================================== --- src/org/jruby/ast/RegexpNode.java (revision 3480) +++ src/org/jruby/ast/RegexpNode.java (working copy) @@ -83,7 +83,7 @@ public int getFlags() { if (pattern == null) { - pattern = translator.translate(value.toString(), options, 0); + pattern = translator.translate(value, options, 0); flags = translator.flagsFor(options,0); } return flags; @@ -91,7 +91,7 @@ public Pattern getPattern() { if (pattern == null) { - pattern = translator.translate(value.toString(), options, 0); + pattern = translator.translate(value, options, 0); flags = translator.flagsFor(options,0); } return pattern;