Index: /Users/ias/workspace/jruby/src/jregex/Pattern.java
===================================================================
--- /Users/ias/workspace/jruby/src/jregex/Pattern.java (revision 3721)
+++ /Users/ias/workspace/jruby/src/jregex/Pattern.java (working copy)
@@ -32,6 +32,8 @@
import java.io.*;
import java.util.*;
+import org.jruby.util.ByteList;
+
/**
* A handle for a precompiled regular expression.
* To match a regular expression myExpr against a text myString one should first create a Pattern object:
@@ -126,7 +128,6 @@
* see REFlags
*/
public Pattern(String regex,String flags) throws PatternSyntaxException{
- stringRepr=regex;
compile(regex,parseFlags(flags));
}
@@ -160,6 +161,15 @@
*/
protected void compile(String regex,int flags) throws PatternSyntaxException{
+ try {
+ // Re-read the regex as UTF-8: ByteList.plain presumably maps each char to
+ // one byte (TODO confirm), and those bytes are then decoded as UTF-8.
+ // NOTE(review): this runs for every pattern, whatever the flags - confirm.
+ regex = new String(ByteList.plain(regex),"UTF8");
+ } catch (UnsupportedEncodingException e) {
+ // Every Java platform must provide UTF-8, so this cannot happen.
+ throw new RuntimeException("UTF-8 charset is not available", e);
+ }
stringRepr=regex;
Term.makeTree(regex,flags,this);
}
Index: /Users/ias/workspace/jruby/src/org/jruby/RubyMatchData.java
===================================================================
--- /Users/ias/workspace/jruby/src/org/jruby/RubyMatchData.java (revision 3721)
+++ /Users/ias/workspace/jruby/src/org/jruby/RubyMatchData.java (working copy)
@@ -33,6 +33,8 @@
***** END LICENSE BLOCK *****/
package org.jruby;
+import java.io.UnsupportedEncodingException;
+
import jregex.Matcher;
import org.jruby.runtime.Arity;
import org.jruby.runtime.CallbackFactory;
@@ -282,7 +284,9 @@
// JRUBY-97, but at the same time the testcase remained very slow
// The additional minor optimizations to RubyString as part of the fix
// dramatically improve the performance.
- return getRuntime().newString(matcher.group((int)n));
+ // NOTE(review): presumably newUnicodeString stores the group as UTF-8
+ // bytes where newString did not - confirm against RubyString.
+ return RubyString.newUnicodeString(getRuntime(), matcher.group((int)n));
}
public RubyString pre_match() {
@@ -319,6 +323,17 @@
public IRubyObject doClone() {
return new JavaString(getRuntime(), original, matcher);
}
+
+ /** Returns the length in bytes of the UTF-8 encoding of matcher.prefix(). */
+ public int matchStartPosition() {
+ try {
+ return matcher.prefix().getBytes("UTF8").length;
+ } catch (UnsupportedEncodingException e) {
+ // Every Java platform must provide UTF-8, so this cannot happen; fail
+ // loudly rather than report a start position of 0.
+ throw new RuntimeException("UTF-8 charset is not available", e);
+ }
+ }
}
public static final class RString extends RubyMatchData {
Index: /Users/ias/workspace/jruby/src/org/jruby/RubyRegexp.java
===================================================================
--- /Users/ias/workspace/jruby/src/org/jruby/RubyRegexp.java (revision 3721)
+++ /Users/ias/workspace/jruby/src/org/jruby/RubyRegexp.java (working copy)
@@ -552,6 +552,9 @@
String t = target;
if(utf8) {
try {
+ byte[] bs = ByteList.plain(target);
+ // startPos is treated as a byte offset here: decode the bytes before it to get the char offset.
+ startPos = new String(bs, 0, startPos, "UTF8").length();
- t = new String(ByteList.plain(target),"UTF8");
+ t = new String(bs,"UTF8");
} catch(Exception e) {
}
Index: /Users/ias/workspace/jruby/src/org/jruby/RubyString.java
===================================================================
--- /Users/ias/workspace/jruby/src/org/jruby/RubyString.java (revision 3721)
+++ /Users/ias/workspace/jruby/src/org/jruby/RubyString.java (working copy)
@@ -1796,9 +1796,20 @@
}
if (repl.isTaint()) tainted = true;
-
int startZ = mat.start(0);
+ try {
+ startZ = str.substring(0, startZ).getBytes("UTF8").length; // char offset -> UTF-8 byte offset
+ } catch (UnsupportedEncodingException e) {
+ // Every Java platform must provide UTF-8, so this cannot happen.
+ throw new RuntimeException("UTF-8 charset is not available", e);
+ }
- int plen = mat.end(0) - startZ;
+ int plen; // UTF-8 byte length of the match; mat.end(0) is a char index and must not be mixed with startZ
+ try {
+ plen = mat.group(0).getBytes("UTF8").length;
+ } catch (UnsupportedEncodingException e) {
+ // Every Java platform must provide UTF-8, so this cannot happen.
+ throw new RuntimeException("UTF-8 charset is not available", e);
+ }
ByteList replValue = ((RubyString)repl).value;
if (replValue.realSize > plen) { // this might be smarter by being real bytes length aware
@@ -1977,8 +1988,10 @@
//FIXME may be a problem with pos when doing reverse searches
int pos = !reverse ? 0 : value.length();
+ boolean offset = false;
if (Arity.checkArgumentCount(getRuntime(), args, 1, 2) == 2) {
pos = RubyNumeric.fix2int(args[1]);
+ offset = true;
}
if (pos < 0) {
pos += value.length();
@@ -1993,6 +2006,13 @@
pos = ((RubyRegexp) args[0]).search(toString(), this, reverse ? 0 : pos);
int dummy = pos;
+ if (offset) {
+ dummy = doNotLookPastIfReverse;
+ if (dummy > pos) {
+ pos = -1;
+ dummy = -1;
+ }
+ }
while (reverse && dummy > -1 && dummy <= doNotLookPastIfReverse) {
pos = dummy;
dummy = ((RubyRegexp) args[0]).search(toString(), this, pos + 1);
@@ -2417,6 +2437,7 @@
boolean utf8 = false;
String str;
+ RubyRegexp rr =(RubyRegexp)spat;
if (runtime.getKCode() == KCode.UTF8) {
// We're in UTF8 mode; try to convert the string to UTF8, but fall back on raw bytes if we can't decode
// TODO: all this decoder and charset stuff could be centralized...in KCode perhaps?
@@ -2431,11 +2452,12 @@
// ignore, just use the unencoded string
str = toString();
}
- } else {
- str = toString();
+ } else {
+ utf8 = rr.getCode() == KCode.UTF8;
+ str = toString(utf8);
}
- Pattern pat = ((RubyRegexp)spat).getPattern();
+ Pattern pat = rr.getPattern();
Matcher mat = pat.matcher(str);
beg = 0;
boolean lastNull = false;