Index: /Users/ias/workspace/jruby/src/jregex/Pattern.java
===================================================================
--- /Users/ias/workspace/jruby/src/jregex/Pattern.java	(revision 3721)
+++ /Users/ias/workspace/jruby/src/jregex/Pattern.java	(working copy)
@@ -32,6 +32,8 @@
 import java.io.*;
 import java.util.*;
 
+import org.jruby.util.ByteList;
+
 /**
  * A handle for a precompiled regular expression.
  * To match a regular expression myExpr against a text myString one should first create a Pattern object:
@@ -126,7 +128,6 @@
    * see REFlags
    */
    public Pattern(String regex,String flags) throws PatternSyntaxException{
-      stringRepr=regex;
       compile(regex,parseFlags(flags));
    }
    
@@ -160,6 +161,15 @@
    */
    
    protected void compile(String regex,int flags) throws PatternSyntaxException{
+      // Compile from the UTF-8 decoding of the bytes ByteList.plain(regex) returns.
+      // NOTE(review): applied to every pattern regardless of flags -- confirm intent.
+      try {
+         regex = new String(ByteList.plain(regex),"UTF8");
+      } catch (UnsupportedEncodingException e) {
+         // UTF-8 is a charset every JVM must support, so this cannot happen;
+         // fail loudly rather than compiling the undecoded pattern.
+         throw new RuntimeException("UTF8 charset unavailable", e);
+      }
       stringRepr=regex;
       Term.makeTree(regex,flags,this);
    }
Index: /Users/ias/workspace/jruby/src/org/jruby/RubyMatchData.java
===================================================================
--- /Users/ias/workspace/jruby/src/org/jruby/RubyMatchData.java	(revision 3721)
+++ /Users/ias/workspace/jruby/src/org/jruby/RubyMatchData.java	(working copy)
@@ -33,6 +33,8 @@
  ***** END LICENSE BLOCK *****/
 package org.jruby;
 
+import java.io.UnsupportedEncodingException;
+
 import jregex.Matcher;
 import org.jruby.runtime.Arity;
 import org.jruby.runtime.CallbackFactory;
@@ -282,7 +284,9 @@
             // JRUBY-97, but at the same time the testcase remained very slow
             // The additional minor optimizations to RubyString as part of the fix
             // dramatically improve the performance. 
-            return getRuntime().newString(matcher.group((int)n));
+    
+            return RubyString.newUnicodeString(getRuntime(), matcher.group((int)n));
+//            return getRuntime().newString(matcher.group((int)n));
         }
 
         public RubyString pre_match() {
@@ -319,6 +323,17 @@
         public IRubyObject doClone() {
             return new JavaString(getRuntime(), original, matcher);
         }
+        
+        /** Returns the UTF-8 byte length of matcher.prefix() (presumably the text before the match). */
+        public int matchStartPosition() {
+            try {
+                return matcher.prefix().getBytes("UTF8").length;
+            } catch (UnsupportedEncodingException e) {
+                // UTF-8 is a charset every JVM must support, so this cannot happen;
+                // fail loudly instead of silently reporting a start offset of 0.
+                throw new RuntimeException("UTF8 charset unavailable", e);
+            }
+        }
     }
 
     public static final class RString extends RubyMatchData {
Index: /Users/ias/workspace/jruby/src/org/jruby/RubyRegexp.java
===================================================================
--- /Users/ias/workspace/jruby/src/org/jruby/RubyRegexp.java	(revision 3721)
+++ /Users/ias/workspace/jruby/src/org/jruby/RubyRegexp.java	(working copy)
@@ -552,6 +552,9 @@
         String t = target;
         if(utf8) {
             try {
+                byte[] bs = ByteList.plain(target);
+                String string = new String(bs, 0, startPos, "UTF8");
+                startPos = string.length();
                 t = new String(ByteList.plain(target),"UTF8");
             } catch(Exception e) {
             }
Index: /Users/ias/workspace/jruby/src/org/jruby/RubyString.java
===================================================================
--- /Users/ias/workspace/jruby/src/org/jruby/RubyString.java	(revision 3721)
+++ /Users/ias/workspace/jruby/src/org/jruby/RubyString.java	(working copy)
@@ -1796,9 +1796,20 @@
             }
 
             if (repl.isTaint()) tainted = true;
-            
             int startZ = mat.start(0);
+            try {
+                startZ = str.substring(0, startZ).getBytes("UTF8").length;
+            } catch (UnsupportedEncodingException e) {
+                // TODO Auto-generated catch block
+                e.printStackTrace();
+            }
             int plen = mat.end(0) - startZ; 
+            try {
+                plen = mat.group(0).getBytes("UTF8").length;
+            } catch (UnsupportedEncodingException e) {
+                // TODO Auto-generated catch block
+                e.printStackTrace();
+            }
             ByteList replValue = ((RubyString)repl).value;
             
             if (replValue.realSize > plen) { // this might be smarter by being real bytes length aware
@@ -1977,8 +1988,10 @@
         //FIXME may be a problem with pos when doing reverse searches
         int pos = !reverse ? 0 : value.length();
 
+        boolean offset = false;
         if (Arity.checkArgumentCount(getRuntime(), args, 1, 2) == 2) {
             pos = RubyNumeric.fix2int(args[1]);
+            offset = true;
         }
         if (pos < 0) {
             pos += value.length();
@@ -1993,6 +2006,13 @@
             pos = ((RubyRegexp) args[0]).search(toString(), this, reverse ? 0 : pos);
 
             int dummy = pos;
+            if (offset) {
+                dummy = doNotLookPastIfReverse;
+                if (dummy > pos) {
+                    pos = -1;
+                    dummy = -1;
+                }
+            }
             while (reverse && dummy > -1 && dummy <= doNotLookPastIfReverse) {
                 pos = dummy;
                 dummy = ((RubyRegexp) args[0]).search(toString(), this, pos + 1);
@@ -2417,6 +2437,7 @@
             boolean utf8 = false; 
             String str;
             
+            RubyRegexp rr =(RubyRegexp)spat;
             if (runtime.getKCode() == KCode.UTF8) {
                 // We're in UTF8 mode; try to convert the string to UTF8, but fall back on raw bytes if we can't decode
                 // TODO: all this decoder and charset stuff could be centralized...in KCode perhaps?
@@ -2431,11 +2452,12 @@
                     // ignore, just use the unencoded string
                     str = toString();
                 }
-            } else {            
-                str = toString();
+            } else {
+                utf8 = rr.getCode() == KCode.UTF8; 
+                str = toString(utf8);
             }
 
-            Pattern pat = ((RubyRegexp)spat).getPattern();
+            Pattern pat = rr.getPattern();
             Matcher mat = pat.matcher(str);
             beg = 0;
             boolean lastNull = false;