From 5fb68a58952d9778f6aeea1a86879893d674cf79 Mon Sep 17 00:00:00 2001
From: Tomas Stupka <tomas.stupka@oracle.com>
Date: Tue, 10 Jan 2017 12:52:32 +0100
Subject: [PATCH] java.lang.String.split(separator) does not correspond to R
 strsplit(..., perl=FALSE)

---
 .../r/nodes/builtin/base/GrepFunctions.java   | 33 +++++++++++++++++--
 .../r/test/builtins/TestBuiltin_strsplit.java | 25 ++++++++++++++
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java
index e4eabf251c..fde3ead9ca 100644
--- a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java
+++ b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java
@@ -1203,7 +1203,7 @@ public class GrepFunctions {
                         if (perl) {
                             resultItem = splitPerl(data, pcreSplits[i % splits.length]);
                         } else {
-                            resultItem = splitIntl(data, currentSplit);
+                            resultItem = splitIntl(data, currentSplit, fixed);
                         }
                         if (resultItem.getLength() == 0) {
                             if (fixed) {
@@ -1234,9 +1234,36 @@ public class GrepFunctions {
             }
         }
 
-        private static RStringVector splitIntl(String input, String separator) {
+        private static RStringVector splitIntl(String input, String separator, boolean fixed) {
             assert !RRuntime.isNA(input);
-            return RDataFactory.createStringVector(input.split(separator), true);
+            // fixed: split on the literal separator text; otherwise separator is handed to String.split as a regex.
+            if (fixed) {
+                ArrayList<String> matches = new ArrayList<>();
+                int idx = input.indexOf(separator);
+                if (idx < 0) { // separator absent: the result is built from the input alone
+                    return RDataFactory.createStringVector(input);
+                }
+                int lastIdx = 0; // start of the piece that has not been emitted yet
+                while (idx > -1) { // NOTE(review): an empty separator would never advance; presumably filtered by the caller - confirm
+                    matches.add(input.substring(lastIdx, idx)); // text between the previous and the current match
+                    lastIdx = idx + separator.length(); // resume right after the matched separator
+                    if (lastIdx > input.length()) { // NOTE(review): looks unreachable, an indexOf match always fits inside input
+                        break;
+                    }
+                    idx = input.indexOf(separator, lastIdx);
+                }
+                String m = input.substring(lastIdx);
+                if (!m.isEmpty()) { // an empty remainder after the last separator is not emitted
+                    matches.add(m);
+                }
+                return RDataFactory.createStringVector(matches.toArray(new String[matches.size()]), false);
+            } else {
+                if (input.equals(separator)) { // for a plain-text separator String.split would yield an empty array here
+                    return RDataFactory.createStringVector("");
+                } else {
+                    return RDataFactory.createStringVector(input.split(separator), true);
+                }
+            }
         }
 
         private static RStringVector emptySplitIntl(String input) {
diff --git a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_strsplit.java b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_strsplit.java
index f6d29dc8ce..6dd39400b3 100644
--- a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_strsplit.java
+++ b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_strsplit.java
@@ -121,5 +121,30 @@ public class TestBuiltin_strsplit extends TestBase {
         assertEval("strsplit('oo bar baz', '[f z]', perl=TRUE)");
         assertEval("strsplit('foo \u1010ÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄbar baz ', '[f z]', perl=TRUE)");
         assertEval("strsplit('Ä Ä', '[ ]', perl=TRUE)");
+
+        assertEval("strsplit('1', '1', fixed=TRUE)");
+        assertEval("strsplit('11', '11', fixed=TRUE)");
+        assertEval("strsplit(c('1', '11'), c('1', '11'), fixed=TRUE)");
+        assertEval("strsplit('Ä', 'Ä', fixed=TRUE)");
+        assertEval("strsplit('ÄÄ', 'Ä', fixed=TRUE)");
+
+        assertEval("strsplit('1', '1', fixed=FALSE)");
+        assertEval("strsplit('11', '11', fixed=FALSE)");
+        assertEval("strsplit(c('1', '11'), c('1', '11'), fixed=FALSE)");
+        assertEval("strsplit('Ä', 'Ä', fixed=FALSE)");
+        assertEval("strsplit('ÄÄ', 'Ä', fixed=FALSE)");
+
+        assertEval("strsplit(c('111', '1'), c('111', '1'), fixed=TRUE)");
+        assertEval("strsplit(c('1', ''), c('1', ''), fixed=TRUE)");
+        assertEval("strsplit(c('1', 'b'), c('1', 'b'), fixed=TRUE)");
+        assertEval("strsplit(c('a1a', 'a1b'), c('1', '1'), fixed=TRUE)");
+        assertEval("strsplit(c('a1a', 'a1b'), '1', fixed=TRUE)");
+
+        assertEval("strsplit(c('111', '1'), c('111', '1'), fixed=FALSE)");
+        assertEval("strsplit(c('1', ''), c('1', ''), fixed=FALSE)");
+        assertEval("strsplit(c('1', 'b'), c('1', 'b'), fixed=FALSE)");
+        assertEval("strsplit(c('a1a', 'a1b'), c('1', '1'), fixed=FALSE)");
+        assertEval("strsplit(c('a1a', 'a1b'), '1', fixed=FALSE)");
+
     }
 }
-- 
GitLab