From 5fb68a58952d9778f6aeea1a86879893d674cf79 Mon Sep 17 00:00:00 2001
From: Tomas Stupka <tomas.stupka@oracle.com>
Date: Tue, 10 Jan 2017 12:52:32 +0100
Subject: [PATCH] java.lang.String.split(separator) does not correspond to R strsplit(..., perl=FALSE)

---
Reviewer notes (text between the three-dash line and the diffstat is not
part of the commit message and is skipped when the patch is applied):

* Layout: this patch had lost its line breaks. They are restored here from
  the hunk headers (7/7, 9/36 and 5/30 lines). The leading indentation of
  context and added lines is reconstructed, not recovered - check it
  against the target tree, or apply with `git apply --ignore-space-change`.
* splitIntl, fixed branch: splits on the literal separator via
  String.indexOf, keeps empty pieces that precede a separator and drops an
  empty trailing piece. `lastIdx > input.length()` cannot become true after
  a successful indexOf, so that `break` is dead code. An empty separator
  would never advance lastIdx; presumably callers route empty separators
  elsewhere (emptySplitIntl exists) - TODO confirm.
* splitIntl, regex branch: `input.equals(separator)` compares the pattern
  text literally, so it also fires when the pattern contains regex
  metacharacters (e.g. pattern "a+" with input "a+") - TODO confirm that a
  single "" is the intended result in that case.

 .../r/nodes/builtin/base/GrepFunctions.java   | 33 +++++++++++++++++--
 .../r/test/builtins/TestBuiltin_strsplit.java | 25 ++++++++++++++
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java
index e4eabf251c..fde3ead9ca 100644
--- a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java
+++ b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java
@@ -1203,7 +1203,7 @@ public class GrepFunctions {
                         if (perl) {
                             resultItem = splitPerl(data, pcreSplits[i % splits.length]);
                         } else {
-                            resultItem = splitIntl(data, currentSplit);
+                            resultItem = splitIntl(data, currentSplit, fixed);
                         }
                         if (resultItem.getLength() == 0) {
                             if (fixed) {
@@ -1234,9 +1234,36 @@ public class GrepFunctions {
             }
         }
 
-        private static RStringVector splitIntl(String input, String separator) {
+        private static RStringVector splitIntl(String input, String separator, boolean fixed) {
             assert !RRuntime.isNA(input);
-            return RDataFactory.createStringVector(input.split(separator), true);
+
+            if (fixed) {
+                ArrayList<String> matches = new ArrayList<>();
+                int idx = input.indexOf(separator);
+                if (idx < 0) {
+                    return RDataFactory.createStringVector(input);
+                }
+                int lastIdx = 0;
+                while (idx > -1) {
+                    matches.add(input.substring(lastIdx, idx));
+                    lastIdx = idx + separator.length();
+                    if (lastIdx > input.length()) {
+                        break;
+                    }
+                    idx = input.indexOf(separator, lastIdx);
+                }
+                String m = input.substring(lastIdx);
+                if (!m.isEmpty()) {
+                    matches.add(m);
+                }
+                return RDataFactory.createStringVector(matches.toArray(new String[matches.size()]), false);
+            } else {
+                if (input.equals(separator)) {
+                    return RDataFactory.createStringVector("");
+                } else {
+                    return RDataFactory.createStringVector(input.split(separator), true);
+                }
+            }
         }
 
         private static RStringVector emptySplitIntl(String input) {
diff --git a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_strsplit.java b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_strsplit.java
index f6d29dc8ce..6dd39400b3 100644
--- a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_strsplit.java
+++ b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_strsplit.java
@@ -121,5 +121,30 @@ public class TestBuiltin_strsplit extends TestBase {
         assertEval("strsplit('oo bar baz', '[f z]', perl=TRUE)");
         assertEval("strsplit('foo \u1010ÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄbar baz ', '[f z]', perl=TRUE)");
         assertEval("strsplit('Ä Ä', '[ ]', perl=TRUE)");
+
+        assertEval("strsplit('1', '1', fixed=TRUE)");
+        assertEval("strsplit('11', '11', fixed=TRUE)");
+        assertEval("strsplit(c('1', '11'), c('1', '11'), fixed=TRUE)");
+        assertEval("strsplit('Ä', 'Ä', fixed=TRUE)");
+        assertEval("strsplit('ÄÄ', 'Ä', fixed=TRUE)");
+
+        assertEval("strsplit('1', '1', fixed=FALSE)");
+        assertEval("strsplit('11', '11', fixed=FALSE)");
+        assertEval("strsplit(c('1', '11'), c('1', '11'), fixed=FALSE)");
+        assertEval("strsplit('Ä', 'Ä', fixed=FALSE)");
+        assertEval("strsplit('ÄÄ', 'Ä', fixed=FALSE)");
+
+        assertEval("strsplit(c('111', '1'), c('111', '1'), fixed=TRUE)");
+        assertEval("strsplit(c('1', ''), c('1', ''), fixed=TRUE)");
+        assertEval("strsplit(c('1', 'b'), c('1', 'b'), fixed=TRUE)");
+        assertEval("strsplit(c('a1a', 'a1b'), c('1', '1'), fixed=TRUE)");
+        assertEval("strsplit(c('a1a', 'a1b'), '1', fixed=TRUE)");
+
+        assertEval("strsplit(c('111', '1'), c('111', '1'), fixed=FALSE)");
+        assertEval("strsplit(c('1', ''), c('1', ''), fixed=FALSE)");
+        assertEval("strsplit(c('1', 'b'), c('1', 'b'), fixed=FALSE)");
+        assertEval("strsplit(c('a1a', 'a1b'), c('1', '1'), fixed=FALSE)");
+        assertEval("strsplit(c('a1a', 'a1b'), '1', fixed=FALSE)");
+
     }
 }
-- 
GitLab