diff --git a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java index f36dcf3633f70c228f60ac3fcd56f68dd2341ae4..ae5ec546788a7b82dc55dc25c92317349582fcfb 100644 --- a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java +++ b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java @@ -565,6 +565,16 @@ public class GrepFunctions { @RBuiltin(name = "regexpr", kind = INTERNAL, parameterNames = {"pattern", "text", "ignore.case", "perl", "fixed", "useBytes"}, behavior = PURE) public abstract static class Regexp extends CommonCodeAdapter { + protected static final class IndexAndSize { + protected int index; + protected int size; + + public IndexAndSize(int index, int size) { + this.index = index; + this.size = size; + } + } + @Specialization @TruffleBoundary protected Object regexp(RAbstractStringVector patternArg, RAbstractStringVector vector, byte ignoreCaseL, byte perlL, byte fixedL, byte useBytesL) { @@ -572,34 +582,45 @@ public class GrepFunctions { boolean ignoreCase = RRuntime.fromLogical(ignoreCaseL); String pattern = RegExp.checkPreDefinedClasses(patternArg.getDataAt(0)); int[] result = new int[vector.getLength()]; + int[] matchLength = new int[vector.getLength()]; for (int i = 0; i < vector.getLength(); i++) { - result[i] = findIndex(pattern, vector.getDataAt(i), ignoreCase, fixedL == RRuntime.LOGICAL_TRUE).get(0); + IndexAndSize res = findIndexAndSize(pattern, vector.getDataAt(i), ignoreCase, fixedL == RRuntime.LOGICAL_TRUE).get(0); + result[i] = res.index; + matchLength[i] = res.size; } - // TODO attribute as per spec - return RDataFactory.createIntVector(result, RDataFactory.COMPLETE_VECTOR); + // TODO useBytes attribute as per spec + RIntVector ret = RDataFactory.createIntVector(result, RDataFactory.COMPLETE_VECTOR); + ret.setAttr("match.length", RDataFactory.createIntVector(matchLength, RDataFactory.COMPLETE_VECTOR)); + return ret; } - protected static List<Integer> findIndex(String pattern, String text, boolean ignoreCase, boolean fixed) { - List<Integer> list = new ArrayList<>(); + protected static List<IndexAndSize> findIndexAndSize(String pattern, String text, boolean ignoreCase, boolean fixed) { + List<IndexAndSize> list = new ArrayList<>(); if (fixed) { - int index; - if (ignoreCase) { - index = text.toLowerCase().indexOf(pattern.toLowerCase()); - } else { - index = text.indexOf(pattern); + int index = 0; + while (true) { + if (ignoreCase) { + index = text.toLowerCase().indexOf(pattern.toLowerCase(), index); + } else { + index = text.indexOf(pattern, index); + } + if (index == -1) { + break; + } + list.add(new IndexAndSize(index + 1, pattern.length())); + index += pattern.length(); } - list.add(index == -1 ? index : index + 1); } else { Matcher m = getPatternMatcher(pattern, text, ignoreCase); while (m.find()) { // R starts counting at index 1 - list.add(m.start() + 1); - } - if (list.size() > 0) { - return list; + list.add(new IndexAndSize(m.start() + 1, m.end() - m.start())); } - list.add(-1); } + if (list.size() > 0) { + return list; + } + list.add(new IndexAndSize(-1, -1)); return list; } @@ -622,17 +643,22 @@ public class GrepFunctions { boolean fixed = RRuntime.fromLogical(fixedL); Object[] result = new Object[vector.getLength()]; for (int i = 0; i < vector.getLength(); i++) { - int[] data = toIntArray(findIndex(pattern, vector.getDataAt(i), ignoreCase, fixed)); - result[i] = RDataFactory.createIntVector(data, RDataFactory.COMPLETE_VECTOR); - // TODO attributes as per spec + List<IndexAndSize> l = findIndexAndSize(pattern, vector.getDataAt(i), ignoreCase, fixed); + int[] indexes = toIndexOrSizeArray(l, true); + int[] sizes = toIndexOrSizeArray(l, false); + RIntVector res = RDataFactory.createIntVector(indexes, RDataFactory.COMPLETE_VECTOR); + res.setAttr("match.length", RDataFactory.createIntVector(sizes, RDataFactory.COMPLETE_VECTOR)); + result[i] = res; + // TODO useBytes attributes as per spec } return RDataFactory.createList(result); } - private static int[] toIntArray(List<Integer> list) { + private static int[] toIndexOrSizeArray(List<IndexAndSize> list, boolean index) { int[] arr = new int[list.size()]; for (int i = 0; i < list.size(); i++) { - arr[i] = list.get(i); + IndexAndSize res = list.get(i); + arr[i] = index ? res.index : res.size; } return arr; } diff --git a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test index 01e3fcd48ae14532a5137cb0329f0b4184b4e46e..747b14fce59ab65bc0324e28d1ca11432e604e52 100644 --- a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test +++ b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test @@ -20692,6 +20692,32 @@ attr(,"useBytes") [1] TRUE +##com.oracle.truffle.r.test.builtins.TestBuiltin_gregexpr.testRegExpr +#{ x<-gregexpr("foo", c("bar foo foo", "foo"), fixed=F); as.integer(c(x[[1]], x[[2]])) } +[1] 5 9 1 + +##com.oracle.truffle.r.test.builtins.TestBuiltin_gregexpr.testRegExpr +#{ x<-gregexpr("foo", c("bar foo foo", "foo"), fixed=F); list(attr(x[[1]], "match.length"), attr(x[[2]], "match.length")) } +[[1]] +[1] 3 3 + +[[2]] +[1] 3 + + +##com.oracle.truffle.r.test.builtins.TestBuiltin_gregexpr.testRegExpr +#{ x<-gregexpr("foo", c("bar foo foo", "foo"), fixed=T); as.integer(c(x[[1]], x[[2]])) } +[1] 5 9 1 + +##com.oracle.truffle.r.test.builtins.TestBuiltin_gregexpr.testRegExpr +#{ x<-gregexpr("foo", c("bar foo foo", "foo"), fixed=T); list(attr(x[[1]], "match.length"), attr(x[[2]], "match.length")) } +[[1]] +[1] 3 3 + +[[2]] +[1] 3 + + ##com.oracle.truffle.r.test.builtins.TestBuiltin_gregexpr.testgregexpr1 #argv <- list('', 'abc', FALSE, FALSE, FALSE, FALSE); .Internal(gregexpr(argv[[1]], argv[[2]], argv[[3]], argv[[4]], argv[[5]], argv[[6]])) [[1]] @@ -40138,6 +40164,14 @@ attr(,"match.length") attr(,"useBytes") [1] TRUE +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexpr.testRegExpr +#{ as.integer(regexpr("foo", c("bar foo foo", "foo"), fixed=F)) } +[1] 5 1 + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexpr.testRegExpr +#{ as.integer(regexpr("foo", c("bar foo foo", "foo"), fixed=T)) } +[1] 5 1 + ##com.oracle.truffle.r.test.builtins.TestBuiltin_regexpr.testRegExpr #{ regexpr("aaa", "bbbaaaccc", fixed=TRUE) } [1] 4 @@ -40178,6 +40212,14 @@ attr(,"useBytes") #{ x<-regexpr("aaa", c("bbbaaaccc", "hah"), fixed=TRUE); c(x[1], x[2]) } [1] 4 -1 +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexpr.testRegExpr +#{ x<-regexpr("foo", c("bar foo foo", "foo")); attr(x, "match.length") } +[1] 3 3 + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexpr.testRegExpr +#{ x<-regexpr("foo", c("bar foo foo", "foo"), fixed=T); attr(x, "match.length") } +[1] 3 3 + ##com.oracle.truffle.r.test.builtins.TestBuiltin_regexpr.testregexpr1 #argv <- list('package:', 'exNSS4', FALSE, FALSE, TRUE, FALSE); .Internal(regexpr(argv[[1]], argv[[2]], argv[[3]], argv[[4]], argv[[5]], argv[[6]])) [1] -1 diff --git a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_gregexpr.java b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_gregexpr.java index 32e9867ee52e84d3ea7a31f20410ae095cd518eb..b4e870f0c06039804a2331ef84ca312cf2109bc2 100644 --- a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_gregexpr.java +++ b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_gregexpr.java @@ -103,5 +103,10 @@ public class TestBuiltin_gregexpr extends TestBase { assertEval(Ignored.Unknown, "gregexpr(\"e\",c(\"arm\",\"foot\",\"lefroo\", \"bafoobar\"))"); // NOTE: this is without attributes assertEval(Ignored.Unknown, "gregexpr(\"(a)[^a]\\\\1\", c(\"andrea apart\", \"amadeus\", NA))"); + + assertEval("{ x<-gregexpr(\"foo\", c(\"bar foo foo\", \"foo\"), fixed=T); as.integer(c(x[[1]], x[[2]])) }"); + assertEval("{ x<-gregexpr(\"foo\", c(\"bar foo foo\", \"foo\"), fixed=F); as.integer(c(x[[1]], x[[2]])) }"); + assertEval("{ x<-gregexpr(\"foo\", c(\"bar foo foo\", \"foo\"), fixed=T); list(attr(x[[1]], \"match.length\"), attr(x[[2]], \"match.length\")) }"); + assertEval("{ x<-gregexpr(\"foo\", c(\"bar foo foo\", \"foo\"), fixed=F); list(attr(x[[1]], \"match.length\"), attr(x[[2]], \"match.length\")) }"); } } diff --git a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_regexpr.java b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_regexpr.java index d0af20a2ac660aa0f515e4c91db64514caa41f54..262c013c9f7371a5e5391860b99479975dbe3450 100644 --- a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_regexpr.java +++ b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_regexpr.java @@ -99,5 +99,10 @@ public class TestBuiltin_regexpr extends TestBase { assertEval("{ x<-regexpr(\"aaa\", c(\"bbbaaaccc\", \"hah\"), fixed=TRUE); c(x[1], x[2]) }"); assertEval("{ x <- \"methods.html\"; pos <- regexpr(\"\\\\.([[:alnum:]]+)$\", x); substring(x, pos + 1L) }"); + + assertEval("{ as.integer(regexpr(\"foo\", c(\"bar foo foo\", \"foo\"), fixed=T)) }"); + assertEval("{ as.integer(regexpr(\"foo\", c(\"bar foo foo\", \"foo\"), fixed=F)) }"); + assertEval("{ x<-regexpr(\"foo\", c(\"bar foo foo\", \"foo\"), fixed=T); attr(x, \"match.length\") }"); + assertEval("{ x<-regexpr(\"foo\", c(\"bar foo foo\", \"foo\")); attr(x, \"match.length\") }"); } }