diff --git a/com.oracle.truffle.r.native/fficall/src/truffle_common/pcre_rffi.c b/com.oracle.truffle.r.native/fficall/src/truffle_common/pcre_rffi.c index f70dca58caa5a535c9e99280f153e008bb04bde1..2f6e4e5b5ab9cc87180cc5056c8a66359eb3a880 100644 --- a/com.oracle.truffle.r.native/fficall/src/truffle_common/pcre_rffi.c +++ b/com.oracle.truffle.r.native/fficall/src/truffle_common/pcre_rffi.c @@ -74,7 +74,7 @@ int call_pcre_getcapturenames(void (*setcapturename)(int i, char *name), long co if (res < 0) { return res; } - // from GNU R's grep.c + // from GNU R's grep.c function do_regexpr() for(int i = 0; i < nameCount; i++) { char* entry = nameTable + nameEntrySize * i; int captureNum = (entry[0] << 8) + entry[1] - 1; diff --git a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java index 6b3b9ef8cec8136a97daf6675ac75278faad1014..2799b9c753295d5c511ebfbe90c87763da88e9bb 100644 --- a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java +++ b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java @@ -765,6 +765,7 @@ public class GrepFunctions { protected final int[] captureLength; protected final String[] captureNames; protected final boolean hasCapture; + protected final boolean hasCaptureResult; public Info(int index, int size, int[] captureStart, int[] captureLength, String[] captureNames) { this.index = index; @@ -773,6 +774,7 @@ public class GrepFunctions { this.captureLength = captureLength; this.captureNames = captureNames; this.hasCapture = captureStart != null && captureLength != null; + this.hasCaptureResult = captureNames != null && captureNames.length > 0; } } @@ -799,7 +801,7 @@ public class GrepFunctions { // TODO: useBytes normally depends on the value of the parameter and (if false) on // whether the string is ASCII boolean 
useBytes = true; - boolean hasAnyCapture = false; + boolean hasCaptureResult = false; int[] result = new int[vector.getLength()]; int[] matchLength = new int[vector.getLength()]; String[] captureNames = null; @@ -810,30 +812,36 @@ public class GrepFunctions { Arrays.fill(result, 1); } else { for (int i = 0; i < vector.getLength(); i++) { - Info res = getInfo(common, pattern, vector.getDataAt(i), ignoreCase, perl, fixed).get(0); + Info res = getInfo(common, pattern, vector.getDataAt(i), ignoreCase, perl, fixed, true).get(0); result[i] = res.index; matchLength[i] = res.size; if (res.hasCapture) { - hasAnyCapture = true; + hasCaptureResult = true; if (captureNames == null) { // first time we see captures captureNames = res.captureNames; + // length of res.captureNames gives the max amount of captures captureStart = new int[captureNames.length * vector.getLength()]; captureLength = new int[captureNames.length * vector.getLength()]; - // previous matches had no capture - fill in result with -1-s - for (int k = 0; k < i; k++) { - setNoCaptureValues(captureStart, captureLength, captureNames.length, vector.getLength(), k); - } } - assert captureNames.length == res.captureStart.length; - assert captureNames.length == res.captureLength.length; - for (int j = 0; j < captureNames.length; j++) { + assert captureNames.length == res.captureStart.length || captureNames.length - 1 == res.captureStart.length : captureNames.length + ", " + res.captureStart.length; + assert captureNames.length == res.captureLength.length || captureNames.length - 1 == res.captureLength.length : captureNames.length + ", " + res.captureLength.length; + for (int j = 0; j < res.captureStart.length; j++) { + // well, res.captureStart might be shorter than + // res.captureNames (but never by more than 1?), + // just ignore the remaining (zero) elements in captureStart captureStart[j * vector.getLength() + i] = res.captureStart[j]; captureLength[j * vector.getLength() + i] = res.captureLength[j]; } - } else
if (hasAnyCapture) { - // no capture for this part of the vector, but there are previous - // captures + } else if (res.hasCaptureResult) { + // no capture for this part of the vector, but even then + // we want to return a "no capture" result + hasCaptureResult = true; + captureNames = res.captureNames; + if (captureStart == null) { + captureStart = new int[captureNames.length * vector.getLength()]; + captureLength = new int[captureNames.length * vector.getLength()]; + } setNoCaptureValues(captureStart, captureLength, captureNames.length, vector.getLength(), i); } } @@ -843,7 +851,7 @@ public class GrepFunctions { if (useBytes) { setUseBytesAttrNode.execute(ret, RRuntime.LOGICAL_TRUE); } - if (hasAnyCapture) { + if (hasCaptureResult) { RStringVector captureNamesVec = RDataFactory.createStringVector(captureNames, RDataFactory.COMPLETE_VECTOR); RIntVector captureStartVec = RDataFactory.createIntVector(captureStart, RDataFactory.COMPLETE_VECTOR, new int[]{vector.getLength(), captureNames.length}); setDimNamesAttrNode.execute(captureStartVec, RDataFactory.createList(new Object[]{RNull.instance, captureNamesVec.copy()})); @@ -860,6 +868,10 @@ public class GrepFunctions { } protected List<Info> getInfo(CommonCodeNode common, String pattern, String text, boolean ignoreCase, boolean perl, boolean fixed) { + return getInfo(common, pattern, text, ignoreCase, perl, fixed, false); + } + + protected List<Info> getInfo(CommonCodeNode common, String pattern, String text, boolean ignoreCase, boolean perl, boolean fixed, boolean onlyFirst) { List<Info> list = new ArrayList<>(); if (fixed) { int index = 0; @@ -878,18 +890,25 @@ public class GrepFunctions { } else if (perl) { PCRERFFI.Result pcre = common.compilePerlPattern(pattern, ignoreCase); int maxCaptureCount = getCaptureCountNode.execute(pcre.result, 0); + if (maxCaptureCount < 0) { + throw error(Message.PCRE_FULLINFO_RETURNED, maxCaptureCount); + } + + String[] captureNames = getCaptureNamesNode.execute(pcre.result, 0, 
maxCaptureCount); + assert maxCaptureCount == captureNames.length; + for (int i = 0; i < captureNames.length; i++) { + if (captureNames[i] == null) { + captureNames[i] = ""; + } + } + int[] ovector = new int[(maxCaptureCount + 1) * 3]; int offset = 0; while (true) { int captureCount = execNode.execute(pcre.result, 0, text, offset, 0, ovector); if (captureCount >= 0) { - String[] captureNames = getCaptureNamesNode.execute(pcre.result, 0, maxCaptureCount); - for (int i = 0; i < captureNames.length; i++) { - if (captureNames[i] == null) { - captureNames[i] = ""; - } - } - assert captureCount - 1 == captureNames.length; + assert captureCount - 1 == captureNames.length || captureCount == captureNames.length : captureCount + ", " + captureNames.length; + int[] captureStart = null; int[] captureLength = null; if (captureCount > 1) { @@ -904,11 +923,18 @@ public class GrepFunctions { } // R starts counting at index 1 list.add(new Info(ovector[0] + 1, ovector[1] - ovector[0], captureStart, captureLength, captureNames)); + if (onlyFirst) { + break; + } offset = ovector[1]; } else { break; } } + if (list.isEmpty() && maxCaptureCount > 0) { + // at least a return array of empty string names is necessary for output + list.add(new Info(-1, -1, null, null, captureNames)); + } } else { Matcher m = getPatternMatcher(pattern, text, ignoreCase); while (m.find()) { diff --git a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java index 1253fd0cf72ca97ed5573ce9832a9b6545c36525..6238dd5937e4efcf9bef23976c01d12b20722e75 100644 --- a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java +++ b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java @@ -645,6 +645,7 @@ public final class RError extends RuntimeException implements TruffleException { FIRST_ARGUMENT_MUST_BE_CHARACTER("the first argument must be of mode character"), ALL_ATTRIBUTES_NAMES("all
attributes must have names [%d does not]"), INVALID_REGEXP("invalid regular expression '%s'"), + PCRE_FULLINFO_RETURNED("'pcre_fullinfo' returned '%s'"), INVALID_REGEXP_REASON("invalid regular expression '%s': %s"), COERCING_ARGUMENT("coercing argument of type '%s' to %s"), MUST_BE_TRUE_FALSE_ENVIRONMENT("'%s' must be TRUE, FALSE or an environment"), diff --git a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test index d1fe132529d3ad8f8f3dc5aa0036a2c8d7fcb8a2..663de5d632ac7d8f778b957f96ff41d8506483bd 100644 --- a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test +++ b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test @@ -50732,6 +50732,218 @@ attr(,"useBytes") [1] TRUE +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexec.testregexec1# +#regexpr("^((.*))$", 'A1', perl=T) +[1] 1 +attr(,"match.length") +[1] 2 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + +[1,] 1 1 +attr(,"capture.length") + +[1,] 2 2 +attr(,"capture.names") +[1] "" "" + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexec.testregexec1# +#regexpr("^(([A-Z)|([a-z]))$", 'Aa', perl=T) +[1] -1 +attr(,"match.length") +[1] -1 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + +[1,] -1 -1 +attr(,"capture.length") + +[1,] -1 -1 +attr(,"capture.names") +[1] "" "" + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexec.testregexec1# +#regexpr("^(([A-Z)|([a-z]))$", c('A', 'Aa'), perl=T) +[1] 1 -1 +attr(,"match.length") +[1] 1 -1 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + +[1,] 1 1 +[2,] -1 -1 +attr(,"capture.length") + +[1,] 1 1 +[2,] -1 -1 +attr(,"capture.names") +[1] "" "" + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexec.testregexec1# +#regexpr("^(([A-Z)|([a-z]))$", c('Aa', 'A'), perl=T) +[1] -1 1 +attr(,"match.length") +[1] -1 1 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + +[1,] -1 
-1 +[2,] 1 1 +attr(,"capture.length") + +[1,] -1 -1 +[2,] 1 1 +attr(,"capture.names") +[1] "" "" + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexec.testregexec1# +#regexpr("^(.*)$", 'A1', perl=T) +[1] 1 +attr(,"match.length") +[1] 2 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + +[1,] 1 +attr(,"capture.length") + +[1,] 2 +attr(,"capture.names") +[1] "" + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexec.testregexec1# +#regexpr("^(?:(?:^\\[([^\\]]+)\\])?(?:'?([^']+)'?!)?([a-zA-Z0-9:\\-$\\[\\]]+)|(.*))$", 'A1 A1', perl=T) +[1] 1 +attr(,"match.length") +[1] 5 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + +[1,] 0 0 0 1 +attr(,"capture.length") + +[1,] 0 0 0 5 +attr(,"capture.names") +[1] "" "" "" "" + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexec.testregexec1# +#regexpr("^(?:(?:^\\[([^\\]]+)\\])?(?:'?([^']+)'?!)?([a-zA-Z0-9:\\-$\\[\\]]+)|(.*))$", 'A1', perl=T) +[1] 1 +attr(,"match.length") +[1] 2 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + +[1,] 0 0 1 0 +attr(,"capture.length") + +[1,] 0 0 2 0 +attr(,"capture.names") +[1] "" "" "" "" + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexec.testregexec1# +#regexpr("^(?:(?:^\\[([^\\]]+)\\])?(?:'?([^']+)'?!)?([a-zA-Z0-9:\\-$\\[\\]]+)|(.*))$", 'A1A1', perl=T) +[1] 1 +attr(,"match.length") +[1] 4 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + +[1,] 0 0 1 0 +attr(,"capture.length") + +[1,] 0 0 4 0 +attr(,"capture.names") +[1] "" "" "" "" + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexec.testregexec1# +#regexpr("^(?:(?:^\\[([^\\]]+)\\])?(?<n2>:'?([^']+)'?!)?([a-zA-Z0-9:\\-$\\[\\]]+)|(.*))$", 'A1', perl=T) +[1] 1 +attr(,"match.length") +[1] 2 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + n2 +[1,] 0 0 0 1 0 +attr(,"capture.length") + n2 +[1,] 0 0 0 2 0 +attr(,"capture.names") +[1] "" "n2" "" "" "" + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexec.testregexec1# 
+#regexpr("^(?<n1>:(?:^\\[([^\\]]+)\\])?(?:'?([^']+)'?!)?([a-zA-Z0-9:\\-$\\[\\]]+)|(.*))$", 'A1', perl=T) +[1] 1 +attr(,"match.length") +[1] 2 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + n1 +[1,] 1 0 0 0 1 +attr(,"capture.length") + n1 +[1,] 2 0 0 0 2 +attr(,"capture.names") +[1] "n1" "" "" "" "" + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexec.testregexec1# +#regexpr("^(?<n1>:(?:^\\[([^\\]]+)\\])?(?<n2>:'?([^']+)'?!)?([a-zA-Z0-9:\\-$\\[\\]]+)|(.*))$", 'A1', perl=T) +[1] 1 +attr(,"match.length") +[1] 2 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + n1 n2 +[1,] 1 0 0 0 0 1 +attr(,"capture.length") + n1 n2 +[1,] 2 0 0 0 0 2 +attr(,"capture.names") +[1] "n1" "" "n2" "" "" "" + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexec.testregexec1# +#regexpr("^(?<n>(.*))$", 'A1', perl=T) +[1] 1 +attr(,"match.length") +[1] 2 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + n +[1,] 1 1 +attr(,"capture.length") + n +[1,] 2 2 +attr(,"capture.names") +[1] "n" "" + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexec.testregexec1# +#regexpr("^(?<n>.*)$", 'A1', perl=T) +[1] 1 +attr(,"match.length") +[1] 2 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + n +[1,] 1 +attr(,"capture.length") + n +[1,] 2 +attr(,"capture.names") +[1] "n" + ##com.oracle.truffle.r.test.builtins.TestBuiltin_regexpr.testRegExpr#Ignored.ImplementationError# #regexpr("(a)[^a]\\1", c("andrea apart", "amadeus", NA)) [1] 6 1 NA diff --git a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_regexec.java b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_regexec.java index a65117eb42687fd2a04b1fe23f7aa9c2c2f0f5e5..7867572091284915835d72acea2eaa1150f8b545 100644 --- a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_regexec.java +++ b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_regexec.java @@ -25,5 +25,20 @@ public class TestBuiltin_regexec extends 
TestBase { // docs do not mention that so ReferenceError for now. assertEval(Ignored.ReferenceError, "argv <- list('^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)', 'http://stat.umn.edu:80/xyz', FALSE, FALSE, FALSE); .Internal(regexec(argv[[1]], argv[[2]], argv[[3]], argv[[4]], argv[[5]]))"); + + assertEval("regexpr(\"^(?:(?:^\\\\[([^\\\\]]+)\\\\])?(?:'?([^']+)'?!)?([a-zA-Z0-9:\\\\-$\\\\[\\\\]]+)|(.*))$\", 'A1', perl=T)"); + assertEval("regexpr(\"^(?:(?:^\\\\[([^\\\\]]+)\\\\])?(?:'?([^']+)'?!)?([a-zA-Z0-9:\\\\-$\\\\[\\\\]]+)|(.*))$\", 'A1A1', perl=T)"); + assertEval("regexpr(\"^(?:(?:^\\\\[([^\\\\]]+)\\\\])?(?:'?([^']+)'?!)?([a-zA-Z0-9:\\\\-$\\\\[\\\\]]+)|(.*))$\", 'A1 A1', perl=T)"); + assertEval("regexpr(\"^(?<n1>:(?:^\\\\[([^\\\\]]+)\\\\])?(?:'?([^']+)'?!)?([a-zA-Z0-9:\\\\-$\\\\[\\\\]]+)|(.*))$\", 'A1', perl=T)"); + assertEval("regexpr(\"^(?<n1>:(?:^\\\\[([^\\\\]]+)\\\\])?(?<n2>:'?([^']+)'?!)?([a-zA-Z0-9:\\\\-$\\\\[\\\\]]+)|(.*))$\", 'A1', perl=T)"); + assertEval("regexpr(\"^(?:(?:^\\\\[([^\\\\]]+)\\\\])?(?<n2>:'?([^']+)'?!)?([a-zA-Z0-9:\\\\-$\\\\[\\\\]]+)|(.*))$\", 'A1', perl=T)"); + assertEval("regexpr(\"^((.*))$\", 'A1', perl=T)"); + assertEval("regexpr(\"^(?<n>(.*))$\", 'A1', perl=T)"); + assertEval("regexpr(\"^(.*)$\", 'A1', perl=T)"); + assertEval("regexpr(\"^(?<n>.*)$\", 'A1', perl=T)"); + + assertEval("regexpr(\"^(([A-Z)|([a-z]))$\", 'Aa', perl=T)"); + assertEval("regexpr(\"^(([A-Z)|([a-z]))$\", c('A', 'Aa'), perl=T)"); + assertEval("regexpr(\"^(([A-Z)|([a-z]))$\", c('Aa', 'A'), perl=T)"); } }