diff --git a/com.oracle.truffle.r.native/fficall/src/jni/pcre_rffi.c b/com.oracle.truffle.r.native/fficall/src/jni/pcre_rffi.c index 40819418b6236b84066ddb157c302cae90fdcfe6..5b4f1bbbec39914f2a96a870e3f5d39824b03e23 100644 --- a/com.oracle.truffle.r.native/fficall/src/jni/pcre_rffi.c +++ b/com.oracle.truffle.r.native/fficall/src/jni/pcre_rffi.c @@ -1,31 +1,27 @@ /* - * Copyright (c) 2016, 2016, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * This material is distributed under the GNU General Public License + * Version 2. You may review the terms of this license at + * http://www.gnu.org/licenses/gpl-2.0.html * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. + * Copyright (c) 1995, 1996 Robert Gentleman and Ross Ihaka + * Copyright (c) 1997-2015, The R Core Team + * Copyright (c) 2016, Oracle and/or its affiliates * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. + * All rights reserved. 
*/
 
 #include <rffiutils.h>
 
+#define PCRE_INFO_CAPTURECOUNT 2
+#define PCRE_INFO_NAMEENTRYSIZE 7
+#define PCRE_INFO_NAMECOUNT 8
+#define PCRE_INFO_NAMETABLE 9
+
 char *pcre_maketables();
-void *pcre_compile(char * pattern, int options, char ** errorMessage, int *errOffset, char * tables);
-int pcre_exec(void * code, void *extra, char* subject, int subjectLength, int startOffset, int options, int *ovector, int ovecSize);
+void *pcre_compile(char *pattern, int options, char **errorMessage, int *errOffset, char *tables);
+int pcre_exec(void *code, void *extra, char* subject, int subjectLength, int startOffset, int options, int *ovector, int ovecSize);
+int pcre_fullinfo(void *code, void *extra, int what, void *where);
+void pcre_free(void *code);
 
 jclass JNI_PCRE_ResultClass;
 jmethodID ResultClassConstructorID;
@@ -55,8 +51,69 @@ Java_com_oracle_truffle_r_runtime_ffi_jni_JNI_1PCRE_nativeCompile(JNIEnv *env, j
 }
 
+/*
+ * Returns the number of capturing groups of the compiled pattern 'code', or the
+ * negative error code of pcre_fullinfo if the query fails.
+ */
 JNIEXPORT jint JNICALL
-Java_com_oracle_truffle_r_runtime_ffi_jni_JNI_1PCRE_nativeExec(JNIEnv *env, jclass c,jlong code, jlong extra, jstring subject,
-		jint startOffset, jint options, jintArray ovector, jint ovectorLen) {
+Java_com_oracle_truffle_r_runtime_ffi_jni_JNI_1PCRE_nativeGetCaptureCount(JNIEnv *env, jclass c, jlong code, jlong extra) {
+    // 'code' and 'extra' carry native pointers in a jlong; C has no implicit
+    // integer-to-pointer conversion, so cast explicitly
+    int captureCount = 0;
+    int rc = pcre_fullinfo((void *) code, (void *) extra, PCRE_INFO_CAPTURECOUNT, &captureCount);
+    return rc < 0 ? rc : captureCount;
+}
+
+/*
+ * Stores the name of each named capturing group of 'code' in 'ret' at index
+ * (group number - 1); elements belonging to unnamed groups are not written.
+ * Returns the negative error code of a failing pcre_fullinfo call, otherwise the
+ * result of the last pcre_fullinfo call.
+ */
+JNIEXPORT jint JNICALL
+Java_com_oracle_truffle_r_runtime_ffi_jni_JNI_1PCRE_nativeGetCaptureNames(JNIEnv *env, jclass c, jlong code, jlong extra, jobjectArray ret) {
+    void *pcreCode = (void *) code;
+    void *pcreExtra = (void *) extra;
+    int nameCount = 0;
+    int nameEntrySize = 0;
+    // each entry starts with the group number in two bytes, most significant first;
+    // read them as unsigned so that numbers >= 128 are not sign-extended
+    unsigned char *nameTable = NULL;
+    int res = pcre_fullinfo(pcreCode, pcreExtra, PCRE_INFO_NAMECOUNT, &nameCount);
+    if (res < 0) {
+        return res;
+    }
+    res = pcre_fullinfo(pcreCode, pcreExtra, PCRE_INFO_NAMEENTRYSIZE, &nameEntrySize);
+    if (res < 0) {
+        return res;
+    }
+    res = pcre_fullinfo(pcreCode, pcreExtra, PCRE_INFO_NAMETABLE, &nameTable);
+    if (res < 0) {
+        return res;
+    }
+    jsize retLen = (*env)->GetArrayLength(env, ret);
+    // from GNU R's grep.c
+    for (int i = 0; i < nameCount; i++) {
+        unsigned char *entry = nameTable + nameEntrySize * i;
+        int captureNum = ((entry[0] << 8) | entry[1]) - 1;
+        if (captureNum < 0 || captureNum >= retLen) {
+            // never write outside the array supplied by the caller
+            continue;
+        }
+        jstring name = (*env)->NewStringUTF(env, (const char *) (entry + 2));
+        if (name == NULL) {
+            // an exception is pending and is raised once control returns to Java
+            break;
+        }
+        (*env)->SetObjectArrayElement(env, ret, captureNum, name);
+        // drop the local reference right away; a pattern can have many named groups
+        (*env)->DeleteLocalRef(env, name);
+    }
+    return res;
+}
+
+JNIEXPORT jint JNICALL
+Java_com_oracle_truffle_r_runtime_ffi_jni_JNI_1PCRE_nativeExec(JNIEnv *env, jclass c, jlong code, jlong extra, jstring subject,
+		jint startOffset, jint options, jintArray ovector, jint ovectorLen) {
 	const char *subjectChars = (*env)->GetStringUTFChars(env, subject, NULL);
 	int subjectLength = (*env)->GetStringUTFLength(env, subject);
 	int* ovectorElems = (*env)->GetIntArrayElements(env, ovector, NULL);
diff --git a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java
index fd868f0d0d96cfaf558de82a801429364cef12bc..2362b5f6c2375305fac5938c285f58487300f1fc 100644
--- a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java
+++ b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/GrepFunctions.java
@@ -16,6 +16,7 @@ import static com.oracle.truffle.r.runtime.builtins.RBehavior.PURE;
 import static com.oracle.truffle.r.runtime.builtins.RBuiltinKind.INTERNAL;
 
 import java.util.ArrayList;
+import
java.util.Arrays; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -36,6 +37,7 @@ import com.oracle.truffle.r.runtime.data.RAttributeProfiles; import com.oracle.truffle.r.runtime.data.RDataFactory; import com.oracle.truffle.r.runtime.data.RIntVector; import com.oracle.truffle.r.runtime.data.RList; +import com.oracle.truffle.r.runtime.data.RNull; import com.oracle.truffle.r.runtime.data.RStringVector; import com.oracle.truffle.r.runtime.data.model.RAbstractDoubleVector; import com.oracle.truffle.r.runtime.data.model.RAbstractIntVector; @@ -200,6 +202,18 @@ public class GrepFunctions { } return RDataFactory.createIntVector(naData, RDataFactory.INCOMPLETE_VECTOR); } + + protected PCRERFFI.Result compilePerlPattern(String pattern, boolean ignoreCase) { + int cflags = ignoreCase ? PCRERFFI.CASELESS : 0; + long tables = RFFIFactory.getRFFI().getPCRERFFI().maketables(); + PCRERFFI.Result pcre = RFFIFactory.getRFFI().getPCRERFFI().compile(pattern, cflags, tables); + if (pcre.result == 0) { + // TODO output warning if pcre.errorMessage not NULL + throw RError.error(this, RError.Message.INVALID_REGEXP, pattern); + } + return pcre; + } + } private abstract static class GrepAdapter extends CommonCodeAdapter { @@ -226,13 +240,7 @@ public class GrepFunctions { } findAllMatches(matches, pattern, vector, fixed, ignoreCase); } else { - int cflags = ignoreCase ? PCRERFFI.CASELESS : 0; - long tables = RFFIFactory.getRFFI().getPCRERFFI().maketables(); - PCRERFFI.Result pcre = RFFIFactory.getRFFI().getPCRERFFI().compile(pattern, cflags, tables); - if (pcre.result == 0) { - // TODO output warning if pcre.errorMessage not NULL - throw RError.error(this, RError.Message.INVALID_REGEXP, pattern); - } + PCRERFFI.Result pcre = compilePerlPattern(pattern, ignoreCase); // TODO pcre_study for vectors > 10 ? 
(cf GnuR) int[] ovector = new int[30]; for (int i = 0; i < len; i++) { @@ -398,14 +406,7 @@ public class GrepFunctions { if (fixed) { // TODO case } else if (perl) { - int cflags = ignoreCase ? PCRERFFI.CASELESS : 0; - long tables = RFFIFactory.getRFFI().getPCRERFFI().maketables(); - pcre = RFFIFactory.getRFFI().getPCRERFFI().compile(pattern, cflags, tables); - if (pcre.result == 0) { - // TODO output warning if pcre.errorMessage not NULL - throw RError.error(this, RError.Message.INVALID_REGEXP, pattern); - } - // TODO pcre_study for vectors > 10 ? (cf GnuR) + pcre = compilePerlPattern(pattern, ignoreCase); } else { pattern = RegExp.checkPreDefinedClasses(pattern); } @@ -672,37 +673,106 @@ public class GrepFunctions { castUseBytes(casts); } - protected static final class IndexAndSize { - protected int index; - protected int size; + protected static final class Info { + protected final int index; + protected final int size; + protected final int[] captureStart; + protected final int[] captureLength; + protected final String[] captureNames; + protected final boolean hasCapture; - public IndexAndSize(int index, int size) { + public Info(int index, int size, int[] captureStart, int[] captureLength, String[] captureNames) { this.index = index; this.size = size; + this.captureStart = captureStart; + this.captureLength = captureLength; + this.captureNames = captureNames; + this.hasCapture = captureStart != null && captureLength != null; + } + } + + private static void setNoCaptureValues(int[] captureStart, int[] captureLength, int namesLen, int vecLen, int index) { + for (int j = 0; j < namesLen; j++) { + captureStart[j * vecLen + index] = -1; + captureLength[j * vecLen + index] = -1; } } @Specialization @TruffleBoundary protected Object regexp(RAbstractStringVector patternArg, RAbstractStringVector vector, byte ignoreCaseL, byte perlL, byte fixedL, byte useBytesL) { - checkExtraArgs(RRuntime.LOGICAL_FALSE, perlL, RRuntime.LOGICAL_FALSE, useBytesL, 
RRuntime.LOGICAL_FALSE); + checkExtraArgs(RRuntime.LOGICAL_FALSE, RRuntime.LOGICAL_FALSE, RRuntime.LOGICAL_FALSE, useBytesL, RRuntime.LOGICAL_FALSE); boolean ignoreCase = RRuntime.fromLogical(ignoreCaseL); - String pattern = RegExp.checkPreDefinedClasses(patternArg.getDataAt(0)); + boolean fixed = RRuntime.fromLogical(fixedL); + boolean perl = RRuntime.fromLogical(perlL); + if (patternArg.getLength() > 1) { + throw RInternalError.unimplemented("multi-element patterns in regexpr not implemented yet"); + } + String pattern = patternArg.getDataAt(0); + if (!perl) { + pattern = RegExp.checkPreDefinedClasses(pattern); + } + // TODO: useBytes normally depends on the value of the parameter and (if false) on + // whether the string is ASCII + boolean useBytes = true; + boolean hasAnyCapture = false; int[] result = new int[vector.getLength()]; int[] matchLength = new int[vector.getLength()]; - for (int i = 0; i < vector.getLength(); i++) { - IndexAndSize res = findIndexAndSize(pattern, vector.getDataAt(i), ignoreCase, fixedL == RRuntime.LOGICAL_TRUE).get(0); - result[i] = res.index; - matchLength[i] = res.size; + String[] captureNames = null; + int[] captureStart = null; + int[] captureLength = null; + if (pattern.length() == 0) { + // emtpy pattern + Arrays.fill(result, 1); + } else { + for (int i = 0; i < vector.getLength(); i++) { + Info res = getInfo(pattern, vector.getDataAt(i), ignoreCase, perl, fixed).get(0); + result[i] = res.index; + matchLength[i] = res.size; + if (res.hasCapture) { + hasAnyCapture = true; + if (captureNames == null) { + // first time we see captures + captureNames = res.captureNames; + captureStart = new int[captureNames.length * vector.getLength()]; + captureLength = new int[captureNames.length * vector.getLength()]; + // previous matches had no capture - fill in result with -1-s + for (int k = 0; k < i; k++) { + setNoCaptureValues(captureStart, captureLength, captureNames.length, vector.getLength(), k); + } + } + assert captureNames.length == 
res.captureStart.length; + assert captureNames.length == res.captureLength.length; + for (int j = 0; j < captureNames.length; j++) { + captureStart[j * vector.getLength() + i] = res.captureStart[j]; + captureLength[j * vector.getLength() + i] = res.captureLength[j]; + } + } else if (hasAnyCapture) { + // no capture for this part of the vector, but there are previous captures + setNoCaptureValues(captureStart, captureLength, captureNames.length, vector.getLength(), i); + } + } } - // TODO useBytes attribute as per spec RIntVector ret = RDataFactory.createIntVector(result, RDataFactory.COMPLETE_VECTOR); ret.setAttr("match.length", RDataFactory.createIntVector(matchLength, RDataFactory.COMPLETE_VECTOR)); + if (useBytes) { + ret.setAttr("useBytes", RRuntime.LOGICAL_TRUE); + } + if (hasAnyCapture) { + RStringVector captureNamesVec = RDataFactory.createStringVector(captureNames, RDataFactory.COMPLETE_VECTOR); + RIntVector captureStartVec = RDataFactory.createIntVector(captureStart, RDataFactory.COMPLETE_VECTOR, new int[]{vector.getLength(), captureNames.length}); + captureStartVec.setAttr(RRuntime.DIMNAMES_ATTR_KEY, RDataFactory.createList(new Object[]{RNull.instance, captureNamesVec.copy()})); + ret.setAttr("capture.start", captureStartVec); + RIntVector captureLengthVec = RDataFactory.createIntVector(captureLength, RDataFactory.COMPLETE_VECTOR, new int[]{vector.getLength(), captureNames.length}); + captureLengthVec.setAttr(RRuntime.DIMNAMES_ATTR_KEY, RDataFactory.createList(new Object[]{RNull.instance, captureNamesVec.copy()})); + ret.setAttr("capture.length", captureLengthVec); + ret.setAttr("capture.names", captureNamesVec); + } return ret; } - protected static List<IndexAndSize> findIndexAndSize(String pattern, String text, boolean ignoreCase, boolean fixed) { - List<IndexAndSize> list = new ArrayList<>(); + protected List<Info> getInfo(String pattern, String text, boolean ignoreCase, boolean perl, boolean fixed) { + List<Info> list = new ArrayList<>(); if (fixed) { 
int index = 0;
                 while (true) {
@@ -714,20 +784,63 @@ public class GrepFunctions {
                     if (index == -1) {
                         break;
                     }
-                    list.add(new IndexAndSize(index + 1, pattern.length()));
+                    list.add(new Info(index + 1, pattern.length(), null, null, null));
                     index += pattern.length();
                 }
+            } else if (perl) {
+                PCRERFFI pcreRFFI = RFFIFactory.getRFFI().getPCRERFFI();
+                PCRERFFI.Result pcre = compilePerlPattern(pattern, ignoreCase);
+                int maxCaptureCount = pcreRFFI.getCaptureCount(pcre.result, 0);
+                // the names depend on the pattern only, so they are fetched once; entries of
+                // unnamed groups may come back as null and are reported as ""
+                String[] captureNames = pcreRFFI.getCaptureNames(pcre.result, 0, maxCaptureCount);
+                for (int i = 0; i < captureNames.length; i++) {
+                    if (captureNames[i] == null) {
+                        captureNames[i] = "";
+                    }
+                }
+                int[] ovector = new int[(maxCaptureCount + 1) * 3];
+                int offset = 0;
+                while (true) {
+                    int captureCount = pcreRFFI.exec(pcre.result, 0, text, offset, 0, ovector);
+                    if (captureCount < 0) {
+                        // no further match (or another negative PCRE result code)
+                        break;
+                    }
+                    int[] captureStart = null;
+                    int[] captureLength = null;
+                    if (maxCaptureCount > 0) {
+                        // exec only counts groups up to the highest one that took part in the
+                        // match, so the arrays are sized by the pattern to stay aligned with
+                        // captureNames; groups that did not take part are reported as -1
+                        captureStart = new int[maxCaptureCount];
+                        captureLength = new int[maxCaptureCount];
+                        for (int group = 1; group <= maxCaptureCount; group++) {
+                            boolean matched = group < captureCount && ovector[2 * group] >= 0;
+                            // R starts counting at index 1
+                            captureStart[group - 1] = matched ? ovector[2 * group] + 1 : -1;
+                            captureLength[group - 1] = matched ? ovector[2 * group + 1] - ovector[2 * group] : -1;
+                        }
+                    }
+                    // R starts counting at index 1
+                    list.add(new Info(ovector[0] + 1, ovector[1] - ovector[0], captureStart, captureLength, captureNames));
+                    // an empty match consumes no input: step over it, otherwise the next exec
+                    // call would return the very same match forever
+                    // NOTE(review): an empty match at the very end of the subject is still
+                    // reported; confirm against GNU R whether the loop should stop there
+                    offset = ovector[1] > ovector[0] ? ovector[1] : ovector[1] + 1;
+                }
             } else {
                 Matcher m = getPatternMatcher(pattern, text, ignoreCase);
                 while (m.find()) {
                     // R starts counting at index 1
-                    list.add(new IndexAndSize(m.start() + 1, m.end() - m.start()));
+                    list.add(new Info(m.start() + 1, m.end() - m.start(), null, null, null));
                 }
             }
             if (list.size() > 0) {
                 return list;
             }
-            list.add(new IndexAndSize(-1, -1));
+            list.add(new Info(-1, -1, null, null, null));
             return list;
         }
 
@@ -750,35 +863,131 @@ public class GrepFunctions {
             castUseBytes(casts);
         }
 
+        private static void setNoCaptureAttributes(RIntVector vec, RStringVector captureNames) {
+            int len = captureNames.getLength();
+            int[] captureStartData =
new int[len]; + int[] captureLengthData = new int[len]; + Arrays.fill(captureStartData, -1); + Arrays.fill(captureLengthData, -1); + RIntVector captureStart = RDataFactory.createIntVector(captureStartData, RDataFactory.COMPLETE_VECTOR, new int[]{1, captureNames.getLength()}); + captureStart.setAttr(RRuntime.DIMNAMES_ATTR_KEY, RDataFactory.createList(new Object[]{RNull.instance, captureNames.copy()})); + RIntVector captureLength = RDataFactory.createIntVector(captureLengthData, RDataFactory.COMPLETE_VECTOR, new int[]{1, captureNames.getLength()}); + captureLength.setAttr(RRuntime.DIMNAMES_ATTR_KEY, RDataFactory.createList(new Object[]{RNull.instance, captureNames.copy()})); + vec.setAttr("capture.start", captureStart); + vec.setAttr("capture.length", captureLength); + vec.setAttr("capture.names", captureNames); + } + @Specialization @TruffleBoundary @Override protected Object regexp(RAbstractStringVector patternArg, RAbstractStringVector vector, byte ignoreCaseL, byte perlL, byte fixedL, byte useBytesL) { - checkExtraArgs(RRuntime.LOGICAL_FALSE, perlL, RRuntime.LOGICAL_FALSE, useBytesL, RRuntime.LOGICAL_FALSE); + checkExtraArgs(RRuntime.LOGICAL_FALSE, RRuntime.LOGICAL_FALSE, RRuntime.LOGICAL_FALSE, useBytesL, RRuntime.LOGICAL_FALSE); boolean ignoreCase = RRuntime.fromLogical(ignoreCaseL); - String pattern = RegExp.checkPreDefinedClasses(patternArg.getDataAt(0)); boolean fixed = RRuntime.fromLogical(fixedL); + boolean perl = RRuntime.fromLogical(perlL); + if (patternArg.getLength() > 1) { + throw RInternalError.unimplemented("multi-element patterns in gregexpr not implemented yet"); + } + String pattern = patternArg.getDataAt(0); + if (!perl) { + pattern = RegExp.checkPreDefinedClasses(pattern); + } + // TODO: useBytes normally depends on the value of the parameter and (if false) on + // whether the string is ASCII + boolean useBytes = true; Object[] result = new Object[vector.getLength()]; + boolean hasAnyCapture = false; + RStringVector captureNames = null; for 
(int i = 0; i < vector.getLength(); i++) { - List<IndexAndSize> l = findIndexAndSize(pattern, vector.getDataAt(i), ignoreCase, fixed); - int[] indexes = toIndexOrSizeArray(l, true); - int[] sizes = toIndexOrSizeArray(l, false); - RIntVector res = RDataFactory.createIntVector(indexes, RDataFactory.COMPLETE_VECTOR); - res.setAttr("match.length", RDataFactory.createIntVector(sizes, RDataFactory.COMPLETE_VECTOR)); + RIntVector res; + if (pattern.length() == 0) { + String txt = vector.getDataAt(i); + res = RDataFactory.createIntVector(txt.length()); + for (int j = 0; j < txt.length(); j++) { + res.setDataAt(res.getDataWithoutCopying(), j, j + 1); + } + res.setAttr("match.length", RDataFactory.createIntVector(txt.length())); + if (useBytes) { + res.setAttr("useBytes", RRuntime.LOGICAL_TRUE); + } + } else { + List<Info> l = getInfo(pattern, vector.getDataAt(i), ignoreCase, perl, fixed); + res = toIndexOrSizeVector(l, true); + res.setAttr("match.length", toIndexOrSizeVector(l, false)); + if (useBytes) { + res.setAttr("useBytes", RRuntime.LOGICAL_TRUE); + } + RIntVector captureStart = toCaptureStartOrLength(l, true); + if (captureStart != null) { + RIntVector captureLength = toCaptureStartOrLength(l, false); + assert captureLength != null; + captureNames = getCaptureNamesVector(l); + assert captureNames.getLength() > 0; + if (!hasAnyCapture) { + // set previous result list elements to "no capture" + for (int j = 0; j < i; j++) { + setNoCaptureAttributes((RIntVector) result[j], captureNames); + } + } + hasAnyCapture = true; + res.setAttr("capture.start", captureStart); + res.setAttr("capture.length", captureLength); + res.setAttr("capture.names", captureNames); + } else if (hasAnyCapture) { + assert captureNames != null; + // it's capture names from previous iteration, so copy + setNoCaptureAttributes(res, (RStringVector) captureNames.copy()); + } + } + result[i] = res; - // TODO useBytes attributes as per spec } return RDataFactory.createList(result); } - private static 
int[] toIndexOrSizeArray(List<IndexAndSize> list, boolean index) { + private static RIntVector toIndexOrSizeVector(List<Info> list, boolean index) { int[] arr = new int[list.size()]; for (int i = 0; i < list.size(); i++) { - IndexAndSize res = list.get(i); + Info res = list.get(i); arr[i] = index ? res.index : res.size; } - return arr; + return RDataFactory.createIntVector(arr, RDataFactory.COMPLETE_VECTOR); } + + private static RIntVector toCaptureStartOrLength(List<Info> list, boolean start) { + assert list.size() > 0; + Info firstInfo = list.get(0); + if (!firstInfo.hasCapture) { + return null; + } + assert firstInfo.captureNames.length > 0; + int[] arr = new int[list.size() * firstInfo.captureNames.length]; + int ind = 0; + for (int i = 0; i < firstInfo.captureNames.length; i++) { + for (int j = 0; j < list.size(); j++) { + Info info = list.get(j); + assert info.captureNames.length == firstInfo.captureNames.length; + assert info.captureStart.length == firstInfo.captureStart.length; + assert info.captureLength.length == firstInfo.captureLength.length; + arr[ind++] = start ? 
info.captureStart[i] : info.captureLength[i]; + } + } + RIntVector ret = RDataFactory.createIntVector(arr, RDataFactory.COMPLETE_VECTOR, new int[]{list.size(), firstInfo.captureNames.length}); + ret.setAttr(RRuntime.DIMNAMES_ATTR_KEY, RDataFactory.createList(new Object[]{RNull.instance, RDataFactory.createStringVector(firstInfo.captureNames, RDataFactory.COMPLETE_VECTOR)})); + return ret; + } + + private static RStringVector getCaptureNamesVector(List<Info> list) { + assert list.size() > 0; + Info firstInfo = list.get(0); + if (!firstInfo.hasCapture) { + return null; + } + assert firstInfo.captureNames.length > 0; + return RDataFactory.createStringVector(firstInfo.captureNames, RDataFactory.COMPLETE_VECTOR); + } + } @RBuiltin(name = "agrep", kind = INTERNAL, parameterNames = {"pattern", "x", "ignore.case", "value", "costs", "bounds", "useBytes", "fixed"}, behavior = PURE) diff --git a/com.oracle.truffle.r.runtime.ffi/src/com/oracle/truffle/r/runtime/ffi/jni/JNI_PCRE.java b/com.oracle.truffle.r.runtime.ffi/src/com/oracle/truffle/r/runtime/ffi/jni/JNI_PCRE.java index 7a73c6d80b18230d776b2beee7da57ad770a8862..dda34a3c1896e369eb8799ffdd981cb4129d8688 100644 --- a/com.oracle.truffle.r.runtime.ffi/src/com/oracle/truffle/r/runtime/ffi/jni/JNI_PCRE.java +++ b/com.oracle.truffle.r.runtime.ffi/src/com/oracle/truffle/r/runtime/ffi/jni/JNI_PCRE.java @@ -22,6 +22,8 @@ */ package com.oracle.truffle.r.runtime.ffi.jni; +import com.oracle.truffle.api.CompilerDirectives; +import com.oracle.truffle.r.runtime.RError; import com.oracle.truffle.r.runtime.RInternalError; import com.oracle.truffle.r.runtime.ffi.PCRERFFI; @@ -36,6 +38,27 @@ public class JNI_PCRE implements PCRERFFI { return nativeCompile(pattern, options, tables); } + @Override + public int getCaptureCount(long code, long extra) { + int res = nativeGetCaptureCount(code, extra); + if (res < 0) { + CompilerDirectives.transferToInterpreter(); + throw RError.error(RError.NO_CALLER, RError.Message.WRONG_PCRE_INFO, res); + } + 
return res; + } + + @Override + public String[] getCaptureNames(long code, long extra, int captureCount) { + String[] ret = new String[captureCount]; + int res = nativeGetCaptureNames(code, extra, ret); + if (res < 0) { + CompilerDirectives.transferToInterpreter(); + throw RError.error(RError.NO_CALLER, RError.Message.WRONG_PCRE_INFO, res); + } + return ret; + } + @Override public Result study(long code, int options) { throw RInternalError.unimplemented("pcre_study"); @@ -50,6 +73,10 @@ public class JNI_PCRE implements PCRERFFI { private static native Result nativeCompile(String pattern, int options, long tables); + private static native int nativeGetCaptureCount(long code, long extra); + + private static native int nativeGetCaptureNames(long code, long extra, String[] res); + private static native int nativeExec(long code, long extra, String subject, int offset, int options, int[] ovector, int ovectorLen); diff --git a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java index b40ea24d42fed02822279ed54fc5ea7a56390faf..c3b04de6e43e184db9ddd96f26c8d25565e53b29 100644 --- a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java +++ b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java @@ -743,7 +743,8 @@ public final class RError extends RuntimeException { INVALID_FILE_EXT("invalid file extension"), NO("no '%s'"), APPLIES_TO_VECTORS("%s applies only to vectors"), - GAP_MUST_BE_NON_NEGATIVE("'gap' must be non-negative integer"); + GAP_MUST_BE_NON_NEGATIVE("'gap' must be non-negative integer"), + WRONG_PCRE_INFO("'pcre_fullinfo' returned '%d' "); public final String message; final boolean hasArgs; diff --git a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/ffi/PCRERFFI.java b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/ffi/PCRERFFI.java index 
99ac9a7ce70492d42ffa1180aa1a603ed450985b..1825446c61a5d71aa069c44c0460febb2b902922 100644 --- a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/ffi/PCRERFFI.java +++ b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/ffi/PCRERFFI.java @@ -51,6 +51,10 @@ public interface PCRERFFI { Result compile(String pattern, int options, long tables); + int getCaptureCount(long code, long extra); + + String[] getCaptureNames(long code, long extra, int captureCount); + Result study(long code, int options); int exec(long code, long extra, String subject, int offset, int options, int[] ovector); diff --git a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test index f2d2e23fb007f43449bc76abe0091e3837d0ef14..1b27c6c499dd36e07d7087cc8bd4a612d98ff0d5 100644 --- a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test +++ b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test @@ -20870,6 +20870,221 @@ Error: invalid 'pattern' argument #{ .Internal(gregexpr(character(), "42", F, F, F, F)) } Error: invalid 'pattern' argument +##com.oracle.truffle.r.test.builtins.TestBuiltin_gregexpr.testRegExpr +#{ argv <- structure(list(pattern = '', text = c('abc', 'defg'), perl = TRUE), .Names = c('pattern', 'text', 'perl'));do.call('gregexpr', argv) } +[[1]] +[1] 1 2 3 +attr(,"match.length") +[1] 0 0 0 +attr(,"useBytes") +[1] TRUE + +[[2]] +[1] 1 2 3 4 +attr(,"match.length") +[1] 0 0 0 0 +attr(,"useBytes") +[1] TRUE + + +##com.oracle.truffle.r.test.builtins.TestBuiltin_gregexpr.testRegExpr +#{ x<-c("Aaa Bbb Aaa bbb", "Aaa Bbb Aaa Bbb", "Aaa bbb Aaa bbb"); p<-"(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)"; gregexpr(p, x, perl=TRUE) } +[[1]] +[1] 1 +attr(,"match.length") +[1] 7 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + first last +[1,] 1 5 +attr(,"capture.length") + first last +[1,] 
3 3 +attr(,"capture.names") +[1] "first" "last" + +[[2]] +[1] 1 9 +attr(,"match.length") +[1] 7 7 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + first last +[1,] 1 5 +[2,] 9 13 +attr(,"capture.length") + first last +[1,] 3 3 +[2,] 3 3 +attr(,"capture.names") +[1] "first" "last" + +[[3]] +[1] -1 +attr(,"match.length") +[1] -1 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + first last +[1,] -1 -1 +attr(,"capture.length") + first last +[1,] -1 -1 +attr(,"capture.names") +[1] "first" "last" + + +##com.oracle.truffle.r.test.builtins.TestBuiltin_gregexpr.testRegExpr +#{ x<-c("Aaa bbb Aaa Bbb", "Aaa bbb Aaa bbb", "Aaa bbb Aaa Bbb"); p<-"(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)"; gregexpr(p, x, perl=TRUE) } +[[1]] +[1] 9 +attr(,"match.length") +[1] 7 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + first last +[1,] 9 13 +attr(,"capture.length") + first last +[1,] 3 3 +attr(,"capture.names") +[1] "first" "last" + +[[2]] +[1] -1 +attr(,"match.length") +[1] -1 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + first last +[1,] -1 -1 +attr(,"capture.length") + first last +[1,] -1 -1 +attr(,"capture.names") +[1] "first" "last" + +[[3]] +[1] 9 +attr(,"match.length") +[1] 7 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + first last +[1,] 9 13 +attr(,"capture.length") + first last +[1,] 3 3 +attr(,"capture.names") +[1] "first" "last" + + +##com.oracle.truffle.r.test.builtins.TestBuiltin_gregexpr.testRegExpr +#{ x<-c("Aaa bbb Aaa bbb", "Aaa Bbb Aaa Bbb", "Aaa Bbb Aaa bbb"); p<-"(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)"; gregexpr(p, x, perl=TRUE) } +[[1]] +[1] -1 +attr(,"match.length") +[1] -1 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + first last +[1,] -1 -1 +attr(,"capture.length") + first last +[1,] -1 -1 +attr(,"capture.names") +[1] "first" "last" + +[[2]] +[1] 1 9 +attr(,"match.length") +[1] 7 7 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + first last +[1,] 1 5 +[2,] 9 13 
+attr(,"capture.length") + first last +[1,] 3 3 +[2,] 3 3 +attr(,"capture.names") +[1] "first" "last" + +[[3]] +[1] 1 +attr(,"match.length") +[1] 7 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + first last +[1,] 1 5 +attr(,"capture.length") + first last +[1,] 3 3 +attr(,"capture.names") +[1] "first" "last" + + +##com.oracle.truffle.r.test.builtins.TestBuiltin_gregexpr.testRegExpr +#{ x<-c("Aaa bbb Aaa bbb", "Aaa Bbb Aaa Bbb", "Aaa bbb Aaa bbb"); p<-"(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)"; gregexpr(p, x, perl=TRUE) } +[[1]] +[1] -1 +attr(,"match.length") +[1] -1 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + first last +[1,] -1 -1 +attr(,"capture.length") + first last +[1,] -1 -1 +attr(,"capture.names") +[1] "first" "last" + +[[2]] +[1] 1 9 +attr(,"match.length") +[1] 7 7 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + first last +[1,] 1 5 +[2,] 9 13 +attr(,"capture.length") + first last +[1,] 3 3 +[2,] 3 3 +attr(,"capture.names") +[1] "first" "last" + +[[3]] +[1] -1 +attr(,"match.length") +[1] -1 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + first last +[1,] -1 -1 +attr(,"capture.length") + first last +[1,] -1 -1 +attr(,"capture.names") +[1] "first" "last" + + ##com.oracle.truffle.r.test.builtins.TestBuiltin_gregexpr.testRegExpr #{ x<-gregexpr("foo", c("bar foo foo", "foo"), fixed=F); as.integer(c(x[[1]], x[[2]])) } [1] 5 9 1 @@ -40845,6 +41060,14 @@ Error: invalid 'pattern' argument #{ .Internal(regexpr(character(), "42", F, F, F, F)) } Error: invalid 'pattern' argument +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexpr.testRegExpr +#{ argv <- structure(list(pattern = '', text = c('abc', 'defg'), perl = TRUE), .Names = c('pattern', 'text', 'perl'));do.call('regexpr', argv) } +[1] 1 1 +attr(,"match.length") +[1] 0 0 +attr(,"useBytes") +[1] TRUE + ##com.oracle.truffle.r.test.builtins.TestBuiltin_regexpr.testRegExpr #{ as.integer(regexpr("foo", c("bar foo foo", "foo"), fixed=F)) } [1] 5 1 @@ -40881,6 
+41104,62 @@ attr(,"useBytes") #{ x <- "methods.html"; pos <- regexpr("\\.([[:alnum:]]+)$", x); substring(x, pos + 1L) } [1] "html" +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexpr.testRegExpr +#{ x<-c("Aaa Bbb Aaa Bbb", "Aaa bbb Aaa bbb"); p<-"(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)"; regexpr(p, x, perl=TRUE) } +[1] 1 -1 +attr(,"match.length") +[1] 7 -1 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + first last +[1,] 1 5 +[2,] -1 -1 +attr(,"capture.length") + first last +[1,] 3 3 +[2,] -1 -1 +attr(,"capture.names") +[1] "first" "last" + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexpr.testRegExpr +#{ x<-c("Aaa bbb Aaa bbb", "Aaa Bbb Aaa Bbb"); p<-"(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)"; regexpr(p, x, perl=TRUE) } +[1] -1 1 +attr(,"match.length") +[1] -1 7 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + first last +[1,] -1 -1 +[2,] 1 5 +attr(,"capture.length") + first last +[1,] -1 -1 +[2,] 3 3 +attr(,"capture.names") +[1] "first" "last" + +##com.oracle.truffle.r.test.builtins.TestBuiltin_regexpr.testRegExpr +#{ x<-c("Aaa bbb Aaa bbb", "Aaa Bbb Aaa Bbb", "Aaa bbb Aaa bbb"); p<-"(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)"; regexpr(p, x, perl=TRUE) } +[1] -1 1 -1 +attr(,"match.length") +[1] -1 7 -1 +attr(,"useBytes") +[1] TRUE +attr(,"capture.start") + first last +[1,] -1 -1 +[2,] 1 5 +[3,] -1 -1 +attr(,"capture.length") + first last +[1,] -1 -1 +[2,] 3 3 +[3,] -1 -1 +attr(,"capture.names") +[1] "first" "last" + ##com.oracle.truffle.r.test.builtins.TestBuiltin_regexpr.testRegExpr #{ x<-regexpr("aaa", "bbbaaaccc", fixed=TRUE); c(x[1]) } [1] 4 diff --git a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_gregexpr.java b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_gregexpr.java index 1aba62e2999b9cd6221a8453a40b3edbde6da975..385a56c1e09aeb467b22eb4ca7bf8127afb2056d 100644 --- 
a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_gregexpr.java +++ b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_gregexpr.java @@ -94,7 +94,7 @@ public class TestBuiltin_gregexpr extends TestBase { @Test public void testgregexpr16() { - assertEval(Ignored.Unknown, "argv <- structure(list(pattern = '', text = 'abc', perl = TRUE), .Names = c('pattern', 'text', 'perl'));do.call('gregexpr', argv)"); + assertEval("argv <- structure(list(pattern = '', text = 'abc', perl = TRUE), .Names = c('pattern', 'text', 'perl'));do.call('gregexpr', argv)"); } @Test @@ -113,5 +113,11 @@ public class TestBuiltin_gregexpr extends TestBase { assertEval("{ .Internal(gregexpr(character(), \"42\", F, F, F, F)) }"); assertEval("{ .Internal(gregexpr(\"7\", 42, F, F, F, F)) }"); + assertEval("{ argv <- structure(list(pattern = '', text = c('abc', 'defg'), perl = TRUE), .Names = c('pattern', 'text', 'perl'));do.call('gregexpr', argv) }"); + assertEval("{ x<-c(\"Aaa Bbb Aaa bbb\", \"Aaa Bbb Aaa Bbb\", \"Aaa bbb Aaa bbb\"); p<-\"(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)\"; gregexpr(p, x, perl=TRUE) }"); + assertEval("{ x<-c(\"Aaa bbb Aaa bbb\", \"Aaa Bbb Aaa Bbb\", \"Aaa Bbb Aaa bbb\"); p<-\"(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)\"; gregexpr(p, x, perl=TRUE) }"); + assertEval("{ x<-c(\"Aaa bbb Aaa Bbb\", \"Aaa bbb Aaa bbb\", \"Aaa bbb Aaa Bbb\"); p<-\"(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)\"; gregexpr(p, x, perl=TRUE) }"); + assertEval("{ x<-c(\"Aaa bbb Aaa bbb\", \"Aaa Bbb Aaa Bbb\", \"Aaa bbb Aaa bbb\"); p<-\"(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)\"; gregexpr(p, x, perl=TRUE) }"); + } } diff --git a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_regexpr.java b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_regexpr.java index 
ae591ffaba1c26eaffc1b4ff728eb1d2ce6cd671..f3b05cff87b05576d2f6116b7eb9b9d2cff816a7 100644 --- a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_regexpr.java +++ b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/builtins/TestBuiltin_regexpr.java @@ -108,5 +108,11 @@ public class TestBuiltin_regexpr extends TestBase { assertEval("{ .Internal(regexpr(7, \"42\", F, F, F, F)) }"); assertEval("{ .Internal(regexpr(character(), \"42\", F, F, F, F)) }"); assertEval("{ .Internal(regexpr(\"7\", 42, F, F, F, F)) }"); + + assertEval("{ argv <- structure(list(pattern = '', text = c('abc', 'defg'), perl = TRUE), .Names = c('pattern', 'text', 'perl'));do.call('regexpr', argv) }"); + assertEval("{ x<-c(\"Aaa Bbb Aaa Bbb\", \"Aaa bbb Aaa bbb\"); p<-\"(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)\"; regexpr(p, x, perl=TRUE) }"); + assertEval("{ x<-c(\"Aaa bbb Aaa bbb\", \"Aaa Bbb Aaa Bbb\"); p<-\"(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)\"; regexpr(p, x, perl=TRUE) }"); + assertEval("{ x<-c(\"Aaa bbb Aaa bbb\", \"Aaa Bbb Aaa Bbb\", \"Aaa bbb Aaa bbb\"); p<-\"(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)\"; regexpr(p, x, perl=TRUE) }"); + } } diff --git a/mx.fastr/copyrights/overrides b/mx.fastr/copyrights/overrides index a320da9b757b0b923f6a4db32d79e5bbcf7c729b..3632ac9ec92c7ba3cd9613b7635ab2f614a23ae2 100644 --- a/mx.fastr/copyrights/overrides +++ b/mx.fastr/copyrights/overrides @@ -76,6 +76,7 @@ com.oracle.truffle.r.native/fficall/src/include/nmath.h,gnu_r.copyright com.oracle.truffle.r.native/fficall/src/include/rlocale.h,gnu_r_gentleman_ihaka.copyright com.oracle.truffle.r.native/fficall/src/variable_defs/variable_defs.h,gnu_r.copyright com.oracle.truffle.r.native/fficall/src/jni/Memory.c,gnu_r.copyright +com.oracle.truffle.r.native/fficall/src/jni/pcre_rffi.c,gnu_r_gentleman_ihaka2.copyright com.oracle.truffle.r.native/fficall/src/jni/Rdynload_fastr.c,gnu_r.copyright 
com.oracle.truffle.r.native/fficall/src/jni/Rembedded.c,gnu_r.copyright com.oracle.truffle.r.native/include/src/libintl.h,no.copyright