From afa9699ad2d59ab7770485e6b7a9b70237545eec Mon Sep 17 00:00:00 2001
From: stepan <stepan.sindelar@oracle.com>
Date: Fri, 9 Feb 2018 11:55:10 +0100
Subject: [PATCH] Treat CharSXP as byte array w.r.t. e.g. length

---
 .../r/ffi/impl/interop/CharSXPWrapperMR.java  |  5 ++--
 .../truffle/r/ffi/impl/nodes/MiscNodes.java   |  2 +-
 .../r/runtime/data/CharSXPWrapper.java        | 22 ++++++++++++++--
 .../r/runtime/data/NativeDataAccess.java      | 25 +++++++++++++------
 .../packages/testrffi/testrffi/R/testrffi.R   |  4 +++
 .../packages/testrffi/testrffi/src/init.c     |  1 +
 .../packages/testrffi/testrffi/src/testrffi.c | 14 +++++++++++
 .../packages/testrffi/testrffi/src/testrffi.h |  4 ++-
 .../testrffi/testrffi/tests/simpleTests.R     |  9 ++++++-
 9 files changed, 71 insertions(+), 15 deletions(-)

diff --git a/com.oracle.truffle.r.ffi.impl/src/com/oracle/truffle/r/ffi/impl/interop/CharSXPWrapperMR.java b/com.oracle.truffle.r.ffi.impl/src/com/oracle/truffle/r/ffi/impl/interop/CharSXPWrapperMR.java
index 5ba2936057..95c06bb5e6 100644
--- a/com.oracle.truffle.r.ffi.impl/src/com/oracle/truffle/r/ffi/impl/interop/CharSXPWrapperMR.java
+++ b/com.oracle.truffle.r.ffi.impl/src/com/oracle/truffle/r/ffi/impl/interop/CharSXPWrapperMR.java
@@ -71,10 +71,9 @@ public class CharSXPWrapperMR {
 
         protected Object access(CharSXPWrapper receiver, Number indexNum) {
             int index = indexNum.intValue();
-            String contents = receiver.getContents();
-            int len = contents.length();
+            int len = receiver.getLength();
             if (prof1.profile(index < len)) {
-                return contents.charAt(index);
+                return receiver.getByteAt(index);
             } else if (prof2.profile(index == len)) {
                 return 0;
             } else {
diff --git a/com.oracle.truffle.r.ffi.impl/src/com/oracle/truffle/r/ffi/impl/nodes/MiscNodes.java b/com.oracle.truffle.r.ffi.impl/src/com/oracle/truffle/r/ffi/impl/nodes/MiscNodes.java
index c42616fccf..25e4a650a1 100644
--- a/com.oracle.truffle.r.ffi.impl/src/com/oracle/truffle/r/ffi/impl/nodes/MiscNodes.java
+++ b/com.oracle.truffle.r.ffi.impl/src/com/oracle/truffle/r/ffi/impl/nodes/MiscNodes.java
@@ -91,7 +91,7 @@ public final class MiscNodes {
 
         @Specialization
         protected int length(CharSXPWrapper obj) {
-            return obj.getContents().length();
+            return obj.getLength();
         }
 
         @Specialization
diff --git a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/data/CharSXPWrapper.java b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/data/CharSXPWrapper.java
index b7291cb6a3..d1d4cab104 100644
--- a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/data/CharSXPWrapper.java
+++ b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/data/CharSXPWrapper.java
@@ -22,6 +22,8 @@
  */
 package com.oracle.truffle.r.runtime.data;
 
+import java.nio.charset.StandardCharsets;
+
 import com.oracle.truffle.r.runtime.RRuntime;
 
 /**
@@ -31,12 +33,16 @@ import com.oracle.truffle.r.runtime.RRuntime;
  * FastR already uses {@code String} to denote a length-1 string vector, it cannot be used to
  * represent a {@code CHARSXP}, so this class exists to do so.
  *
+ * As opposed to Strings on the Java side, the native side "Strings" should be treated as array of
+ * bytes. {@link CharSXPWrapper} wraps the byte array, but does not add the '\0' at the end of it.
+ *
  * N.B. Use limited to RFFI implementations.
  */
 public final class CharSXPWrapper extends RObject implements RTruffleObject {
     private static final CharSXPWrapper NA = new CharSXPWrapper(RRuntime.STRING_NA);
 
     private String contents;
+    private byte[] bytes;
 
     private CharSXPWrapper(String contents) {
         this.contents = contents;
@@ -52,8 +58,12 @@ public final class CharSXPWrapper extends RObject implements RTruffleObject {
         return NativeDataAccess.getData(this, contents);
     }
 
+    public byte getByteAt(int index) {
+        return NativeDataAccess.getDataAt(this, getBytes(), index);
+    }
+
     public int getLength() {
-        return NativeDataAccess.getDataLength(this, contents);
+        return NativeDataAccess.getDataLength(this, getBytes());
     }
 
     @Override
@@ -71,9 +81,17 @@ public final class CharSXPWrapper extends RObject implements RTruffleObject {
 
     public long allocateNativeContents() {
         try {
-            return NativeDataAccess.allocateNativeContents(this, contents);
+            return NativeDataAccess.allocateNativeContents(this, getBytes());
         } finally {
             contents = null;
+            bytes = null;
+        }
+    }
+
+    private byte[] getBytes() {
+        if (bytes == null && contents != null) {
+            bytes = contents.getBytes(StandardCharsets.UTF_8);
         }
+        return bytes;
     }
 }
diff --git a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/data/NativeDataAccess.java b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/data/NativeDataAccess.java
index 0f60201c90..a1e49b53ab 100644
--- a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/data/NativeDataAccess.java
+++ b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/data/NativeDataAccess.java
@@ -147,9 +147,8 @@ public final class NativeDataAccess {
         }
 
         @TruffleBoundary
-        void allocateNative(String source) {
+        void allocateNativeString(byte[] bytes) {
             assert dataAddress == 0;
-            byte[] bytes = source.getBytes(StandardCharsets.US_ASCII);
             dataAddress = UnsafeAdapter.UNSAFE.allocateMemory(bytes.length + 1);
             UnsafeAdapter.UNSAFE.copyMemory(bytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, null, dataAddress, bytes.length);
             UnsafeAdapter.UNSAFE.putByte(dataAddress + bytes.length, (byte) 0); // C strings
@@ -570,9 +569,21 @@ public final class NativeDataAccess {
         }
     }
 
-    static int getDataLength(CharSXPWrapper vector, String data) {
+    static byte getDataAt(CharSXPWrapper vector, byte[] data, int index) {
         if (noCharSXPNative.isValid() || data != null) {
-            return data.length();
+            return data[index];
+        } else {
+            NativeMirror mirror = (NativeMirror) vector.getNativeMirror();
+            long address = mirror.dataAddress;
+            assert address != 0;
+            assert index < mirror.length;
+            return UnsafeAdapter.UNSAFE.getByte(address + index);
+        }
+    }
+
+    static int getDataLength(CharSXPWrapper vector, byte[] data) {
+        if (noCharSXPNative.isValid() || data != null) {
+            return data.length;
         } else {
             NativeMirror mirror = (NativeMirror) vector.getNativeMirror();
             long address = mirror.dataAddress;
@@ -644,13 +655,13 @@ public final class NativeDataAccess {
         return mirror.dataAddress;
     }
 
-    static long allocateNativeContents(CharSXPWrapper vector, String contents) {
+    static long allocateNativeContents(CharSXPWrapper vector, byte[] data) {
         NativeMirror mirror = (NativeMirror) vector.getNativeMirror();
         assert mirror != null;
-        assert mirror.dataAddress == 0 ^ contents == null;
+        assert mirror.dataAddress == 0 ^ data == null;
         if (mirror.dataAddress == 0) {
             noCharSXPNative.invalidate();
-            mirror.allocateNative(contents);
+            mirror.allocateNativeString(data);
         }
         return mirror.dataAddress;
     }
diff --git a/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/R/testrffi.R b/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/R/testrffi.R
index 2eef27c513..4f7c1c6af7 100644
--- a/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/R/testrffi.R
+++ b/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/R/testrffi.R
@@ -207,4 +207,8 @@ rffi.RfEvalWithPromiseInPairList <- function() {
 
 rffi.isNAString <- function(x) {
 	.Call('test_isNAString', x)
+}
+
+rffi.getBytes <- function(x) {
+	.Call('test_getBytes', x)
 }
\ No newline at end of file
diff --git a/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/src/init.c b/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/src/init.c
index d73d58a523..5fe277273e 100644
--- a/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/src/init.c
+++ b/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/src/init.c
@@ -83,6 +83,7 @@ static const R_CallMethodDef CallEntries[] = {
         CALLDEF(test_ParseVector, 1),
         CALLDEF(test_RfEvalWithPromiseInPairList, 0),
         CALLDEF(test_isNAString, 1),
+        CALLDEF(test_getBytes, 1),
         CALLDEF(test_setStringElt, 2),
         {NULL, NULL, 0}
 };
diff --git a/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/src/testrffi.c b/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/src/testrffi.c
index d4be694519..e0d8cc1c03 100644
--- a/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/src/testrffi.c
+++ b/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/src/testrffi.c
@@ -370,6 +370,20 @@ SEXP test_isNAString(SEXP vec) {
     }
 }
 
+SEXP test_getBytes(SEXP vec) {
+    char* bytes = R_CHAR(STRING_ELT(vec, 0));
+    SEXP result;
+    PROTECT(result = allocVector(RAWSXP, Rf_length(STRING_ELT(vec, 0))));
+    unsigned char* resData = RAW(result);
+    int i = 0;
+    while (*bytes != '\0') {
+        resData[i++] = (unsigned char) *bytes;
+        bytes++;
+    }
+    UNPROTECT(1);
+    return result;
+}
+
 // This function is expected to be called only with environment that has single
 // promise value in the '...' variable and this is asserted inside this function.
 // The return value is list with the promises' expression and environment.
diff --git a/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/src/testrffi.h b/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/src/testrffi.h
index 9dddb807ee..44f6cd6d6a 100644
--- a/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/src/testrffi.h
+++ b/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/src/testrffi.h
@@ -112,4 +112,6 @@ extern SEXP test_RfEvalWithPromiseInPairList(void);
 
 extern SEXP test_isNAString(SEXP vec);
 
-extern SEXP test_setStringElt(SEXP vec, SEXP elt);
\ No newline at end of file
+extern SEXP test_setStringElt(SEXP vec, SEXP elt);
+
+extern SEXP test_getBytes(SEXP vec);
\ No newline at end of file
diff --git a/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/tests/simpleTests.R b/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/tests/simpleTests.R
index 03204306df..d2df53a2af 100644
--- a/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/tests/simpleTests.R
+++ b/com.oracle.truffle.r.test.native/packages/testrffi/testrffi/tests/simpleTests.R
@@ -31,9 +31,16 @@ rffi.LENGTH(strVec)
 rffi.char_length(strVec)
 strVec <- rffi.setStringElt(c('hello'), as.character(NA))
 stopifnot(anyNA(strVec))
-
 stopifnot(rffi.isNAString(as.character(NA)))
 
+# Encoding tests
+rffi.getBytes('\u1F602\n')
+# ignored: FastR does not support explicit encoding yet
+# latinEncStr <- '\xFD\xDD\xD6\xF0\n'
+# Encoding(latinEncStr) <- "latin1"
+# rffi.getBytes(latinEncStr)
+rffi.getBytes('hello ascii')
+
 x <- list(1)
 attr(x, 'myattr') <- 'hello';
 attrs <- rffi.ATTRIB(x)
-- 
GitLab