From 00e5b51c56323bdb3c24ec58d04a93233255996a Mon Sep 17 00:00:00 2001
From: Miloslav Metelka <miloslav.metelka@oracle.com>
Date: Mon, 19 Mar 2018 10:36:19 +0100
Subject: [PATCH] ReadTableHead improvements and tests added.

---
 .../builtin/base/foreign/ReadTableHead.java   | 240 ++++++++++++++++--
 .../com/oracle/truffle/r/runtime/RError.java  |   3 +-
 .../truffle/r/test/ExpectedTestOutput.test    |  79 ++++++
 .../r/test/library/utils/TestReadTable.java   |  49 ++++
 4 files changed, 349 insertions(+), 22 deletions(-)
 create mode 100644 com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/library/utils/TestReadTable.java

diff --git a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/foreign/ReadTableHead.java b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/foreign/ReadTableHead.java
index 8d14c507c3..700ccf32f9 100644
--- a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/foreign/ReadTableHead.java
+++ b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/foreign/ReadTableHead.java
@@ -46,38 +46,236 @@ public abstract class ReadTableHead extends RExternalBuiltinNode.Arg7 {
                     String quote, String sep, boolean skipNull) {
         // TODO This is quite incomplete and just uses readLines, which works for some inputs
         try (RConnection openConn = RConnection.fromIndex(con).forceOpen("r")) {
-            List<String> lines = new ArrayList<>(nlines);
-            int totalLines = 0;
-            while (totalLines < nlines) {
-                String[] readLines = openConn.readLines(nlines - totalLines, EnumSet.of(ReadLineWarning.EMBEDDED_NUL, ReadLineWarning.INCOMPLETE_LAST_LINE), skipNull);
-                if (readLines.length == 0) {
-                    break;
+            ReadState readState = new ReadState(openConn, nlines, // collects up to nlines logical lines, character by character
+                            (commentChar.length() > 0) ? commentChar.charAt(0) : -1, quote, skipNull); // -1: no comment character
+            int sepChar = (sep.length() > 0) ? sep.charAt(0) : -1; // -1: empty sep, i.e. whitespace-separated fields
+            int quoteChar = -1; // quote character of the field being read, -1 when outside quotes
+            StringBuilder lineBuilder = new StringBuilder();
+            while (readState.resultLines.size() < nlines) { // one iteration per candidate result line
+                boolean empty = true; // no character other than '\n' or the comment character seen yet
+                boolean skip = false; // set once an unquoted comment character has been seen
+                boolean firstNonWhite = true; // true at the start of a field, the only place a quote may open
+                int c;
+                while ((c = readState.nextChar()) != -1) { // -1 signals end of input
+                    if (quoteChar != -1) { // inside a quoted field
+                        if (sepChar == -1 && c == '\\') { // whitespace-separated input: backslash escapes are passed through verbatim
+                            lineBuilder.append('\\');
+                            c = readState.nextChar();
+                            if (c == -1) {
+                                throw error(Message.EOF_AFTER_BACKSLASH); // explicit throw, so (char) -1 is never appended
+                            }
+                            lineBuilder.append((char) c);
+                            continue; // the escaped pair takes no part in the bookkeeping below
+                        } else if (c == quoteChar) {
+                            if (sepChar == -1) {
+                                quoteChar = -1; // closing quote
+                            } else {
+                                int c2 = readState.nextChar(); // explicit separator: a doubled quote is an embedded quote
+                                if (c2 == quoteChar) {
+                                    lineBuilder.append((char) c); // keep both quotes of the pair; the second is appended below
+                                } else {
+                                    readState.pushBack(c2); // not doubled: c2 is re-read outside the quoted field
+                                    quoteChar = -1;
+                                }
+                            }
+                        }
+                    } else if (!skip && firstNonWhite && quote.indexOf((char) c) != -1) { // opening quote at the start of a field
+                        quoteChar = c;
+                    } else if (Character.isWhitespace((char) c) || c == sepChar) {
+                        firstNonWhite = true;
+                    } else {
+                        firstNonWhite = false;
+                    }
+                    if (empty && !skip && c != '\n' && c != readState.commentChar) { // first real content on this line
+                        empty = false;
+                    }
+                    if (quoteChar == -1 && !skip && c == readState.commentChar) { // unquoted comment character
+                        skip = true;
+                    }
+                    if (quoteChar != -1 || c != '\n') { // a newline ends the line only outside quotes
+                        lineBuilder.append((char) c);
+                    } else {
+                        break;
+                    }
                 }
-
-                for (int i = 0; i < readLines.length; i++) {
-                    postprocessLine(lines, readLines[i], commentChar, blankLinesSkip, quote, sep);
+                if (!empty || (c != -1 && !blankLinesSkip)) { // empty lines are kept only if newline-terminated and blanks are not skipped
+                    readState.addResultLine(lineBuilder);
+                }
+                if (c == -1) { // end of input: stop even if fewer than nlines were collected
+                    break;
                 }
-                totalLines += lines.size();
             }
-
-            return RDataFactory.createStringVector(lines.toArray(new String[0]), RDataFactory.COMPLETE_VECTOR);
+            return RDataFactory.createStringVector(readState.resultLines.toArray(new String[0]), RDataFactory.COMPLETE_VECTOR);
         } catch (IOException ex) {
             throw error(RError.Message.ERROR_READING_CONNECTION, ex.getMessage());
         }
     }
 
-    private static void postprocessLine(List<String> lines, String string, String commentChar, boolean blankLinesSkip, @SuppressWarnings("unused") String quote,
-                    @SuppressWarnings("unused") String sep) {
-        // TODO quote, sep
-        if (blankLinesSkip && string.isEmpty()) {
-            return;
+    private final class ReadState { // character-level reader over the connection with one-character pushback and NUL accounting
+
+        private final RConnection openConn; // connection whose readLines supplies the input
+
+        private final int nlines; // bound used to size resultLines and to limit each readLines request
+
+        final int commentChar; // comment character; -1 is treated as "none" by nextChar
+
+        private final String quotes; // only consulted by the disabled escape handling in nextChar
+
+        private final boolean skipNull; // when true, readChar drops NUL characters
+
+        final List<String> resultLines; // completed lines, in the order they were added
+
+        private String[] readLines; // batch most recently returned by openConn.readLines
+
+        private int readLineIndex; // index of the current line within readLines
+
+        private int colIndex; // index of the next character within the current line
+
+        private int pushBackChar = -1; // single pushed-back character, -1 when the slot is empty
+
+        private boolean inQuote; // NOTE(review): never assigned in this class, so effectively always false - confirm intended
+
+        private int nullCnt; // NUL characters counted since the last result line was added
+
+        ReadState(RConnection openConn, int nlines, int commentChar, String quotes, boolean skipNull) {
+            this.openConn = openConn;
+            this.nlines = nlines;
+            this.commentChar = commentChar;
+            this.quotes = quotes;
+            this.skipNull = skipNull;
+            this.resultLines = new ArrayList<String>(nlines);
         }
 
-        if (commentChar != null && !commentChar.isEmpty() && string.startsWith(commentChar)) {
-            return;
+        int nextChar() throws IOException { // returns the pushed-back character if present, else the next input character; -1 at end of input
+            int c;
+            if (pushBackChar != -1) {
+                c = pushBackChar;
+                if (c == 0) { // re-count a NUL whose count was taken back by pushBack
+                    nullCnt++;
+                }
+                pushBackChar = -1;
+            } else {
+                c = readChar();
+            }
+            if (!inQuote && commentChar != -1 && c == commentChar) { // comment: discard up to the line end and return the '\n' (or -1) that ended it
+                do {
+                    c = readChar();
+                } while (c != -1 && c != '\n');
+            }
+            // The allowEscapes flag is currently not propagated into table head reading which
+            // causes e.g. a header line duplication - see tests
+            if (false && c == '\\') { // Assuming escapes never allowed
+                c = readChar();
+                if ('0' <= c && c <= '8') { // NOTE(review): '8' is accepted as an octal digit here and below - confirm against the reference behavior
+                    int octal = c - '0';
+                    if ('0' <= (c = readChar()) && c <= '8') {
+                        octal = 8 * octal + c - '0';
+                        if ('0' <= (c = readChar()) && c <= '8') {
+                            octal = 8 * octal + c - '0';
+                        } else {
+                            pushBack(c);
+                        }
+                    } else {
+                        pushBack(c);
+                    }
+                    c = octal;
+                } else if (c == -1) {
+                    c = '\\';
+                } else {
+                    switch ((char) c) {
+                        case 'a':
+                            c = '\u0007';
+                            break;
+                        case 'b':
+                            c = '\b';
+                            break;
+                        case 'f':
+                            c = '\f';
+                            break;
+                        case 'n':
+                            c = '\n';
+                            break;
+                        case 'r':
+                            c = '\r';
+                            break;
+                        case 't':
+                            c = '\t';
+                            break;
+                        case 'v':
+                            c = '\u000B';
+                            break;
+                        case 'x':
+                            int hex = 0;
+                            int ext;
+                            for (int i = 0; i < 2; i++) { // at most two hexadecimal digits
+                                c = readChar();
+                                if (c >= '0' && c <= '9') {
+                                    ext = c - '0';
+                                } else if (c >= 'A' && c <= 'F') {
+                                    ext = c - 'A' + 10;
+                                } else if (c >= 'a' && c <= 'f') {
+                                    ext = c - 'a' + 10;
+                                } else {
+                                    pushBack(c);
+                                    break;
+                                }
+                                hex = 16 * hex + ext;
+                            }
+                            c = hex;
+                            break;
+                        default:
+                            if (inQuote && quotes.indexOf(c) != -1) { // backslash before a quote character: return the backslash, re-read the quote
+                                pushBack(c);
+                                c = '\\';
+                            }
+                            break;
+                    }
+                }
+            }
+            return c;
+        }
+
+        int readChar() throws IOException { // next raw character; '\n' is produced at the end of each line; -1 when readLines yields nothing more
+            int c;
+            do {
+                if (readLines == null || readLineIndex >= readLines.length) { // no batch yet or current batch exhausted: fetch another
+                    readLineIndex = 0;
+                    readLines = openConn.readLines(nlines - resultLines.size(), EnumSet.of(ReadLineWarning.INCOMPLETE_LAST_LINE), false);
+                    if (readLines == null || readLines.length == 0) {
+                        return -1;
+                    }
+                }
+                String readLine = readLines[readLineIndex];
+                if (colIndex < readLine.length()) {
+                    c = readLine.charAt(colIndex++);
+                } else {
+                    c = '\n'; // end of the current line: emit a newline and move on to the next line
+                    readLineIndex++;
+                    colIndex = 0;
+                }
+            } while (c == 0 && skipNull); // drop NUL characters when skipNull is set
+            if (c == 0) {
+                nullCnt++;
+            }
+            return c;
+        }
+
+        void pushBack(int c) { // c is returned by the next nextChar call; only one character can be held at a time
+            pushBackChar = c;
+            if (c == 0) { // undo the NUL count; nextChar counts it again when the character is re-read
+                nullCnt--;
+            }
+        }
+
+        void addResultLine(StringBuilder lineBuilder) { // stores the builder content as a result line and clears the builder
+            resultLines.add(lineBuilder.toString());
+            lineBuilder.setLength(0);
+            if (nullCnt > 0) { // warn, with the 1-based line number, when NULs were counted for this line
+                nullCnt = 0;
+                warning(Message.LINE_CONTAINS_EMBEDDED_NULLS, resultLines.size());
+            }
         }
 
-        // no reason why not to add
-        lines.add(string);
     }
+
 }
diff --git a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java
index a4cc7250eb..531d285ffd 100644
--- a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java
+++ b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java
@@ -945,7 +945,8 @@ public final class RError extends RuntimeException implements TruffleException {
         ATOMIC_VECTOR_ARGUMENTS_ONLY("atomic vector arguments only"),
         MUST_BE_COMPLEX_MATRIX("'%s' must be a complex matrix"),
         INVALID_FORMAL_ARG_LIST("invalid formal argument list for \"%s\""),
-        SINGULAR_BACKSOLVE("singular matrix in 'backsolve'. First zero in diagonal [%d]");
+        SINGULAR_BACKSOLVE("singular matrix in 'backsolve'. First zero in diagonal [%d]"),
+        EOF_AFTER_BACKSLASH("\\ followed by EOF");
 
         public final String message;
         final boolean hasArgs;
diff --git a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test
index 4c72153998..38955f0d5d 100644
--- a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test
+++ b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/ExpectedTestOutput.test
@@ -169187,6 +169187,85 @@ debug at #1: cat("foo exit\n")
 foo exit
 exiting from: foo(5)
 
+##com.oracle.truffle.r.test.library.utils.TestReadTable.testParams#
+#read.table(header = TRUE, allowEscapes=FALSE, text = 'a b c\\nx\n1 2 3\n4 5 6\n')
+  a b c.nx
+1 1 2    3
+2 4 5    6
+
+##com.oracle.truffle.r.test.library.utils.TestReadTable.testParams#Ignored.ReferenceError#
+#read.table(header = TRUE, allowEscapes=TRUE, text = 'a b c\\nx y z\n1 2 3\n4 5 6\n')
+  a b c
+1 a b c
+2 x y z
+3 1 2 3
+4 4 5 6
+
+##com.oracle.truffle.r.test.library.utils.TestReadTable.testParams#
+#read.table(header = TRUE, blank.lines.skip=FALSE, text = 'a b c\n\n1 2 3\n4 5 6')
+   a  b  c
+1 NA NA NA
+2  1  2  3
+3  4  5  6
+
+##com.oracle.truffle.r.test.library.utils.TestReadTable.testParams#
+#read.table(header = TRUE, blank.lines.skip=TRUE, text = 'a b c\n\n1 2 3\n4 5 6')
+  a b c
+1 1 2 3
+2 4 5 6
+
+##com.oracle.truffle.r.test.library.utils.TestReadTable.testParams#
+#read.table(header = TRUE, comment.char='/', text = 'a b c /comment\n1 2 3\n4 5 6\n')
+  a b c
+1 1 2 3
+2 4 5 6
+
+##com.oracle.truffle.r.test.library.utils.TestReadTable.testParams#
+#read.table(header = TRUE, comment.char='/', text = 'a b c/"quotes-in-comment"\n1 2 3\n4 5 6\n')
+  a b c
+1 1 2 3
+2 4 5 6
+
+##com.oracle.truffle.r.test.library.utils.TestReadTable.testParams#
+#read.table(header = TRUE, quote='|', text = 'a b c\n1 2 3||x')
+  a b    c
+1 1 2 3||x
+
+##com.oracle.truffle.r.test.library.utils.TestReadTable.testParams#
+#read.table(header = TRUE, quote='|', text = 'a b c\n1 2|x 3')
+  a   b c
+1 1 2|x 3
+
+##com.oracle.truffle.r.test.library.utils.TestReadTable.testParams#
+#read.table(header = TRUE, quote='|', text = 'a b c\n1 |2 22 222| 3\n4 5 6\n')
+  a        b c
+1 1 2 22 222 3
+2 4        5 6
+
+##com.oracle.truffle.r.test.library.utils.TestReadTable.testParams#
+#read.table(header = TRUE, quote='|', text = 'a b |c ccc|\n1 2 3\n4 5 6\n')
+  a b c.ccc
+1 1 2     3
+2 4 5     6
+
+##com.oracle.truffle.r.test.library.utils.TestReadTable.testParams#
+#read.table(header = TRUE, text = 'a b "c ccc"\n1 2 3\n4 5 6\n')
+  a b c.ccc
+1 1 2     3
+2 4 5     6
+
+##com.oracle.truffle.r.test.library.utils.TestReadTable.testParams#
+#read.table(header = TRUE, text = 'a b c\n1 "2 22 222" 3\n4 5 6\n')
+  a        b c
+1 1 2 22 222 3
+2 4        5 6
+
+##com.oracle.truffle.r.test.library.utils.TestReadTable.testParams#
+#read.table(header = TRUE, text = 'a b\n1 2\n3 4\n')
+  a b
+1 1 2
+2 3 4
+
 ##com.oracle.truffle.r.test.library.utils.TestTrace.testCondTrace#
 #f <- function(x) {}; (if (exists('.fastr.trace')) .fastr.trace else trace)(f, tracer=quote(if (x == 3 || x == 7) print(x))); g <- function() for (i in 1:10) f(i); g()
 [1] "f"
diff --git a/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/library/utils/TestReadTable.java b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/library/utils/TestReadTable.java
new file mode 100644
index 0000000000..b22a5f07e4
--- /dev/null
+++ b/com.oracle.truffle.r.test/src/com/oracle/truffle/r/test/library/utils/TestReadTable.java
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.truffle.r.test.library.utils;
+
+import com.oracle.truffle.r.test.TestBase;
+import org.junit.Test;
+
+public class TestReadTable extends TestBase { // read.table cases for comment.char, quote, allowEscapes and blank.lines.skip
+
+    @Test
+    public void testParams() {
+        assertEval("read.table(header = TRUE, comment.char='/', text = 'a b c /comment\\n1 2 3\\n4 5 6\\n')"); // comment after the header fields
+        assertEval("read.table(header = TRUE, text = 'a b\\n1 2\\n3 4\\n')"); // plain whitespace-separated table
+        assertEval("read.table(header = TRUE, allowEscapes=FALSE, text = 'a b c\\\\nx\\n1 2 3\\n4 5 6\\n')"); // backslash-n in the header, escapes disabled
+        // Ignored: the reference output from GnuR 3.2.3 repeats the header as the first table row
+        // (its pushback apparently treats the extra line incorrectly)
+        assertEval(Ignored.ReferenceError, "read.table(header = TRUE, allowEscapes=TRUE, text = 'a b c\\\\nx y z\\n1 2 3\\n4 5 6\\n')");
+        assertEval("read.table(header = TRUE, text = 'a b c\\n1 \"2 22 222\" 3\\n4 5 6\\n')"); // double-quoted data field containing spaces
+        assertEval("read.table(header = TRUE, text = 'a b \"c ccc\"\\n1 2 3\\n4 5 6\\n')"); // double-quoted header field containing spaces
+        assertEval("read.table(header = TRUE, quote='|', text = 'a b c\\n1 |2 22 222| 3\\n4 5 6\\n')"); // custom quote character around a data field
+        assertEval("read.table(header = TRUE, quote='|', text = 'a b |c ccc|\\n1 2 3\\n4 5 6\\n')"); // custom quote character around a header field
+        assertEval("read.table(header = TRUE, quote='|', text = 'a b c\\n1 2|x 3')"); // quote character in mid-field
+        assertEval("read.table(header = TRUE, quote='|', text = 'a b c\\n1 2 3||x')"); // doubled quote character in mid-field
+        assertEval("read.table(header = TRUE, comment.char='/', text = 'a b c/\"quotes-in-comment\"\\n1 2 3\\n4 5 6\\n')"); // quote characters inside a comment
+        assertEval("read.table(header = TRUE, blank.lines.skip=TRUE, text = 'a b c\\n\\n1 2 3\\n4 5 6')"); // blank line with skipping enabled
+        assertEval("read.table(header = TRUE, blank.lines.skip=FALSE, text = 'a b c\\n\\n1 2 3\\n4 5 6')"); // blank line with skipping disabled
+    }
+
+}
-- 
GitLab