From 1f3032e1730aee9c123a8938364f90bf5ae0e10b Mon Sep 17 00:00:00 2001 From: Mick Jordan <mick.jordan@oracle.com> Date: Wed, 23 Sep 2015 15:58:33 -0700 Subject: [PATCH] more compression support for package installation --- .../r/nodes/builtin/base/BasePackage.java | 1 + .../builtin/base/HiddenInternalFunctions.java | 49 ++++-- .../r/nodes/builtin/base/LoadFunctions.java | 76 ++++++++- .../truffle/r/runtime/RCompression.java | 158 +++++++++++++++++- .../com/oracle/truffle/r/runtime/RError.java | 6 +- .../r/runtime/conn/GZIPConnections.java | 48 +++++- .../r/install.cran.packages.R | 6 +- 7 files changed, 309 insertions(+), 35 deletions(-) diff --git a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/BasePackage.java b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/BasePackage.java index c3a5f2e4c4..c41434b4ae 100644 --- a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/BasePackage.java +++ b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/BasePackage.java @@ -344,6 +344,7 @@ public class BasePackage extends RBuiltinPackage { add(Length.class, LengthNodeGen::create); add(License.class, LicenseNodeGen::create); add(ListBuiltin.class, ListBuiltinNodeGen::create); + add(LoadFunctions.Load.class, LoadFunctionsFactory.LoadNodeGen::create); add(LoadFunctions.LoadFromConn2.class, LoadFunctionsFactory.LoadFromConn2NodeGen::create); add(LocaleFunctions.BindTextDomain.class, LocaleFunctionsFactory.BindTextDomainNodeGen::create); add(LocaleFunctions.Enc2Native.class, LocaleFunctionsFactory.Enc2NativeNodeGen::create); diff --git a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/HiddenInternalFunctions.java b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/HiddenInternalFunctions.java index 5b44f6fdeb..8d521d3671 100644 --- a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/HiddenInternalFunctions.java +++ b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/HiddenInternalFunctions.java @@ -365,7 +365,7 @@ public class HiddenInternalFunctions { @TruffleBoundary private RIntVector lazyLoadDBinsertValueInternal(MaterializedFrame frame, Object value, RAbstractStringVector file, byte asciiL, int compression, RFunction hook) { - if (compression != 1) { + if (!(compression == 1 || compression == 3)) { throw RError.error(this, Message.GENERIC, "unsupported compression"); } @@ -378,16 +378,35 @@ public class HiddenInternalFunctions { try { byte[] data = RSerialize.serialize(value, RRuntime.fromLogical(asciiL), false, RSerialize.DEFAULT_VERSION, callHook); - byte[] cdata = new byte[data.length + 20]; - long[] cdatalen = new long[1]; - cdatalen[0] = cdata.length; - int rc = RFFIFactory.getRFFI().getBaseRFFI().compress(cdata, cdatalen, data); - if (rc != 0) { - throw RError.error(this, Message.GENERIC, "zlib uncompress error"); + // See comment in LazyLoadDBFetch for format + int outLen; + int offset; + RCompression.Type type; + byte[] cdata; + if (compression == 1) { + type = RCompression.Type.GZIP; + offset = 4; + outLen = (int) (1.001 * data.length) + 20; + cdata = new byte[outLen]; + boolean rc = RCompression.compress(type, data, cdata); + if (!rc) { + throw RError.error(this, Message.GENERIC, "zlib compress error"); + } + } else if (compression == 3) { + type = RCompression.Type.LZMA; + offset = 5; + outLen = data.length; + cdata = new byte[outLen]; + boolean rc = RCompression.compress(type, data, cdata); + if (!rc) { + throw RError.error(this, Message.GENERIC, "lzma compress error"); + } + } else { + throw RInternalError.shouldNotReachHere(); } int[] intData = new int[2]; - intData[1] = (int) cdatalen[0] + 4; // include outlen - intData[0] = appendFile(file.getDataAt(0), cdata, data.length, (int) cdatalen[0]); + intData[1] = outLen + offset; // include length + type (compression == 3) + intData[0] = appendFile(file.getDataAt(0), cdata, data.length, type); return RDataFactory.createIntVector(intData, RDataFactory.COMPLETE_VECTOR); } catch (Throwable ex) { // Exceptions have been observed that were masked and very hard to find @@ -407,12 +426,11 @@ public class HiddenInternalFunctions { * int in the first four bytes of the data. See {@link LazyLoadDBFetch}. * * @param path path of file - * @param data the compressed data + * @param cdata the compressed data * @param ulen length of uncompressed data - * @param len length of compressed data * @return offset in file of appended data */ - private int appendFile(String path, byte[] data, int ulen, int len) { + private int appendFile(String path, byte[] cdata, int ulen, RCompression.Type type) { File file = new File(path); try (BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file, true))) { int result = (int) file.length(); @@ -422,7 +440,10 @@ public class HiddenInternalFunctions { byte[] ulenData = new byte[4]; dataLengthBuf.get(ulenData); out.write(ulenData); - out.write(data, 0, len); + if (type == RCompression.Type.LZMA) { + out.write(RCompression.Type.LZMA.typeByte); + } + out.write(cdata); return result; } catch (IOException ex) { throw RError.error(this, Message.GENERIC, "lazyLoadDBinsertValue file append error"); @@ -433,7 +454,7 @@ public class HiddenInternalFunctions { /* * Created as primitive function to avoid incrementing reference count for the argument. - * + * * returns -1 for non-shareable, 0 for private, 1 for temp, 2 for shared and * SHARED_PERMANENT_VAL for permanent shared */ diff --git a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/LoadFunctions.java b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/LoadFunctions.java index 0b8f166b20..55f49aab8d 100644 --- a/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/LoadFunctions.java +++ b/com.oracle.truffle.r.nodes.builtin/src/com/oracle/truffle/r/nodes/builtin/base/LoadFunctions.java @@ -25,6 +25,8 @@ import com.oracle.truffle.r.runtime.env.*; import com.oracle.truffle.r.runtime.env.REnvironment.PutException; import com.oracle.truffle.r.runtime.ops.na.*; +// from src/main/saveload.c + public class LoadFunctions { @RBuiltin(name = "loadFromConn2", kind = RBuiltinKind.INTERNAL, parameterNames = {"con", "envir", "verbose"}) @@ -32,8 +34,6 @@ public class LoadFunctions { private final NACheck naCheck = NACheck.create(); - // from src/main/saveload.c - @Specialization @TruffleBoundary protected RStringVector load(RConnection con, REnvironment envir, @SuppressWarnings("unused") RAbstractLogicalVector verbose) { @@ -76,4 +76,76 @@ public class LoadFunctions { } } } + + @RBuiltin(name = "load", kind = RBuiltinKind.INTERNAL, parameterNames = {"con", "envir"}) + public abstract static class Load extends RInvisibleBuiltinNode { + // now deprecated but still used by some packages + + private static final int R_MAGIC_EMPTY = 999; + private static final int R_MAGIC_CORRUPT = 998; + private static final int R_MAGIC_TOONEW = 997; + private static final int R_MAGIC_ASCII_V1 = 1001; + private static final int R_MAGIC_BINARY_V1 = 1002; + private static final int R_MAGIC_XDR_V1 = 1003; + private static final int R_MAGIC_ASCII_V2 = 2001; + private static final int R_MAGIC_BINARY_V2 = 2002; + private static final int R_MAGIC_XDR_V2 = 2003; + + @Specialization + @TruffleBoundary + protected RStringVector load(RAbstractStringVector fileVec, @SuppressWarnings("unused") REnvironment envir) { + controlVisibility(); + String path = Utils.tildeExpand(fileVec.getDataAt(0)); + try (BufferedInputStream bs = new BufferedInputStream(new FileInputStream(path))) { + int magic = readMagic(bs); + switch (magic) { + case R_MAGIC_EMPTY: + throw RError.error(this, RError.Message.MAGIC_EMPTY); + case R_MAGIC_TOONEW: + throw RError.error(this, RError.Message.MAGIC_TOONEW); + case R_MAGIC_CORRUPT: + throw RError.error(this, RError.Message.MAGIC_CORRUPT); + default: + + } + + } catch (IOException ex) { + throw RError.error(this, RError.Message.FILE_OPEN_ERROR); + } + throw RError.nyi(this, "load"); + } + + private static int readMagic(BufferedInputStream bs) throws IOException { + byte[] buf = new byte[5]; + int count = bs.read(buf, 0, 5); + if (count != 5) { + if (count == 0) { + return R_MAGIC_EMPTY; + } else { + return R_MAGIC_CORRUPT; + } + } + String magic = new String(buf); + switch (magic) { + case "RDA1\n": + return R_MAGIC_ASCII_V1; + case "RDB1\n": + return R_MAGIC_BINARY_V1; + case "RDX1\n": + return R_MAGIC_XDR_V1; + case "RDA2\n": + return R_MAGIC_ASCII_V2; + case "RDB2\n": + return R_MAGIC_BINARY_V2; + case "RDX2\n": + return R_MAGIC_XDR_V2; + default: + if (buf[0] == 'R' && buf[1] == 'D') { + return R_MAGIC_TOONEW; + } else { + return R_MAGIC_CORRUPT; + } + } + } + } } diff --git a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RCompression.java b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RCompression.java index 436641ff91..1edb9f7205 100644 --- a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RCompression.java +++ b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RCompression.java @@ -26,6 +26,8 @@ import java.io.IOException; import java.io.OutputStream; import java.io.InputStream; import java.lang.ProcessBuilder.Redirect; + +import com.oracle.truffle.r.runtime.conn.GZIPConnections.GZIPRConnection; import com.oracle.truffle.r.runtime.ffi.RFFIFactory; /** @@ -53,6 +55,19 @@ public class RCompression { } return null; } + + /** + * Decode the compression type from the bytes in buf (which must be at least length 5). + */ + public static Type decodeBuf(byte[] buf) { + if (buf[0] == 'B' && buf[1] == 'Z' && buf[2] == 'h') { + return RCompression.Type.BZIP2; + } else if (buf[0] == (byte) 0xFD && buf[1] == '7' && buf[2] == 'z' && buf[3] == 'X' && buf[4] == 'Z') { + return RCompression.Type.LZMA; + } else { + return RCompression.Type.NONE; + } + } } public static boolean uncompress(Type type, byte[] udata, byte[] cdata) { @@ -72,6 +87,31 @@ public class RCompression { } } + public static boolean compress(Type type, byte[] udata, byte[] cdata) { + switch (type) { + case NONE: + System.arraycopy(udata, 0, cdata, 0, udata.length); + return true; + case GZIP: + return gzipCompress(udata, cdata); + case BZIP2: + throw RInternalError.unimplemented("BZIP2 compression"); + case LZMA: + return lzmaCompress(udata, cdata); + default: + assert false; + return false; + } + + } + + private static boolean gzipCompress(byte[] udata, byte[] cdata) { + long[] cdatalen = new long[1]; + cdatalen[0] = cdata.length; + int rc = RFFIFactory.getRFFI().getBaseRFFI().compress(cdata, cdatalen, udata); + return rc == 0; + } + private static boolean gzipUncompress(byte[] udata, byte[] data) { long[] destlen = new long[1]; destlen[0] = udata.length; @@ -79,6 +119,30 @@ public class RCompression { return rc == 0; } + private static boolean lzmaCompress(byte[] udata, byte[] cdata) { + int rc; + ProcessBuilder pb = new ProcessBuilder("xz", "--compress", "--format=raw", "--lzma2", "--stdout"); + pb.redirectError(Redirect.INHERIT); + try { + Process p = pb.start(); + OutputStream os = p.getOutputStream(); + InputStream is = p.getInputStream(); + ProcessOutputThread readThread = new ProcessOutputThreadFixed(is, cdata); + readThread.start(); + os.write(udata); + os.close(); + rc = p.waitFor(); + if (rc == 0) { + readThread.join(); + return true; + } + } catch (InterruptedException | IOException ex) { + return false; + } + return rc == 0; + + } + private static boolean lzmaUncompress(byte[] udata, byte[] data) { int rc; ProcessBuilder pb = new ProcessBuilder("xz", "--decompress", "--format=raw", "--lzma2", "--stdout"); @@ -87,7 +151,7 @@ public class RCompression { Process p = pb.start(); OutputStream os = p.getOutputStream(); InputStream is = p.getInputStream(); - ProcessOutputThread readThread = new ProcessOutputThread(is, udata); + ProcessOutputThread readThread = new ProcessOutputThreadFixed(is, udata); readThread.start(); os.write(data); os.close(); @@ -99,33 +163,109 @@ public class RCompression { } } } catch (InterruptedException | IOException ex) { - rc = 127; + return false; } return rc == 0; } - private static final class ProcessOutputThread extends Thread { - private byte[] udata; - private InputStream is; - private int totalRead; + /** + * This is used by {@link GZIPRConnection}. + */ + public static byte[] lzmaUncompressFromFile(String path) { + return genericUncompressFromFile(new String[]{"xz", "--decompress", "--lzma2", "--stdout", path}); + } + + public static byte[] bzipUncompressFromFile(String path) { + return genericUncompressFromFile(new String[]{"bzip2", "-dc", path}); + } + + private static byte[] genericUncompressFromFile(String[] command) { + int rc; + ProcessBuilder pb = new ProcessBuilder(command); + pb.redirectError(Redirect.INHERIT); + try { + Process p = pb.start(); + InputStream is = p.getInputStream(); + ProcessOutputThreadVariable readThread = new ProcessOutputThreadVariable(is); + readThread.start(); + rc = p.waitFor(); + if (rc == 0) { + readThread.join(); + return readThread.getData(); + } + } catch (InterruptedException | IOException ex) { + // fall through + } + return null; + + } - private ProcessOutputThread(InputStream is, byte[] udata) { + private abstract static class ProcessOutputThread extends Thread { + protected final InputStream is; + protected int totalRead; + + ProcessOutputThread(InputStream is) { super("XZProcessOutputThread"); this.is = is; - this.udata = udata; + } + + } + + /** + * Reads until the expected length or EOF (which is an error). + */ + private static final class ProcessOutputThreadFixed extends ProcessOutputThread { + protected byte[] data; + + private ProcessOutputThreadFixed(InputStream is, byte[] data) { + super(is); + this.data = data; + } + + @Override + public void run() { + int n; + try { + while (totalRead < data.length && (n = is.read(data, totalRead, data.length - totalRead)) != -1) { + totalRead += n; + } + } catch (IOException ex) { + return; + } + } + } + + /** + * Reads a variable sized amount of data into a growing array. + * + */ + private static final class ProcessOutputThreadVariable extends ProcessOutputThread { + private byte[] data; + + private ProcessOutputThreadVariable(InputStream is) { + super(is); + this.data = new byte[8192]; } @Override public void run() { int n; try { - while (totalRead < udata.length && (n = is.read(udata, totalRead, udata.length - totalRead)) != -1) { + while ((n = is.read(data, totalRead, data.length - totalRead)) != -1) { totalRead += n; + if (totalRead == data.length) { + byte[] udataNew = new byte[data.length * 2]; + System.arraycopy(data, 0, udataNew, 0, data.length); + data = udataNew; + } } } catch (IOException ex) { return; } + } + private byte[] getData() { + return data; } } diff --git a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java index 94a2e6a723..8568d3de27 100644 --- a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java +++ b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/RError.java @@ -617,6 +617,7 @@ public final class RError extends RuntimeException { FILE_CANNOT_REMOVE(" cannot remove file '%s'"), FILE_CANNOT_RENAME(" cannot rename file '%s' to '%s'"), FILE_COPY_RECURSIVE_IGNORED("'recursive' will be ignored as 'to' is not a single existing directory"), + FILE_OPEN_ERROR("unable to open file"), DIR_CANNOT_CREATE("cannot create dir '%s'"), IMPOSSIBLE_SUBSTITUTE("substitute result cannot be represented"), PACKAGE_AVAILABLE("'%s' may not be available when loading"), @@ -630,7 +631,10 @@ public final class RError extends RuntimeException { REG_FINALIZER_FIRST("first argument must be environment or external pointer"), REG_FINALIZER_SECOND("second argument must be a function"), REG_FINALIZER_THIRD("third argument must be 'TRUE' or 'FALSE'"), - LAZY_LOAD_DB_CORRUPT("lazy-load database '%s' is corrupt"); + LAZY_LOAD_DB_CORRUPT("lazy-load database '%s' is corrupt"), + MAGIC_EMPTY("restore file may be empty -- no data loaded"), + MAGIC_TOONEW("restore file may be from a newer version of R -- no data loaded"), + MAGIC_CORRUPT("bad restore file magic number (file may be corrupted) -- no data loaded"); public final String message; final boolean hasArgs; diff --git a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/conn/GZIPConnections.java b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/conn/GZIPConnections.java index 2b3e8f1347..2000cc8321 100644 --- a/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/conn/GZIPConnections.java +++ b/com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/conn/GZIPConnections.java @@ -35,7 +35,9 @@ public class GZIPConnections { public static final int GZIP_BUFFER_SIZE = (2 << 20); /** - * Base class for all modes of gzfile connections. + * Base class for all modes of gzfile connections. N.B. gzfile is defined to be able to read + * gzip, bzip, lzma and uncompressed files, which has to be implemented by reading the first few + * bytes of the file and detecting the type of the file. */ public static class GZIPRConnection extends BasePathRConnection { public GZIPRConnection(String path, String modeString) throws IOException { @@ -49,10 +51,33 @@ public class GZIPConnections { switch (getOpenMode().abstractOpenMode) { case Read: case ReadBinary: - try { - delegate = new GZIPInputRConnection(this); - } catch (ZipException ex) { - delegate = new FileConnections.FileReadTextRConnection(this); + RCompression.Type cType = RCompression.Type.NONE; + try (InputStream is = new FileInputStream(path)) { + byte[] buf = new byte[5]; + int count = is.read(buf); + if (count == 5) { + cType = RCompression.Type.decodeBuf(buf); + } + } + switch (cType) { + case NONE: + delegate = new FileConnections.FileReadTextRConnection(this); + break; + case GZIP: + delegate = new GZIPInputRConnection(this); + break; + case LZMA: + /* + * no lzma support in Java. For now we use RCompression to a byte array + * and return a ByteArrayInputStream on that. + */ + byte[] lzmaUdata = RCompression.lzmaUncompressFromFile(path); + delegate = new ByteGZipInputRConnection(this, new ByteArrayInputStream(lzmaUdata)); + break; + case BZIP2: + // ditto + byte[] bzipUdata = RCompression.bzipUncompressFromFile(path); + delegate = new ByteGZipInputRConnection(this, new ByteArrayInputStream(bzipUdata)); } break; case Write: @@ -75,13 +100,18 @@ public class GZIPConnections { } private static class GZIPInputRConnection extends DelegateReadRConnection implements ReadWriteHelper { - private GZIPInputStream inputStream; + private InputStream inputStream; GZIPInputRConnection(GZIPRConnection base) throws IOException { super(base); inputStream = new GZIPInputStream(new FileInputStream(base.path), GZIP_BUFFER_SIZE); } + protected GZIPInputRConnection(GZIPRConnection base, InputStream is) { + super(base); + this.inputStream = is; + } + @Override public String readChar(int nchars, boolean useBytes) throws IOException { return readCharHelper(nchars, inputStream, useBytes); @@ -120,6 +150,12 @@ public class GZIPConnections { } + private static class ByteGZipInputRConnection extends GZIPInputRConnection { + ByteGZipInputRConnection(GZIPRConnection base, ByteArrayInputStream is) { + super(base, is); + } + } + private static class GZIPOutputRConnection extends DelegateWriteRConnection implements ReadWriteHelper { private GZIPOutputStream outputStream; diff --git a/com.oracle.truffle.r.test.cran/r/install.cran.packages.R b/com.oracle.truffle.r.test.cran/r/install.cran.packages.R index a7badc434a..deb6359c9b 100644 --- a/com.oracle.truffle.r.test.cran/r/install.cran.packages.R +++ b/com.oracle.truffle.r.test.cran/r/install.cran.packages.R @@ -75,7 +75,7 @@ create.blacklist.iter <- function(blacklist) { # known to be uninstallable # uses C++ cplusplus <- c("Rcpp", "Segmentor3IsBack", "QUIC", "kernlab", "adaptivetau", "geepack", "caTools", "amap", "rgenoud", "stringi", "rjson", "ars", - "e1071", "aylmer") + "e1071", "aylmer", "cpm") # tcltk tcltk <- c("AnnotLists", "tcltk2", "aplpack") # parser bugs @@ -88,13 +88,13 @@ trufflevisitor.nyi <- c("colorspace") # problems with native code nativeinstall <- c("Rglpk", "overlap", "adimpro", "deSolve") # S4 anything using S4 objects -s4 <- c("matrixStats", "AcceptanceSampling", "biglm", "analyz", "RCurl", "anfis", "aod", "ascii") +s4 <- c("matrixStats", "AcceptanceSampling", "biglm", "analyz", "RCurl", "anfis", "aod", "ascii", "childsds") # graphics graphics <- c("Cairo", "rgl") # incomplete definitions from Rmath.h math <- c("mvtnorm") # serialize -serialize <- c("actuar", "spam", "codetools", "iterators", "apc", "apsrtable", "assertthat") +serialize <- c("actuar", "spam", "codetools", "iterators", "apc", "apsrtable", "assertthat", "citbcmst", "cubfits") # fortran related fortran <- c("appell", "blockmodeling", "clues", "rootSolve", "cts", "bayesQR", "cvplogistic") initial.blacklist <- c(cplusplus, tcltk, parserbug, core, math, trufflevisitor.nyi, nativeinstall, s4, graphics, serialize, fortran) -- GitLab