package edu.washington.escience.myria.operator;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Objects;
import java.util.Scanner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;

import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.storage.TupleUtils;

/**
 * Read and merge Tipsy bin file, iOrder ascii file and group number ascii file.
 *
 * <p>The binary file holds one fixed-size record per particle (gas, then dark, then star); the two
 * ascii files hold one value per particle, in the same order, preceded by a count on the first
 * line. Each output tuple is the join of the three sources for one particle.
 *
 * @author leelee
 *
 */
public class TipsyFileScan extends LeafOperator {

  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;
  /** The header size in bytes. */
  private static final int H_SIZE = 32;
  /** The gas record size in bytes. */
  private static final int G_SIZE = 48;
  /** The dark record size in bytes. */
  private static final int D_SIZE = 36;
  /** The star record size in bytes. */
  private static final int S_SIZE = 44;
  /** The data input for the bin file; DataInputStream reads big-endian, matching the file format. */
  private transient DataInputStream dataInputForBin;
  /** Scanner used to parse the iOrder file. */
  private transient Scanner iOrderScanner = null;
  /** Scanner used to parse the group number file. */
  private transient Scanner grpScanner = null;
  /** Holds the tuples that are ready for release. */
  private transient TupleBatchBuffer buffer;
  /** The bin file name. */
  private final String binFileName;
  /** The iOrder file name. */
  private final String iOrderFileName;
  /** The group number file name. */
  private final String grpFileName;
  /** The number of gas particle records not yet emitted. */
  private long ngas;
  /** The number of star particle records not yet emitted. */
  private long nstar;
  /** The number of dark particle records not yet emitted. */
  private long ndark;
  /** Which line of the ascii files the scanners are currently on (for error messages). */
  private int lineNumber;

  /** Schema for all Tipsy files. */
  private static final Schema TIPSY_SCHEMA =
      new Schema(
          ImmutableList.of(
              Type.LONG_TYPE, // iOrder
              Type.FLOAT_TYPE, // mass
              Type.FLOAT_TYPE, // x
              Type.FLOAT_TYPE, // y
              Type.FLOAT_TYPE, // z
              Type.FLOAT_TYPE, // vx
              Type.FLOAT_TYPE, // vy
              Type.FLOAT_TYPE, // vz
              Type.FLOAT_TYPE, // rho
              Type.FLOAT_TYPE, // temp
              Type.FLOAT_TYPE, // hsmooth
              Type.FLOAT_TYPE, // metals
              Type.FLOAT_TYPE, // tform
              Type.FLOAT_TYPE, // eps
              Type.FLOAT_TYPE, // phi
              Type.INT_TYPE, // grp
              Type.STRING_TYPE // type
              ),
          ImmutableList.of(
              "iOrder", "mass", "x", "y", "z", "vx", "vy", "vz", "rho", "temp", "hsmooth", "metals",
              "tform", "eps", "phi", "grp", "type"));

  /**
   * Construct a new TipsyFileScan object using the given binary filename, iOrder filename and group
   * number filename. By default TipsyFileScan will read the given binary file in big endian format.
   *
   * @param binFileName The binary file that contains the data for gas, dark, star particles.
   * @param iOrderFileName The ascii file that contains the data for iOrder.
   * @param grpFileName The ascii file that contains the data for group number.
   */
  public TipsyFileScan(
      final String binFileName, final String iOrderFileName, final String grpFileName) {
    Objects.requireNonNull(binFileName);
    Objects.requireNonNull(iOrderFileName);
    Objects.requireNonNull(grpFileName);
    this.binFileName = binFileName;
    this.iOrderFileName = iOrderFileName;
    this.grpFileName = grpFileName;
  }

  /**
   * Fill the buffer with up to one batch of tuples, consuming gas, then dark, then star records,
   * and release whatever is ready.
   *
   * @return the next batch of tuples, or null when nothing is ready.
   * @throws DbException if error reading from file.
   */
  @Override
  protected final TupleBatch fetchNextReady() throws DbException {
    processGasRecords();
    processDarkRecords();
    processStarRecords();
    return buffer.popAny();
  }

  @Override
  protected final void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
    buffer = new TupleBatchBuffer(getSchema());
    InputStream iOrderInputStream = openFileOrUrlInputStream(iOrderFileName);
    InputStream grpInputStream = openFileOrUrlInputStream(grpFileName);
    int ntot;
    try {
      // Create a fileInputStream for the bin file and parse the fixed 32-byte header.
      InputStream fStreamForBin = openFileOrUrlInputStream(binFileName);
      BufferedInputStream bufferedStreamForBin = new BufferedInputStream(fStreamForBin);
      dataInputForBin = new DataInputStream(bufferedStreamForBin);
      dataInputForBin.readDouble(); // time
      ntot = dataInputForBin.readInt();
      dataInputForBin.readInt(); // ndim, unused
      ngas = dataInputForBin.readInt();
      ndark = dataInputForBin.readInt();
      nstar = dataInputForBin.readInt();
      dataInputForBin.readInt(); // pad, unused
      // Sanity-check the header against the particle counts and the physical file size.
      long proposed = H_SIZE + ngas * G_SIZE + ndark * D_SIZE + nstar * S_SIZE;
      if (ntot != ngas + ndark + nstar) {
        throw new DbException("header info incorrect");
      }
      if (fStreamForBin instanceof FileInputStream
          && proposed != ((FileInputStream) fStreamForBin).getChannel().size()) {
        throw new DbException("binary file size incorrect");
      }
    } catch (IOException e) {
      // Don't leak the two ascii streams when the binary header cannot be read.
      closeQuietly(iOrderInputStream);
      closeQuietly(grpInputStream);
      throw new DbException(e);
    }
    Preconditions.checkArgument(
        iOrderInputStream != null, "FileScan iOrder input stream has not been set!");
    Preconditions.checkArgument(
        grpInputStream != null, "FileScan group input stream has not been set!");
    Preconditions.checkArgument(
        dataInputForBin != null, "FileScan binary input stream has not been set!");
    iOrderScanner = new Scanner(new BufferedReader(new InputStreamReader(iOrderInputStream)));
    grpScanner = new Scanner(new BufferedReader(new InputStreamReader(grpInputStream)));
    // Both ascii files start with the total particle count; it must match the binary header.
    int numIOrder = iOrderScanner.nextInt();
    int numGrp = grpScanner.nextInt();
    if (numIOrder != ntot) {
      throw new DbException(
          "number of iOrder "
              + numIOrder
              + " is different from the number of tipsy record "
              + ntot
              + ".");
    }
    if (numGrp != ntot) {
      throw new DbException("number of group is different from the number of tipsy record.");
    }
    lineNumber = 0;
  }

  @Override
  protected final void cleanup() throws DbException {
    // Close (not merely drop) all inputs so file handles are released; closing a Scanner closes
    // its underlying stream.
    if (iOrderScanner != null) {
      iOrderScanner.close();
      iOrderScanner = null;
    }
    if (grpScanner != null) {
      grpScanner.close();
      grpScanner = null;
    }
    if (dataInputForBin != null) {
      closeQuietly(dataInputForBin);
      dataInputForBin = null;
    }
    if (buffer != null) {
      while (buffer.numTuples() > 0) {
        buffer.popAny();
      }
    }
  }

  /**
   * Construct tuples for gas particle records. The expected gas particles schema in the bin file is
   * mass, x, y, z, vx, vy, vz, rho, temp, hsmooth, metals, phi. Merge the record in the binary file
   * with iOrder and group number and fill in the each tuple column accordingly.
   *
   * @throws DbException if error reading from file.
   */
  private void processGasRecords() throws DbException {
    // 11 floats (mass..metals), absent tform/eps, then phi.
    ngas = processRecords(ngas, 11, 2, 1, "gas");
  }

  /**
   * Construct tuples for dark particle records. The expected dark particles schema in the bin file
   * is mass, x, y, z, vx, vy, vz, eps, phi. Merge the record in the binary file with iOrder and
   * group number and fill in the each tuple column accordingly.
   *
   * @throws DbException if error reading from file.
   */
  private void processDarkRecords() throws DbException {
    // 7 floats (mass..vz), absent rho/temp/hsmooth/metals/tform, then eps, phi.
    ndark = processRecords(ndark, 7, 5, 2, "dark");
  }

  /**
   * Construct tuples for star particle records. The expected star particles schema in the bin file
   * is mass, x, y, z, vx, vy, vz, metals, tform, eps, phi. Merge the record in the binary file with
   * iOrder and group number and fill in the each tuple column accordingly.
   *
   * @throws DbException if error reading from file.
   */
  private void processStarRecords() throws DbException {
    // 7 floats (mass..vz), absent rho/temp/hsmooth, then metals, tform, eps, phi.
    nstar = processRecords(nstar, 7, 3, 4, "star");
  }

  /**
   * Shared record loop: merge up to {@code remaining} binary records (stopping when the buffer
   * holds a full batch) with one iOrder value and one group number each. The float columns are
   * filled left-to-right: {@code numBefore} values read from the binary file, {@code numAbsent}
   * zero placeholders for columns this particle type lacks, then {@code numAfter} more values read
   * from the binary file.
   *
   * @param remaining how many records of this particle type are left to emit.
   * @param numBefore number of float columns read from the bin file before the placeholders.
   * @param numAbsent number of float columns absent for this type (0 stands in for null because
   *     TupleBatchBuffer does not support null values).
   * @param numAfter number of float columns read from the bin file after the placeholders.
   * @param type the particle type string stored in the last column.
   * @return the updated remaining count.
   * @throws DbException if error reading from file, or on trailing tokens in the ascii files.
   */
  private long processRecords(
      long remaining,
      final int numBefore,
      final int numAbsent,
      final int numAfter,
      final String type)
      throws DbException {
    while (remaining > 0 && (buffer.numTuples() < buffer.getBatchSize())) {
      lineNumber++;
      try {
        int count = 0;
        buffer.putLong(count++, iOrderScanner.nextLong());
        for (int i = 0; i < numBefore; i++) {
          buffer.putFloat(count++, dataInputForBin.readFloat());
        }
        for (int i = 0; i < numAbsent; i++) {
          buffer.putFloat(count++, 0);
        }
        for (int i = 0; i < numAfter; i++) {
          buffer.putFloat(count++, dataInputForBin.readFloat());
        }
        buffer.putInt(count++, grpScanner.nextInt());
        buffer.putString(count++, type);
      } catch (final IOException e) {
        throw new DbException(e);
      }
      checkEndOfLine(iOrderScanner, "iOrderFile");
      checkEndOfLine(grpScanner, "grpFile");
      remaining--;
    }
    return remaining;
  }

  /**
   * Advance the scanner past the end of the current line, failing if any unexpected trailing
   * tokens are present.
   *
   * @param scanner the scanner to advance.
   * @param fileLabel label used in the error message ("iOrderFile" or "grpFile").
   * @throws DbException if the rest of the line is non-blank.
   */
  private void checkEndOfLine(final Scanner scanner, final String fileLabel) throws DbException {
    final String rest = scanner.nextLine().trim();
    if (rest.length() > 0) {
      throw new DbException(
          fileLabel + ": Unexpected output at the end of line " + lineNumber + ": " + rest);
    }
  }

  /**
   * Best-effort close, suppressing any IOException (used on cleanup/error paths).
   *
   * @param closeable the stream to close; may be null.
   */
  private static void closeQuietly(final java.io.Closeable closeable) {
    if (closeable != null) {
      try {
        closeable.close();
      } catch (IOException ignored) {
        // Nothing useful to do on a failed close during cleanup.
      }
    }
  }

  @Override
  protected Schema generateSchema() {
    return TIPSY_SCHEMA;
  }

  /**
   * Open an input stream for a plain file path, an hdfs:// URI, or any other URL.
   *
   * @param filenameOrUrl a local path or a URI/URL string.
   * @return an open input stream; the caller is responsible for closing it.
   * @throws DbException if the source cannot be opened.
   */
  private static InputStream openFileOrUrlInputStream(final String filenameOrUrl)
      throws DbException {
    try {
      URI uri = new URI(filenameOrUrl);
      if (uri.getScheme() == null) {
        return openFileInputStream(filenameOrUrl);
      } else if (uri.getScheme().equals("hdfs")) {
        return openHdfsInputStream(uri);
      } else {
        return uri.toURL().openStream();
      }
    } catch (IllegalArgumentException e) {
      // Not a valid URI/URL: fall back to treating the string as a local file path.
      return openFileInputStream(filenameOrUrl);
    } catch (URISyntaxException e) {
      return openFileInputStream(filenameOrUrl);
    } catch (MalformedURLException e) {
      return openFileInputStream(filenameOrUrl);
    } catch (IOException e) {
      throw new DbException(e);
    }
  }

  /**
   * Open a local file for reading.
   *
   * @param filename the local file path.
   * @return an open input stream.
   * @throws DbException if the file does not exist.
   */
  private static InputStream openFileInputStream(final String filename) throws DbException {
    try {
      return new FileInputStream(filename);
    } catch (FileNotFoundException e) {
      throw new DbException(e);
    }
  }

  /**
   * Open a file on HDFS for reading.
   *
   * @param uri the hdfs:// URI.
   * @return an open input stream.
   * @throws DbException if the HDFS file cannot be opened.
   */
  private static InputStream openHdfsInputStream(final URI uri) throws DbException {
    try {
      FileSystem fs = FileSystem.get(uri, new Configuration());
      Path path = new Path(uri);
      return fs.open(path);
    } catch (IOException e) {
      throw new DbException(e);
    }
  }
}