package edu.washington.escience.myria.operator; import java.io.BufferedReader; import java.io.DataInput; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URI; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Scanner; import java.util.regex.Pattern; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import edu.washington.escience.myria.DbException; import edu.washington.escience.myria.Schema; import edu.washington.escience.myria.Type; import edu.washington.escience.myria.storage.TupleBatch; import edu.washington.escience.myria.storage.TupleBatchBuffer; import edu.washington.escience.myria.storage.TupleUtils; /** * Parse NChilada file formats. See <a * href="http://librarian.phys.washington.edu/astro/index.php/Research:NChilada_File_Format">NChilada wiki</a> * * @author leelee * */ public class NChiladaFileScan extends LeafOperator { /** Required for Java serialization. */ private static final long serialVersionUID = 1L; /** IOrder attribute that exists in all three types of particles. */ private static final String IORD = "iord"; /** Den attribute that exists in all three types of particles. */ private static final String DEN = "den"; /** Mass attribute that exists in all three types of particles. */ private static final String MASS = "mass"; /** Position x attribute that exists in all three types of particles. */ private static final String POS_X = "x"; /** Position y attribute that exists in all three types of particles. */ private static final String POS_Y = "y"; /** Position z attribute that exists in all three types of particles. */ private static final String POS_Z = "z"; /** Position attribute file name. */ private static final String POS_FILE_NAME = "pos"; /** Pot attribute that exists in all three types of particles. */ private static final String POT = "pot"; /** Smoothlength attribute that exists in all three types of particles. */ private static final String SMOOTHLENGTH = "smoothlength"; /** Soft attribute that exists in all three types of particles. */ private static final String SOFT = "soft"; /** Velocity x attribute that exists in all three types of particles. */ private static final String VEL_X = "vx"; /** Velocity y attribute that exists in all three types of particles. */ private static final String VEL_Y = "vy"; /** Velocity z attribute that exists in all three types of particles. */ private static final String VEL_Z = "vz"; /** Position attribute file name. */ private static final String VEL_FILE_NAME = "vel"; /** Gas iOrder attribute that only exists in star particles. */ private static final String IGASORD = "igasord"; /** Massform attribute that only exists in star particles. */ private static final String MASSFORM = "massform"; /** Tform attribute that only exists in star particles. */ private static final String TFORM = "tform"; /** ESNRate attribute that only exists in gas and star particles. */ private static final String ESN_RATE = "ESNRate"; /** FeMassFrac attribute that only exists in gas and star particles. */ private static final String FE_MASS_FRAC = "FeMassFrac"; /** OxMassFrac attribute that only exists in gas and star particles. */ private static final String OX_MASS_FRAC = "OxMassFrac"; /** Metals attribute that only exists in gas and star particles. */ private static final String METALS = "metals"; /** FeMassFracDot attribute that only exists in gas particles. */ private static final String FE_MASS_FRACDOT = "FeMassFracDot"; /** GasDensity attribute that only exists in gas particles. */ private static final String GAS_DENSITY = "GasDensity"; /** HI attribute that only exists in gas particles. */ private static final String H_I = "HI"; /** HeI attribute that only exists in gas particles. */ private static final String HE_I = "HeI"; /** HeII attribute that only exists in gas particles. */ private static final String HE_I_I = "HeII"; /** Metalsdot attribute that only exists in gas particles. */ private static final String METALSDOT = "Metalsdot"; /** OxMassFracdot attribute that only exists in gas particles. */ private static final String OX_MASS_FRACDOT = "OxMassFracdot"; /** Coolontime attribute that only exists in gas particles. */ private static final String COOLONTIME = "coolontime"; /** Temperature attribute that only exists in gas particles. */ private static final String TEMPERATURE = "temperature"; /** The star directory name. */ private static final String STAR_DIR = "/star"; /** The dark directory name. */ private static final String DARK_DIR = "/dark"; /** The gas directory name. */ private static final String GAS_DIR = "/gas"; /** The column types for NChilada schema. */ private static final List<Type> NCHILADA_COLUMN_TYPES = ImmutableList.of( Type.INT_TYPE, Type.INT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.FLOAT_TYPE, Type.INT_TYPE, Type.STRING_TYPE); /** The column names for NChilada schema. */ private static final List<String> NCHILADA_COLUMN_NAMES = ImmutableList.of( IORD, IGASORD, ESN_RATE, FE_MASS_FRAC, FE_MASS_FRACDOT, GAS_DENSITY, H_I, HE_I, HE_I_I, METALSDOT, OX_MASS_FRAC, OX_MASS_FRACDOT, COOLONTIME, DEN, MASS, METALS, POS_X, POS_Y, POS_Z, POT, SMOOTHLENGTH, SOFT, VEL_X, VEL_Y, VEL_Z, MASSFORM, TFORM, TEMPERATURE, "grp", "type"); /** Schema for all NChilada files. */ private static final Schema NCHILADA_SCHEMA = new Schema(NCHILADA_COLUMN_TYPES, NCHILADA_COLUMN_NAMES); /** The magic number that indicates the file format is NChilada. */ private static final int NCHILADA_FORMAT = 1062053; /** The number of dimension that vel and pos in NChilada file format should have. */ private static final int VEL_POS_DIM = 3; /** The code that indicates the data type of the file is float. */ private static final int FLOAT_CODE = 9; /** The code that indicates the data type of the file is int. */ private static final int INT_CODE = 5; /** Holds the tuples that are ready for release. */ private transient TupleBatchBuffer buffer; /** The full path of the directory that contains star, dark and gas directories. */ private String particleDirectoryPath; /** The full path of the file that contains groupNumber in the order of gas, dark, star. */ private String groupFilePath; /** The group input stream. */ private InputStream groupInputStream; /** Contains matching from file name to DataInput object for star particles attributes. */ private Map<String, DataInput> starAttributeFilesToDataInput; /** Contains matching from file name to DataInput object for gas particles attributes. */ private Map<String, DataInput> gasAttributeFilesToDataInput; /** Contains matching from file name to DataInput object for dark particles attributes. */ private Map<String, DataInput> darkAttributeFilesToDataInput; /** The number of star particle records. */ private int numStar; /** The number of gas particle records. */ private int numGas; /** The number of dark particle records. */ private int numDark; /** Scanner used to parse the group number file. */ private transient Scanner groupScanner; /** Which line of the file the scanner is currently on. */ private int lineNumber; /** * Represents different types of particle. */ private enum ParticleType { /** There are three types of particles. */ DARK, GAS, STAR } /** * Construct a new NChiladaFileScanObject. * * @param particleDirectoryPath The full path of the directory that contains gas, star, dark directories. * @param groupFilePath The full path of the file that contains groupNumber in the order of gas, dark, star. */ public NChiladaFileScan(final String particleDirectoryPath, final String groupFilePath) { Objects.requireNonNull(particleDirectoryPath); Objects.requireNonNull(groupFilePath); this.particleDirectoryPath = particleDirectoryPath; this.groupFilePath = groupFilePath; } /** * Construct a new NChiladaFileScanObject. This constructor is only meant to be called from test. * * @param groupInputStream The InputStream object for group. * @param gasAttributeFilesToDataInput A mapping from gas attribute file names to their respective DataInput object. * @param starAttributeFilesToDataInput A mapping from star attribute file names to their respective DataInput object. * @param darkAttributeFilesToDataInput A mapping from dark attribute file names to their respective DataInput object. */ @SuppressWarnings("unused") // used via reflection in the tests private NChiladaFileScan( final InputStream groupInputStream, final Map<String, DataInput> gasAttributeFilesToDataInput, final Map<String, DataInput> starAttributeFilesToDataInput, final Map<String, DataInput> darkAttributeFilesToDataInput) { Objects.requireNonNull(groupInputStream); Objects.requireNonNull(gasAttributeFilesToDataInput); Objects.requireNonNull(starAttributeFilesToDataInput); Objects.requireNonNull(darkAttributeFilesToDataInput); this.darkAttributeFilesToDataInput = darkAttributeFilesToDataInput; this.gasAttributeFilesToDataInput = gasAttributeFilesToDataInput; this.starAttributeFilesToDataInput = starAttributeFilesToDataInput; this.groupInputStream = groupInputStream; } @Override protected TupleBatch fetchNextReady() throws Exception { processFile(ParticleType.GAS); processFile(ParticleType.DARK); processFile(ParticleType.STAR); return buffer.popAny(); } /** * Create InputStream object for the given group file path. * * @param groupFilePath The file path to create InputStream object. * @return the group InputStream object. * @throws DbException The DbException. */ private InputStream getGroupFileStream(final String groupFilePath) throws DbException { InputStream groupInputStreamLocal; try { Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(URI.create(groupFilePath), conf); Path rootPath = new Path(groupFilePath); groupInputStreamLocal = fs.open(rootPath); } catch (IOException e) { throw new DbException(e); } return groupInputStreamLocal; } /** * Create DataInput object for every files in the directory. * * @param path The directory path. * @return a mapping from filename to DataInput object. * @throws DbException The DbException. */ private Map<String, DataInput> getFilesToDataInput(final String path) throws DbException { Configuration conf = new Configuration(); FileSystem fs; Map<String, DataInput> map = new HashMap<>(); try { fs = FileSystem.get(URI.create(path), conf); Path rootPath = new Path(path + File.separator); FileStatus[] statii = fs.listStatus(rootPath); if (statii == null || statii.length == 0) { throw new FileNotFoundException(path); } for (FileStatus status : statii) { Path p = status.getPath(); String[] pNameTokens = p.getName().split(Pattern.quote(File.separator)); String fileName = pNameTokens[pNameTokens.length - 1]; DataInput dataInputStream = fs.open(p); map.put(fileName, dataInputStream); } } catch (IOException e) { throw new DbException(e); } return map; } @Override protected final void init(final ImmutableMap<String, Object> execEnvVars) throws DbException { numDark = -1; numGas = -1; numStar = -1; if (darkAttributeFilesToDataInput == null) { darkAttributeFilesToDataInput = getFilesToDataInput(particleDirectoryPath + DARK_DIR); } if (gasAttributeFilesToDataInput == null) { gasAttributeFilesToDataInput = getFilesToDataInput(particleDirectoryPath + GAS_DIR); } if (starAttributeFilesToDataInput == null) { starAttributeFilesToDataInput = getFilesToDataInput(particleDirectoryPath + STAR_DIR); } Preconditions.checkArgument( darkAttributeFilesToDataInput != null, "darkAttributeFilesToDataInput has not been set"); Preconditions.checkArgument( starAttributeFilesToDataInput != null, "starAttributeFilesToDataInput has not been set"); Preconditions.checkArgument( gasAttributeFilesToDataInput != null, "gasAttributeFilesToDataInput has not been set"); buffer = new TupleBatchBuffer(getSchema()); initBasedOnParticleType(ParticleType.GAS); initBasedOnParticleType(ParticleType.DARK); initBasedOnParticleType(ParticleType.STAR); if (groupInputStream == null) { groupInputStream = getGroupFileStream(groupFilePath); } Preconditions.checkArgument( groupInputStream != null, "FileScan group input stream has not been set."); groupScanner = new Scanner(new BufferedReader(new InputStreamReader(groupInputStream))); int numGroup = groupScanner.nextInt(); int numTot = numGas + numDark + numStar; if (numGroup != numTot) { throw new DbException( "Number of group is different from the number of particles. numGroup: " + numGroup + " num particles: " + numTot); } lineNumber = 0; } /** * Initialize fileNamesToDataInput and number of particles based on the given type of particles. * * @param pType The type of the particles. * @throws DbException The DbException. */ private void initBasedOnParticleType(final ParticleType pType) throws DbException { int numRows; Map<String, DataInput> fileNameToDataInput; switch (pType) { case GAS: numRows = numGas; fileNameToDataInput = gasAttributeFilesToDataInput; break; case DARK: numRows = numDark; fileNameToDataInput = darkAttributeFilesToDataInput; break; case STAR: numRows = numStar; fileNameToDataInput = starAttributeFilesToDataInput; break; default: throw new DbException("Invalide pType: " + pType); } try { for (String fileName : fileNameToDataInput.keySet()) { DataInput dataInputStream = fileNameToDataInput.get(fileName); // Read header of the file. (magic, time, iHighWord, nbodies, ndim, code) Preconditions.checkArgument( dataInputStream.readInt() == NCHILADA_FORMAT, fileName + " is not in NChilada format."); // Read and verify magic. // Time. dataInputStream.readDouble(); // IHighWord. dataInputStream.readInt(); // Nbodies. int nbodies = dataInputStream.readInt(); // Ndim; int ndim = dataInputStream.readInt(); if (fileName.equals(POS_FILE_NAME) || fileName.equals(VEL_FILE_NAME)) { Preconditions.checkArgument( ndim == VEL_POS_DIM, fileName + "should have " + VEL_POS_DIM + " instead of " + ndim + "."); } if (numRows == -1) { numRows = nbodies; // Update number of particles according to the type. switch (pType) { case DARK: numDark = numRows; break; case GAS: numGas = numRows; break; case STAR: numStar = numRows; break; default: throw new DbException("Invalide pType: " + pType); } } else { Preconditions.checkArgument( numRows == nbodies, "The files do not have the same number of rows. numRows: " + numRows + " nbodies: " + nbodies + " fileName: " + fileName); } // Code. int code = dataInputStream.readInt(); Preconditions.checkArgument( code == FLOAT_CODE || code == INT_CODE, "This code format: " + code + " is not being expected."); // After the header, there is the maximum and minimum value in the file, both in the same data type as the rest // of the file. if (code == FLOAT_CODE) { // Max value. dataInputStream.readFloat(); // Min value. dataInputStream.readFloat(); } else { // Max value. dataInputStream.readInt(); // Min value. dataInputStream.readInt(); } } } catch (FileNotFoundException e) { throw new DbException(e); } catch (IOException e) { throw new DbException(e); } } /** * Constructs tuples of particles. Attributes of star particle: den, pos, pot, vel, iord, mass, OxMassFrac, soft, * smoothlength, tform, ESNRate, massform, metals, igasord, FeMassFrac. Attributes of gas particle: HI, HeI, den, pos, * pot , vel, HeII, iord, mass, OxMassFrac, soft, OxMassFracdot, gas smoothlength, FeMassFracdot, ESNRate, Metalsdot, * GasDensity, metals, temperature, FeMassFrac, coolontime. Attributes of dark particle: den, pos, pot, vel, iord, * mass, soft, smoothlength. * * @param pType The particle type. * @throws DbException The DbException. */ private void processFile(final ParticleType pType) throws DbException { int numRows; Map<String, DataInput> fileNameToDataInput; switch (pType) { case DARK: numRows = numDark; fileNameToDataInput = darkAttributeFilesToDataInput; break; case GAS: numRows = numGas; fileNameToDataInput = gasAttributeFilesToDataInput; break; case STAR: numRows = numStar; fileNameToDataInput = starAttributeFilesToDataInput; break; default: throw new DbException("Invalide pType: " + pType); } // TODO(leelee): Put 0 for now to replace null values. while (numRows > 0 && buffer.numTuples() < buffer.getBatchSize()) { lineNumber++; int column = 0; // -2 to exclude grp, and type. for (int i = 0; i < NCHILADA_COLUMN_NAMES.size() - 2; i++) { String columnNames = NCHILADA_COLUMN_NAMES.get(i); DataInput dataInputStream = fileNameToDataInput.get(columnNames); Type type = NCHILADA_COLUMN_TYPES.get(i); try { if (type.equals(Type.FLOAT_TYPE)) { if (columnNames.equals(POS_X) || columnNames.equals(POS_Y) || columnNames.equals(POS_Z)) { dataInputStream = fileNameToDataInput.get(POS_FILE_NAME); Preconditions.checkArgument( dataInputStream != null, "Cannot find dataInputStream for " + POS_FILE_NAME); } else if (columnNames.equals(VEL_X) || columnNames.equals(VEL_Y) || columnNames.equals(VEL_Z)) { dataInputStream = fileNameToDataInput.get(VEL_FILE_NAME); Preconditions.checkArgument( dataInputStream != null, "Cannot find dataInputStream for " + VEL_FILE_NAME); } if (dataInputStream != null) { buffer.putFloat(column++, dataInputStream.readFloat()); } else { buffer.putFloat(column++, 0); } } else { if (dataInputStream != null) { buffer.putInt(column++, dataInputStream.readInt()); } else { buffer.putInt(column++, 0); } } } catch (IOException e) { throw new DbException(e); } } buffer.putInt(column++, groupScanner.nextInt()); buffer.putString(column++, pType.toString().toLowerCase()); final String groupRest = groupScanner.nextLine().trim(); if (groupRest.length() > 0) { throw new DbException( "groupFile: Unexpected output at the end of line " + lineNumber + ": " + groupRest); } numRows--; } // Update number of particles according to the type. switch (pType) { case DARK: numDark = numRows; break; case GAS: numGas = numRows; break; case STAR: numStar = numRows; break; default: throw new DbException("Invalide pType: " + pType); } } @Override protected Schema generateSchema() { return NCHILADA_SCHEMA; } @Override protected final void cleanup() throws DbException { groupScanner = null; while (buffer.numTuples() > 0) { buffer.popAny(); } } }