package dima; import com.google.common.base.Charsets; import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.Job; import org.apache.pig.LoadFunc; import org.apache.pig.ResourceSchema; import org.apache.pig.backend.datastorage.ContainerDescriptor; import org.apache.pig.backend.datastorage.DataStorage; import org.apache.pig.backend.datastorage.ElementDescriptor; import org.apache.pig.backend.hadoop.datastorage.*; import org.apache.pig.builtin.PigStorage; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.apache.pig.impl.io.FileLocalizer; import org.codehaus.jackson.map.util.LRUMap; import java.io.IOException; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.HashSet; import java.util.Scanner; import java.util.Set; /** * Date: 27.03.13 * Time: 15:34 * * @author Johannes Kirschnick * @author Alan Akbik */ public class MatrixMarketStorageWithCounts extends PigStorage { private static final Log log = LogFactory.getLog(MatrixMarketStorageWithCounts.class); public static final String MATRIX_MARKET_MATRIX_HEADER = "%%MatrixMarket matrix coordinate real symmetric"; TupleFactory tupleFactory = TupleFactory.getInstance(); // Date in ISO 8601 Format DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mmZ"); Calendar cal = Calendar.getInstance(); /** * Indicates that we have already written out the header. */ private boolean storingFirstRecord = true; public MatrixMarketStorageWithCounts() { super(" "); } @Override public void putNext(Tuple f) throws IOException { if (storingFirstRecord) { // write out header information super.putNext(tupleFactory.newTuple(MATRIX_MARKET_MATRIX_HEADER)); super.putNext(tupleFactory.newTuple("% MatrixMarket writer, see http://math.nist.gov/MatrixMarket/formats.html")); super.putNext(tupleFactory.newTuple("% Generated on " + df.format(cal.getTime()))); super.putNext(tupleFactory.newTuple("% This ASCII file represents a sparse MxN matrix with L nonzeros")); super.putNext(tupleFactory.newTuple("% M N L | <--- rows, columns, entries")); // we assume it's a sparse matrix // This ASCII file represents a sparse MxN matrix with L nonzeros // we assume the following schema // row:int, column:int, distance:double, MxN, global super.putNext(tupleFactory.newTuple(Joiner.on(" ").skipNulls().join(f.get(4), f.get(4), f.get(5)))); storingFirstRecord = false; } super.putNext(tupleFactory.newTuple(Joiner.on(" ").skipNulls().join(f.get(0), f.get(1), f.get(2)))); } // cached job object private Job job; @Override public void setStoreLocation(String location, Job job) throws IOException { super.setStoreLocation(location, job); this.job = job; } @Override public void checkSchema(ResourceSchema schema) throws IOException { // check that we look like this Preconditions.checkNotNull(schema, "Schema is null"); ResourceSchema.ResourceFieldSchema[] fields = schema.getFields(); Preconditions.checkNotNull(fields, "Schema fields are undefined"); Preconditions.checkArgument(6 == fields.length, "Expecting 6 schema fields but found %s, of type row:int, column:int, distance:double, MxN, global", fields.length); checkStoreKeySchema(fields[0], "row"); checkStoreKeySchema(fields[1], "column"); assertFieldTypeEquals(DataType.DOUBLE, fields[2].getType(), "distance"); super.checkSchema(schema); } private void checkStoreKeySchema(ResourceSchema.ResourceFieldSchema schema, String fieldName) throws IOException { switch (schema.getType()) { case DataType.CHARARRAY: case DataType.INTEGER: case DataType.LONG: case DataType.FLOAT: case DataType.DOUBLE: return; } throw new IOException(String.format("Expected %s of type '%s' but found type '%s'", fieldName, "Number", DataType.findTypeName(schema.getType()))); } private static void assertFieldTypeEquals(byte expected, byte observed, String fieldName) throws IOException { if (expected != observed) { throw new IOException(String.format("Expected %s of type '%s' but found type '%s'", fieldName, DataType.findTypeName(expected), DataType.findTypeName(observed))); } } private transient LRUMap<ElementDescriptor, Boolean> lookupCache = new LRUMap<ElementDescriptor, Boolean>(100, 1000); private boolean exists(ElementDescriptor e) throws IOException { if (lookupCache.containsKey(e)) { return lookupCache.get(e); } else { boolean res = e.exists(); lookupCache.put(e, res); return res; } } }