package ivory.bloomir.data;

import java.io.EOFException;
import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;

import ivory.bloomir.util.DocumentUtility;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.index.Posting;
import ivory.core.data.index.PostingsList;
import ivory.core.data.index.PostingsReader;
import ivory.core.data.stat.SpamPercentileScore;

public class SignatureIO {
  private static final Logger LOGGER = Logger.getLogger(SignatureIO.class);

  /**
   * Loads an entire collection of signatures into an array.
   *
   * @param path Path to the root of the postings list
   * @param fs File system
   * @param signatures Array of {@link Signature} to be initialized
   */
  public static void loadSignatures(String path, FileSystem fs, Signature[] signatures)
    throws IOException {
    FSDataInputStream input = fs.open(new Path(path + "/" + BloomConfig.CONFIG_FILE));
    BloomConfig bloomConfig = BloomConfig.readInstance(input);
    input.close();

    FileStatus[] stat = fs.listStatus(new Path(path));
    for(int f = 0; f < stat.length; f++) {
      String name = stat[f].getPath().toString();
      name = name.substring(name.lastIndexOf('/') + 1);
      if(name.equals(BloomConfig.CONFIG_FILE)) {
        continue;
      }

      LOGGER.info("reading block: " + name);
      input = fs.open(stat[f].getPath());

      //Read (termId, df, signature) records until the end of the block file.
      while(true) {
        try {
          int id = input.readInt();
          int df = input.readInt();

          if(df <= bloomConfig.getIdentityHashThreshold()) {
            signatures[id] = BloomFilterHash.readInstance(input);
          } else {
            signatures[id] = BloomFilterIdentityHash.readInstance(input);
          }
        } catch(EOFException ex) {
          break;
        }
      }
      input.close();
    }
  }

  /**
   * Creates Signatures and writes them to disk.
   *
   * @param outputPath Root path to store the output in
   * @param fs File system
   * @param env Retrieval environment
   * @param spamScoresPath Path to spam/quality scores
   * @param bitsPerElement Number of bits dedicated to one element
   * @param nbHash Number of hash functions
   */
  public static void writeSignatures(String outputPath, FileSystem fs, RetrievalEnvironment env,
                                     String spamScoresPath, int bitsPerElement, int nbHash)
    throws IOException {
    SpamPercentileScore spamScores = new SpamPercentileScore();
    spamScores.initialize(spamScoresPath, fs);
    int[] newDocids = DocumentUtility.spamSortDocids(spamScores);

    int collectionSize = env.readCollectionTermCount();
    Posting posting = new Posting();
    FSDataOutputStream out;

    BloomConfig bloomConfig = new BloomConfig((int) env.getDocumentCount(), collectionSize,
                                              nbHash, bitsPerElement);

    //Deletes the output path if it already exists.
    fs.delete(new Path(outputPath), true);

    //Serialize and write the configuration parameters.
    out = fs.create(new Path(outputPath + "/" + BloomConfig.CONFIG_FILE));
    bloomConfig.write(out);
    out.close();

    for(int i = 0; i <= collectionSize; i++) {
      //Start a new output block file every 100,000 terms.
      if(i % 100000 == 0) {
        if(i != 0) {
          out.close();
        }
        out = fs.create(new Path(outputPath + "/" + i));
      }

      try {
        PostingsList pl = env.getPostingsList(env.getTermFromId(i));
        PostingsReader reader = pl.getPostingsReader();
        Signature filter = null;

        //Decide which filter to use based on the configuration parameters
        int df = pl.getDf();
        if (df <= bloomConfig.getIdentityHashThreshold()) {
          filter = new BloomFilterHash(df * bloomConfig.getBitsPerElement(), bloomConfig.getHashCount());
        } else {
          filter = new BloomFilterIdentityHash(bloomConfig.getDocumentCount());
        }

        //Add the remapped (spam-sorted) docid of every posting to the filter.
        while (reader.nextPosting(posting)) {
          filter.add(newDocids[posting.getDocno()]);
        }

        out.writeInt(i);
        out.writeInt(df);
        filter.write(out);
      } catch(Exception e) {
        //Skip terms whose postings cannot be retrieved.
        continue;
      }
    }

    if(out != null) {
      out.close();
    }
  }
}
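
/**
 * A minimal usage sketch (not part of the original class): the index, output,
 * and spam-score paths, the bits-per-element and hash-count values, and the
 * RetrievalEnvironment construction are illustrative assumptions, not part of
 * the code above.
 */
class SignatureIOUsageSketch {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new org.apache.hadoop.conf.Configuration());
    RetrievalEnvironment env = new RetrievalEnvironment("/path/to/index", fs);
    env.initialize(false);

    //Build and persist the signatures (e.g., 8 bits per element, 2 hash functions).
    SignatureIO.writeSignatures("/path/to/signatures", fs, env, "/path/to/spam.scores", 8, 2);

    //Load them back; the array is indexed by term id.
    Signature[] signatures = new Signature[env.readCollectionTermCount() + 1];
    SignatureIO.loadSignatures("/path/to/signatures", fs, signatures);
  }
}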