package org.wikipedia.miner.extract.steps.labelOccurrences;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.FileReader;
import org.apache.avro.io.DatumReader;
import org.apache.avro.mapred.Pair;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.Logger;
import org.wikipedia.miner.extract.model.struct.LabelSenseList;
import org.wikipedia.miner.util.ProgressTracker;

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;

/**
 * An in-memory cache of all known labels, backed by a Bloom filter so that
 * membership queries are cheap and the whole label set fits in mapper memory.
 *
 * <p>This is a JVM-wide singleton so that multiple mappers running within the
 * same JVM can share one loaded cache instead of each re-reading the Avro
 * label files. Because several mapper threads may race to obtain it,
 * {@link #get()} is synchronized.
 *
 * <p>Note: {@link #mightContain(CharSequence)} inherits the Bloom filter's
 * semantics — false positives are possible (at most
 * {@code FALSE_POSITIVE_PROBABILITY}), false negatives are not.
 */
public class LabelCache {

    private static final Logger logger = Logger.getLogger(LabelCache.class);

    /** Acceptable false-positive rate for the Bloom filter (0.5%). */
    private static final double falsePositiveProbability = 0.005;

    /**
     * Fraction of the label population whose token length we consider
     * "sensible"; labels longer than this percentile are treated as outliers.
     */
    private static final double desiredLabelPopulation = 0.995;

    private static LabelCache labelCache;

    /**
     * Returns the shared cache instance, creating it on first use.
     * Synchronized so that concurrent mappers in the same JVM cannot
     * construct (and partially load) two separate instances.
     */
    public static synchronized LabelCache get() {
        if (labelCache == null)
            labelCache = new LabelCache();

        return labelCache;
    }

    // Enforce the singleton: instances are only obtainable via get().
    private LabelCache() {
    }

    private BloomFilter<CharSequence> labels;

    // Longest label seen, measured in tokens (not characters).
    private int maxLabelLength = 0;

    // Token-length cutoff covering desiredLabelPopulation of all labels.
    private int maxSensibleLabelLength = 0;

    // lengthHistogram.get(n) == number of labels that tokenize to n tokens.
    private List<Integer> lengthHistogram = new ArrayList<Integer>();

    private boolean isLoaded = false;

    /** True once {@link #load} has completed successfully. */
    public boolean isLoaded() {
        return isLoaded;
    }

    /** Longest label seen so far, in tokens. */
    public int getMaxLabelLength() {
        return maxLabelLength;
    }

    /**
     * Token-length cutoff below which {@code desiredLabelPopulation} of all
     * labels fall; longer candidates can be skipped by callers as outliers.
     */
    public int getMaxSensibleLabelLength() {
        return maxSensibleLabelLength;
    }

    /**
     * Bloom-filter membership test: may return a false positive (with
     * probability at most {@code falsePositiveProbability}) but never a
     * false negative for a label that was loaded.
     */
    public boolean mightContain(CharSequence label) {
        return labels.mightContain(label);
    }

    /**
     * Loads all labels from the given Avro files into the Bloom filter and
     * builds the token-length histogram. Idempotent: a second call after a
     * successful load is a no-op.
     *
     * @param path        an individual Avro file of (label, senses) pairs
     * @param totalLabels expected number of labels across all files; sizes
     *                    the Bloom filter and the progress tracker
     * @param reporter    Hadoop reporter, pinged per record to avoid task
     *                    timeouts during the long load
     * @throws IOException if any of the Avro files cannot be read
     */
    public void load(List<Path> paths, int totalLabels, Reporter reporter) throws IOException {

        if (isLoaded)
            return;

        logger.info("Caching " + totalLabels + " labels");

        Runtime r = Runtime.getRuntime();
        long memBefore = r.totalMemory();

        labels = BloomFilter.create(Funnels.unencodedCharsFunnel(), totalLabels, falsePositiveProbability);

        int labelsInserted = 0;

        // NOTE(review): tracker is sized by totalLabels but only updated once
        // per file — progress reporting looks coarse; confirm against
        // ProgressTracker's intended usage.
        ProgressTracker tracker = new ProgressTracker(totalLabels, "Loading labels", getClass());
        for (Path path : paths) {

            logger.info("Caching labels from " + path);
            tracker.update();

            File file = new File(path.toString());

            Schema schema = Pair.getPairSchema(Schema.create(Type.STRING), LabelSenseList.getClassSchema());

            DatumReader<Pair<CharSequence, LabelSenseList>> datumReader =
                    new SpecificDatumReader<Pair<CharSequence, LabelSenseList>>(schema);
            FileReader<Pair<CharSequence, LabelSenseList>> fileReader =
                    DataFileReader.openReader(file, datumReader);

            // Close the reader even if iteration throws, so a failure on one
            // file does not leak the underlying file handle.
            try {
                while (fileReader.hasNext()) {
                    Pair<CharSequence, LabelSenseList> pair = fileReader.next();

                    CharSequence label = pair.key();

                    labels.put(label);
                    labelsInserted++;

                    updateLengthHistogram(label);

                    // keep the Hadoop task alive during the long load
                    reporter.progress();
                }
            } finally {
                fileReader.close();
            }
        }

        long memAfter = r.totalMemory();
        logger.info("Memory Used: " + (memAfter - memBefore) / (1024 * 1024) + "Mb");

        isLoaded = true;
        calculateMaximumSensibleLength(totalLabels);

        logger.info("Longest label: " + maxLabelLength);
        logger.info("Longest sensible label: " + maxSensibleLabelLength);
        logger.info("labels expected: " + totalLabels + ", labels inserted: " + labelsInserted);

        printLengthHistogram();
    }

    /**
     * Records the token length of one label in the histogram and updates
     * {@code maxLabelLength}.
     */
    private void updateLengthHistogram(CharSequence label) {

        Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
        String[] tokens = tokenizer.tokenize(label.toString());

        if (maxLabelLength < tokens.length)
            maxLabelLength = tokens.length;

        // grow the histogram so index tokens.length exists
        while (lengthHistogram.size() <= tokens.length)
            lengthHistogram.add(0);

        lengthHistogram.set(tokens.length, lengthHistogram.get(tokens.length) + 1);

        // surface suspiciously long labels for manual inspection
        if (tokens.length > 20)
            logger.info(" - " + label);
    }

    /**
     * Determines the smallest token length that covers
     * {@code desiredLabelPopulation} of all labels. A few labels are
     * extremely long and would make the mapper very slow, so labels beyond
     * this length (the longest ~0.5% of the population) are treated as
     * outliers by callers.
     */
    private void calculateMaximumSensibleLength(int totalLabels) {

        int total = 0;
        int length;
        for (length = 0; length < lengthHistogram.size(); length++) {
            total = total + lengthHistogram.get(length);

            double popProportion = (double) total / totalLabels;
            if (popProportion > desiredLabelPopulation)
                break;
        }

        maxSensibleLabelLength = length;
    }

    /** Logs the full token-length histogram, one line per length. */
    private void printLengthHistogram() {
        for (int length = 0; length < lengthHistogram.size(); length++)
            logger.info(lengthHistogram.get(length) + " labels with " + length + " tokens");
    }
}