package classifier;

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.classifier.ClassifierResult;
import org.apache.mahout.classifier.bayes.Algorithm;
import org.apache.mahout.classifier.bayes.BayesAlgorithm;
import org.apache.mahout.classifier.bayes.BayesParameters;
import org.apache.mahout.classifier.bayes.CBayesAlgorithm;
import org.apache.mahout.classifier.bayes.ClassifierContext;
import org.apache.mahout.classifier.bayes.Datastore;
import org.apache.mahout.classifier.bayes.InMemoryBayesDatastore;
import org.apache.mahout.classifier.bayes.InvalidDatastoreException;
import org.apache.mahout.common.nlp.NGrams;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import classifier.Counter;

/**
 * Mapper that classifies each input record with a Bayes or complementary
 * Bayes model and emits a count of one for every
 * {@code userID|label} pair it produces.
 */
public class ClassifierMapper extends Mapper<Text, Text, Text, IntWritable> {

  private static final Logger log = LoggerFactory.getLogger(ClassifierMapper.class);

  /** Constant count written for every classified record. */
  private static final IntWritable ONE = new IntWritable(1);

  /** Output key instance that is overwritten on every call to {@code map}. */
  private final Text outKey = new Text();

  /** N-gram size used to tokenize the input; replaced in {@code setup}. */
  private int gramSize = 1;

  /** Classifier built and initialized in {@code setup}. */
  private ClassifierContext classifier;

  /** Category handed to the classifier as the default; read from the "defaultCat" parameter. */
  private String defaultCategory;

  /**
   * Classifies one record and emits {@code userID|label} with a count of one.
   *
   * <p>If classification fails with an {@link InvalidDatastoreException}, the
   * error is logged, the {@code FAILDOCS} counter is incremented, and a record
   * with an empty label ({@code userID|}) is still written.
   * NOTE(review): emitting the empty-label record on failure looks deliberate
   * but should be confirmed against the downstream reducer.
   *
   * @param key
   *          the user ID; it becomes the first part of the output key
   * @param value
   *          the text to classify; it is split into n-grams of size
   *          {@code gramSize} before classification
   * @param context
   *          the task context used for counters and output
   * @throws IOException
   *           if writing the output record fails
   * @throws InterruptedException
   *           if writing the output record is interrupted
   */
  @Override
  public void map(Text key, Text value, Context context)
      throws IOException, InterruptedException {
    String userID = key.toString();
    List<String> ngrams = new NGrams(value.toString(), gramSize).generateNGramsWithoutLabel();
    String[] features = ngrams.toArray(new String[ngrams.size()]);

    // Stays empty when classification fails so the record is still emitted.
    String docLabel = "";
    try {
      ClassifierResult result = classifier.classifyDocument(features, defaultCategory);
      docLabel = result.getLabel();
    } catch (InvalidDatastoreException e) {
      log.error(e.toString(), e);
      context.getCounter(Counter.FAILDOCS).increment(1);
    }

    // key is userID and docLabel
    outKey.set(userID + "|" + docLabel);
    context.write(outKey, ONE);
  }

  /**
   * Reads the Bayes parameters from the job configuration, builds the
   * matching classifier and initializes it.
   *
   * <p>The parameters are taken from the {@code bayes.parameters}
   * configuration entry. The {@code classifierType} parameter selects the
   * algorithm ({@code bayes} or {@code cbayes}, case-insensitive).
   *
   * @param context
   *          the task context that supplies the job configuration
   * @throws IOException
   *           if the classifier cannot be initialized; the underlying
   *           {@link InvalidDatastoreException} is attached as the cause
   * @throws IllegalArgumentException
   *           if {@code classifierType} is neither {@code bayes} nor
   *           {@code cbayes}
   */
  @Override
  public void setup(Context context) throws IOException {
    // get bayes parameters
    Configuration conf = context.getConfiguration();
    BayesParameters params = new BayesParameters(conf.get("bayes.parameters", ""));
    log.info("Bayes Parameter {}", params.print());

    String classifierType = params.get("classifierType");
    Algorithm algorithm;
    if ("bayes".equalsIgnoreCase(classifierType)) {
      algorithm = new BayesAlgorithm();
    } else if ("cbayes".equalsIgnoreCase(classifierType)) {
      algorithm = new CBayesAlgorithm();
    } else {
      throw new IllegalArgumentException("Unrecognized classifier type: " + classifierType);
    }

    // Both supported algorithms use the same in-memory datastore.
    Datastore datastore = new InMemoryBayesDatastore(params);
    classifier = new ClassifierContext(algorithm, datastore);
    try {
      classifier.initialize();
    } catch (InvalidDatastoreException e) {
      // Fail the task instead of continuing with a classifier that did not
      // initialize; every later map() call would depend on it.
      log.error(e.toString(), e);
      throw new IOException("Failed to initialize classifier", e);
    }

    defaultCategory = params.get("defaultCat");
    gramSize = params.getGramSize();
  }
}