package de.isabeldrostfromm.sof.termvector; import org.apache.mahout.math.SequentialAccessSparseVector; import org.apache.mahout.math.Vector; import org.apache.mahout.math.map.OpenObjectDoubleHashMap; import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder; import de.isabeldrostfromm.sof.util.Vectors; public class Vectoriser { public Vector vectorise(ParsedDocument doc) { Vector body = encode(doc.getBody(), de.isabeldrostfromm.sof.naive.Vectoriser.bodyCard); Vector title = encode(doc.getTitle(), de.isabeldrostfromm.sof.naive.Vectoriser.titleCard); return Vectors.append(body, title); } public Vector encode(OpenObjectDoubleHashMap<String> termVector, int card) { Vector vector = new SequentialAccessSparseVector(card); StaticWordValueEncoder encoder = new StaticWordValueEncoder("name"); // TODO why on earth is the typical keySet missing??? for (String term : termVector.keys()) { for (int i = 0; i < termVector.get(term); i++) encoder.addToVector(term, vector); } return vector; } }