/** * Copyright (C) 2013 Isabel Drost-Fromm * * This program is free software; you can redistribute it and/or modify * it under the terms of the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.isabeldrostfromm.sof.naive; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.util.Version; import org.apache.mahout.math.SequentialAccessSparseVector; import org.apache.mahout.math.Vector; import org.apache.mahout.vectorizer.encoders.LuceneTextValueEncoder; import de.isabeldrostfromm.sof.util.Vectors; /** * Vectorisation based on LuceneTextValueEncoder for body, title and tags. * */ public class Vectoriser { /** Cardinality of the vector portion to use for encoding posting bodies. */ public static final int bodyCard = 1000000; /** Cardinality of the vector portion to use for encoding posting titles. */ public static final int titleCard = 1000000; /** Cardinality of the vector portion to use for encoding posting tags. */ private static final int tagCard = 0; /** Number of single double values to encode */ private static final int doubles = 0; public static int getCardinality() { return bodyCard + titleCard + tagCard + doubles; } /** * Turn a document bean into a vector. * @param document the document to turn in a vector. * @return the resulting vector. * */ public Vector vectorise(Document document) { Vector body = luceneEncode(bodyCard, document.getBody()); Vector title = luceneEncode(titleCard, document.getTitle()); //Vector tags = luceneEncode(tagCard, Strings.collectionToCommaDelimitedString(document.getTags())); //Vector reputation = Vectors.newSequentialAccessSparseVector(document.getReputation()); return Vectors.append(body, title);//, tags);//, reputation); } /** * @return Returns a vector generated for the given text based on encoding with LuceneTextValueEncoder * */ private static Vector luceneEncode(int probes, String text) { LuceneTextValueEncoder encoder = new LuceneTextValueEncoder("sof"); encoder.setAnalyzer(new StandardAnalyzer(Version.LUCENE_36)); encoder.setProbes(probes); encoder.addText(text); Vector vector = new SequentialAccessSparseVector(probes); encoder.flush(1, vector); return vector; } }