package de.danielbasedow.prospecter.core.analysis;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.JsonNodeType;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.inject.Inject;
import com.skjegstad.utils.BloomFilter;
import com.skjegstad.utils.BloomFilterImpl;
import com.skjegstad.utils.FakeBloomFilter;
import de.danielbasedow.prospecter.core.Token;
import de.danielbasedow.prospecter.core.TokenMapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

/**
 * Thin wrapper around Lucene's {@link org.apache.lucene.analysis.Analyzer}.
 * Concrete subclasses assign {@link #luceneAnalyzer}, typically in their constructor.
 */
public abstract class LuceneAnalyzer implements Analyzer {
    private static final Logger LOGGER = LoggerFactory.getLogger(LuceneAnalyzer.class);

    protected final TokenMapper tokenMapper;
    protected org.apache.lucene.analysis.Analyzer luceneAnalyzer;

    @Inject
    public LuceneAnalyzer(TokenMapper mapper) {
        tokenMapper = mapper;
    }

    @Override
    public List<Token> tokenize(String input) throws TokenizerException {
        return tokenize(input, false);
    }

    @Override
    public List<Token> tokenize(String input, boolean dontGenerateNewIds) throws TokenizerException {
        List<Token> tokens = new ArrayList<Token>();
        try {
            TokenStream ts = luceneAnalyzer.tokenStream("_", new StringReader(input));
            CharTermAttribute cta = ts.addAttribute(CharTermAttribute.class);
            try {
                ts.reset();
                int termId;
                while (ts.incrementToken()) {
                    // Map the term to its integer id; 0 means "unknown term" when id generation is disabled.
                    termId = tokenMapper.getTermId(cta.toString(), dontGenerateNewIds);
                    if (termId != 0) {
                        tokens.add(new Token<Integer>(termId));
                    }
                }
                ts.end();
            } finally {
                ts.close();
            }
        } catch (IOException e) {
            LOGGER.error("tokenization failed", e);
            throw new TokenizerException();
        }
        return tokens;
    }

    /**
     * Factory hook. The abstract base class cannot be built from options; concrete analyzers
     * shadow this method with their own {@code make(JsonNode)}.
     */
    public static Analyzer make(JsonNode options) {
        throw new UnsupportedOperationException();
    }

    /**
     * Builds a stop word set from the given configuration node: either an explicit JSON array
     * of strings, or the string "predefined", which selects {@code defaultStopWords}.
     */
    protected static CharArraySet getStopWords(JsonNode stopWords, CharArraySet defaultStopWords) {
        CharArraySet stopWordSet = new CharArraySet(Version.LUCENE_4_9, 5, true);
        if (stopWords != null) {
            if (stopWords.getNodeType() == JsonNodeType.ARRAY) {
                for (JsonNode node : stopWords) {
                    if (node != null && node.getNodeType() == JsonNodeType.STRING) {
                        stopWordSet.add(node.asText());
                    }
                }
            } else if (stopWords.getNodeType() == JsonNodeType.STRING && "predefined".equals(stopWords.asText())) {
                stopWordSet = defaultStopWords;
            }
        }
        return stopWordSet;
    }

    /**
     * Returns the bloom filter configured under "bloomfilter", or a no-op {@link FakeBloomFilter}
     * if the option is absent.
     */
    protected static BloomFilter<String> getBloomFilter(JsonNode options) {
        if (options.get("bloomfilter") == null) {
            LOGGER.info("Using FakeBloomFilter");
            return new FakeBloomFilter<String>();
        }
        ObjectNode bfOptions = (ObjectNode) options.get("bloomfilter");
        // path() instead of get() so missing settings fall back to the defaults instead of throwing a NullPointerException
        double falsePositiveProbability = bfOptions.path("falsePositiveProbability").asDouble(0.01);
        int expectedNumberOfElements = bfOptions.path("expectedNumberOfElements").asInt(1000);
        LOGGER.info("Using BloomFilterImpl with {}% false positive probability and {} expected elements",
                falsePositiveProbability * 100, expectedNumberOfElements);
        return new BloomFilterImpl<String>(falsePositiveProbability, expectedNumberOfElements);
    }
}
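
/*
 * Hypothetical usage sketch, not part of the original source: one way a concrete subclass
 * could wire up the protected luceneAnalyzer field, assuming Lucene 4.9's StandardAnalyzer
 * and its STOP_WORDS_SET constant are on the classpath. The class name
 * ExampleStandardLuceneAnalyzer is an illustrative assumption, not a name from the project.
 * A caller obtained via the injector would then tokenize with e.g.
 *   List<Token> tokens = analyzer.tokenize("some document text");
 */
class ExampleStandardLuceneAnalyzer extends LuceneAnalyzer {
    @Inject
    public ExampleStandardLuceneAnalyzer(TokenMapper mapper) {
        super(mapper);
        // StandardAnalyzer with Lucene's built-in English stop word set. A configurable subclass
        // would more likely build the set from its options via getStopWords(...).
        luceneAnalyzer = new org.apache.lucene.analysis.standard.StandardAnalyzer(
                Version.LUCENE_4_9,
                org.apache.lucene.analysis.standard.StandardAnalyzer.STOP_WORDS_SET);
    }
}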