/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.analysis;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import java.io.*;
import java.util.*;
// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.*;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.ObjectCache;
import org.apache.nutch.searcher.Query.*;
/** Construct n-grams for frequently occurring terms and phrases while indexing.
 * Optimize phrase queries to use the n-grams. Single terms are still indexed
 * too, with n-grams overlaid. This is achieved through the use of {@link
 * Token#setPositionIncrement(int)}.*/
public class CommonGrams {

  private static final Log LOG = LogFactory.getLog(CommonGrams.class);

  /** Character used to join the parts of a compound n-gram token. */
  private static final char SEPARATOR = '-';

  /** The key used to cache commonTerms in Configuration */
  private static final String KEY = CommonGrams.class.getName();

  /** Maps a field name to the set of common terms/grams for that field. */
  private HashMap<String, HashSet<String>> commonTerms =
    new HashMap<String, HashSet<String>>();

  /**
   * Constructs a CommonGrams helper, loading the per-field common-terms
   * table from the resource named by the {@code analysis.common.terms.file}
   * property (or reusing a table already cached in the Configuration's
   * ObjectCache).
   *
   * @param conf configuration supplying the common-terms resource
   */
  public CommonGrams(Configuration conf) {
    init(conf);
  }

  /** Token filter that overlays compound n-gram tokens for common terms. */
  private static class Filter extends TokenFilter {
    private HashSet<String> common;    // common terms for this field; may be null
    private Token previous;            // last token handed back to the caller
    private LinkedList<Token> gramQueue = new LinkedList<Token>();  // grams pending emission
    private LinkedList<Token> nextQueue = new LinkedList<Token>();  // lookahead buffer
    private StringBuffer buffer = new StringBuffer();               // scratch for gram text

    /** Construct an n-gram producing filter. */
    public Filter(TokenStream input, HashSet<String> common) {
      super(input);
      this.common = common;
    }

    /** Inserts n-grams into a token stream. */
    public Token next() throws IOException {
      if (gramQueue.size() != 0)                 // consume any queued tokens
        return gramQueue.removeFirst();

      final Token token = popNext();
      if (token == null)
        return null;

      if (!isCommon(token)) {                    // optimize simple case
        previous = token;
        return token;
      }

      gramQueue.add(token);                      // queue the token

      ListIterator<Token> i = nextQueue.listIterator();
      Token gram = token;
      while (isCommon(gram)) {
        if (previous != null && !isCommon(previous)) // queue prev gram first
          gramQueue.addFirst(gramToken(previous, gram));

        Token next = peekNext(i);
        if (next == null)
          break;

        gram = gramToken(gram, next);            // queue next gram last
        gramQueue.addLast(gram);
      }
      previous = token;
      return gramQueue.removeFirst();
    }

    /** True iff token is for a common term. */
    private boolean isCommon(Token token) {
      return common != null && common.contains(token.termText());
    }

    /** Pops nextQueue or, if empty, reads a new token from the input. */
    private Token popNext() throws IOException {
      if (nextQueue.size() > 0)
        return nextQueue.removeFirst();
      else
        return input.next();
    }

    /** Returns the next token in nextQueue, extending it when empty. */
    private Token peekNext(ListIterator<Token> i) throws IOException {
      if (!i.hasNext()) {
        Token next = input.next();
        if (next == null)
          return null;
        i.add(next);
        i.previous();  // step back so the next() below returns the token just added
      }
      return i.next();
    }

    /** Constructs a compound token spanning both inputs. The position
     * increment is zero so the gram overlays (rather than follows) the
     * first term in the stream. */
    private Token gramToken(Token first, Token second) {
      buffer.setLength(0);
      buffer.append(first.termText());
      buffer.append(SEPARATOR);
      buffer.append(second.termText());
      Token result = new Token(buffer.toString(),
                               first.startOffset(), second.endOffset(),
                               "gram");
      result.setPositionIncrement(0);
      return result;
    }
  }

  /** Loads the common-terms table from the configured resource, caching
   * the parsed table in the Configuration's ObjectCache for reuse by
   * later instances. */
  @SuppressWarnings("unchecked")
  private void init(Configuration conf) {
    ObjectCache objectCache = ObjectCache.get(conf);
    // First, try to retrieve some commonTerms cached in configuration.
    commonTerms = (HashMap<String, HashSet<String>>) objectCache.getObject(KEY);
    if (commonTerms != null) { return; }

    // Otherwise, read the terms.file
    BufferedReader in = null;
    try {
      commonTerms = new HashMap<String, HashSet<String>>();
      Reader reader = conf.getConfResourceAsReader
        (conf.get("analysis.common.terms.file"));
      in = new BufferedReader(reader);
      String line;
      while ((line = in.readLine()) != null) {
        line = line.trim();
        if (line.startsWith("#") || "".equals(line)) // skip comments
          continue;

        // Each line is "<field> <word...>"; tokenize it with the same
        // tokenizer used at index time so the resulting grams match.
        TokenStream ts = new NutchDocumentTokenizer(new StringReader(line));
        Token token = ts.next();
        if (token == null) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Line does not contain a field name: " + line);
          }
          continue;
        }
        String field = token.termText();

        token = ts.next();
        if (token == null) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Line contains only a field name, no word: " + line);
          }
          continue;
        }
        // Join any remaining tokens into one SEPARATOR-delimited gram.
        String gram = token.termText();
        while ((token = ts.next()) != null) {
          gram = gram + SEPARATOR + token.termText();
        }

        HashSet<String> table = commonTerms.get(field);
        if (table == null) {
          table = new HashSet<String>();
          commonTerms.put(field, table);
        }
        table.add(gram);
      }
      objectCache.setObject(KEY, commonTerms);
    } catch (IOException e) {
      // Fix: chain the cause instead of flattening it to a string, so the
      // original stack trace is preserved.
      throw new RuntimeException(e);
    } finally {
      // Fix: the terms-file reader was previously never closed (leak).
      if (in != null) {
        try {
          in.close();
        } catch (IOException e) {
          LOG.warn("Failed to close common terms file: " + e);
        }
      }
    }
  }

  /** Construct a token filter that inserts n-grams for common terms. For use
   * while indexing documents. */
  public TokenFilter getFilter(TokenStream ts, String field) {
    return new Filter(ts, commonTerms.get(field));
  }

  /** Utility to convert an array of Query.Terms into a token stream. */
  private static class ArrayTokens extends TokenStream {
    private Term[] terms;
    private int index;

    public ArrayTokens(Phrase phrase) {
      this.terms = phrase.getTerms();
    }

    /** Emits one token per term; offsets are the term's ordinal range. */
    public Token next() {
      if (index == terms.length)
        return null;
      // Clearer than the original "new Token(..., index, ++index)", which
      // depended on left-to-right argument evaluation; behavior identical.
      Token token = new Token(terms[index].toString(), index, index + 1);
      index++;
      return token;
    }
  }

  /** Optimizes phrase queries to use n-grams when possible.
   *
   * @param phrase the phrase query to optimize
   * @param field the field the phrase is searched against
   * @return the (possibly shorter) sequence of terms/grams to match
   */
  public String[] optimizePhrase(Phrase phrase, String field) {
    if (LOG.isTraceEnabled()) {
      LOG.trace("Optimizing " + phrase + " for " + field);
    }
    ArrayList<String> result = new ArrayList<String>();
    TokenStream ts = getFilter(new ArrayTokens(phrase), field);
    Token token, prev = null;
    int position = 0;
    try {
      while ((token = ts.next()) != null) {
        // A zero position increment means this gram overlays prev, which
        // it subsumes — only emit prev when the position really advances.
        if (token.getPositionIncrement() != 0 && prev != null)
          result.add(prev.termText());
        prev = token;
        position += token.getPositionIncrement();
        // Stop once the current token's span reaches the end of the phrase.
        if ((position + arity(token.termText())) == phrase.getTerms().length)
          break;
      }
    } catch (IOException e) {
      // Fix: chain the cause instead of flattening it to a string.
      throw new RuntimeException(e);
    }
    if (prev != null)
      result.add(prev.termText());
    return result.toArray(new String[result.size()]);
  }

  /** Returns the number of extra terms folded into a gram, i.e. the count
   * of SEPARATOR characters found from position 1 onward. (The first
   * character is skipped; assumes grams never begin with the separator —
   * true for grams built by this class.) */
  private int arity(String gram) {
    int index = 0;
    int arity = 0;
    while ((index = gram.indexOf(SEPARATOR, index + 1)) != -1) {
      arity++;
    }
    return arity;
  }

  /** For debugging: tokenizes the command-line arguments, prints the
   * filtered token stream for the "url" field, then prints the optimized
   * phrase. */
  public static void main(String[] args) throws Exception {
    StringBuffer text = new StringBuffer();
    for (int i = 0; i < args.length; i++) {
      text.append(args[i]);
      text.append(' ');
    }
    TokenStream ts =
      new NutchDocumentTokenizer(new StringReader(text.toString()));
    CommonGrams commonGrams = new CommonGrams(NutchConfiguration.create());
    ts = commonGrams.getFilter(ts, "url");
    Token token;
    while ((token = ts.next()) != null) {
      System.out.println("Token: " + token);
    }
    String[] optimized = commonGrams.optimizePhrase(new Phrase(args), "url");
    System.out.print("Optimized: ");
    for (int i = 0; i < optimized.length; i++) {
      System.out.print(optimized[i] + " ");
    }
    System.out.println();
  }
}