package lia.analysis.nutch;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryFilters;
import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import java.io.IOException;
import java.io.StringReader;

// From chapter 4

/**
 * Command-line example that runs a sample sentence through Nutch's
 * document analyzer, prints the resulting tokens, and then prints the
 * Lucene query produced from a Nutch phrase query.
 */
public class NutchExample {

  /**
   * Analyzes the text {@code "The quick brown fox..."} for the
   * {@code "content"} field and prints every token as
   * {@code [term:startOffset->endOffset:type]}. A new output line, prefixed
   * with the running position, is started whenever a token has a positive
   * position increment; tokens with a zero increment stay on the current line.
   * Afterwards the Nutch query {@code "the quick brown"} is parsed, passed
   * through {@link QueryFilters#filter}, and printed.
   *
   * @param args ignored
   * @throws IOException if reading or closing the token stream fails, or if
   *         query parsing/filtering reports an I/O error
   */
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    conf.addResource("nutch-default.xml");
    NutchDocumentAnalyzer analyzer = new NutchDocumentAnalyzer(conf);  //1

    TokenStream ts = analyzer.tokenStream("content",
        new StringReader("The quick brown fox..."));
    try {
      int position = 0;
      // next() returns null once the stream is exhausted.
      for (Token token = ts.next(); token != null; token = ts.next()) {  // 2
        int increment = token.getPositionIncrement();

        if (increment > 0) {
          // Token advances to a new position: start a fresh output line.
          position = position + increment;
          System.out.println();
          System.out.print(position + ": ");
        }

        System.out.print("[" +
                         token.termText() + ":" +
                         token.startOffset() + "->" +
                         token.endOffset() + ":" +
                         token.type() + "] ");
      }
      System.out.println();
    } finally {
      // Close the stream even if tokenization fails part-way through.
      ts.close();
    }

    Query nutchQuery = Query.parse("\"the quick brown\"", conf);  // 3
    org.apache.lucene.search.Query luceneQuery;
    luceneQuery = new QueryFilters(conf).filter(nutchQuery);  // A
    System.out.println("Translated: " + luceneQuery);
  }
}

/*
  #1 Custom analyzer
  #2 Display token details
  #3 Parse to Nutch's Query
  #A Create corresponding translated Lucene Query
*/