/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.analysis; import java.io.*; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Token; /** The tokenizer used for Nutch document text. Implemented in terms of our * JavaCC-generated lexical analyzer, {@link NutchAnalysisTokenManager}, shared * with the query parser. */ public final class NutchDocumentTokenizer extends Tokenizer implements NutchAnalysisConstants { private NutchAnalysisTokenManager tokenManager; /** Construct a tokenizer for the text in a Reader. */ public NutchDocumentTokenizer(Reader reader) { super(reader); tokenManager = new NutchAnalysisTokenManager(reader); } /** Returns the next token in the stream, or null at EOF. */ public final Token next() throws IOException { org.apache.nutch.analysis.Token t; try { loop: { while (true) { t = tokenManager.getNextToken(); switch (t.kind) { // skip query syntax tokens case EOF: case WORD: case ACRONYM: case SIGRAM: break loop; default: } } } } catch (TokenMgrError e) { // translate exceptions throw new IOException("Tokenizer error:" + e); } if (t.kind == EOF) // translate tokens return null; else { return new Token(t.image,t.beginColumn,t.endColumn,tokenImage[t.kind]); } } /** For debugging. */ public static void main(String[] args) throws Exception { BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); while (true) { System.out.print("Text: "); String line = in.readLine(); Tokenizer tokenizer = new NutchDocumentTokenizer(new StringReader(line)); Token token; System.out.print("Tokens: "); while ((token = tokenizer.next()) != null) { System.out.print(token.termText()); System.out.print(" "); } System.out.println(); } } }