package lia.analysis;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import junit.framework.Assert;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.StringReader;

// From chapter 4

/**
 * Static helpers for inspecting and asserting on the tokens an
 * {@link Analyzer} produces.
 *
 * <p>Every method that creates a {@link TokenStream} from an analyzer also
 * closes it, including when iteration or an assertion fails part-way through.
 */
public class AnalyzerUtils {

  /**
   * Analyzes {@code text} as field {@code "contents"} and prints each token's
   * term text in square brackets to standard output.
   *
   * @param analyzer analyzer used to tokenize the text
   * @param text     text to analyze
   * @throws IOException if the token stream fails while being read or closed
   */
  public static void displayTokens(Analyzer analyzer, String text)
      throws IOException {
    TokenStream stream =
        analyzer.tokenStream("contents", new StringReader(text));  //A
    try {
      displayTokens(stream);
    } finally {
      // The stream was created here, so it is released here.
      stream.close();
    }
  }

  /**
   * Prints the term text of every remaining token in {@code stream}, each
   * surrounded by square brackets and followed by a space, on one line.
   *
   * <p>The stream is not closed: the caller supplied it and keeps ownership.
   *
   * @param stream token stream to consume
   * @throws IOException if advancing the stream fails
   */
  public static void displayTokens(TokenStream stream)
      throws IOException {
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
      System.out.print("[" + term.term() + "] ");    //B
    }
  }
  /*
    #A Invoke analysis process
    #B Print token text surrounded by brackets
  */

  /**
   * Returns the current position increment held by {@code source}.
   *
   * @param source attribute source to read from
   * @return value of its {@link PositionIncrementAttribute}
   */
  public static int getPositionIncrement(AttributeSource source) {
    PositionIncrementAttribute attr =
        source.addAttribute(PositionIncrementAttribute.class);
    return attr.getPositionIncrement();
  }

  /**
   * Returns the current term text held by {@code source}.
   *
   * @param source attribute source to read from
   * @return term text of its {@link TermAttribute}
   */
  public static String getTerm(AttributeSource source) {
    TermAttribute attr = source.addAttribute(TermAttribute.class);
    return attr.term();
  }

  /**
   * Returns the current token type held by {@code source}.
   *
   * @param source attribute source to read from
   * @return type string of its {@link TypeAttribute}
   */
  public static String getType(AttributeSource source) {
    TypeAttribute attr = source.addAttribute(TypeAttribute.class);
    return attr.type();
  }

  /**
   * Sets the position increment on {@code source}.
   *
   * @param source  attribute source to modify
   * @param posIncr new position increment
   */
  public static void setPositionIncrement(AttributeSource source, int posIncr) {
    PositionIncrementAttribute attr =
        source.addAttribute(PositionIncrementAttribute.class);
    attr.setPositionIncrement(posIncr);
  }

  /**
   * Sets the term text on {@code source}.
   *
   * @param source attribute source to modify
   * @param term   new term text
   */
  public static void setTerm(AttributeSource source, String term) {
    TermAttribute attr = source.addAttribute(TermAttribute.class);
    attr.setTermBuffer(term);
  }

  /**
   * Sets the token type on {@code source}.
   *
   * @param source attribute source to modify
   * @param type   new token type
   */
  public static void setType(AttributeSource source, String type) {
    TypeAttribute attr = source.addAttribute(TypeAttribute.class);
    attr.setType(type);
  }

  /**
   * Analyzes {@code text} as field {@code "contents"} and prints the tokens
   * grouped by position: a new output line, prefixed with the running
   * position, is started whenever a token has a position increment greater
   * than zero; tokens with a zero increment stay on the current line.
   *
   * @param analyzer analyzer used to tokenize the text
   * @param text     text to analyze
   * @throws IOException if the token stream fails while being read or closed
   */
  public static void displayTokensWithPositions(Analyzer analyzer, String text)
      throws IOException {
    TokenStream stream =
        analyzer.tokenStream("contents", new StringReader(text));
    try {
      TermAttribute term = stream.addAttribute(TermAttribute.class);
      PositionIncrementAttribute posIncr =
          stream.addAttribute(PositionIncrementAttribute.class);

      int position = 0;
      while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
          position = position + increment;
          System.out.println();
          System.out.print(position + ": ");
        }

        System.out.print("[" + term.term() + "] ");
      }
      System.out.println();
    } finally {
      stream.close();
    }
  }

  /**
   * Analyzes {@code text} as field {@code "contents"} and prints, grouped by
   * position, each token as {@code [term:startOffset->endOffset:type]}.
   *
   * @param analyzer analyzer used to tokenize the text
   * @param text     text to analyze
   * @throws IOException if the token stream fails while being read or closed
   */
  public static void displayTokensWithFullDetails(Analyzer analyzer,
                                                  String text)
      throws IOException {
    TokenStream stream = analyzer.tokenStream("contents",            // #A
                                              new StringReader(text));
    try {
      TermAttribute term = stream.addAttribute(TermAttribute.class); // #B
      PositionIncrementAttribute posIncr =                           // #B
          stream.addAttribute(PositionIncrementAttribute.class);     // #B
      OffsetAttribute offset =
          stream.addAttribute(OffsetAttribute.class);                // #B
      TypeAttribute type = stream.addAttribute(TypeAttribute.class); // #B

      int position = 0;
      while (stream.incrementToken()) {                              // #C

        int increment = posIncr.getPositionIncrement();              // #D
        if (increment > 0) {                                         // #D
          position = position + increment;                           // #D
          System.out.println();                                      // #D
          System.out.print(position + ": ");                         // #D
        }

        System.out.print("[" +                                       // #E
                         term.term() + ":" +                         // #E
                         offset.startOffset() + "->" +               // #E
                         offset.endOffset() + ":" +                  // #E
                         type.type() + "] ");                        // #E
      }
      System.out.println();
    } finally {
      stream.close();
    }
  }
  /*
    #A Perform analysis
    #B Obtain attributes of interest
    #C Iterate through all tokens
    #D Compute position and print
    #E Print all token details
  */

  /**
   * Asserts that analyzing {@code input} as field {@code "field"} yields
   * exactly the terms in {@code output}, in order, with no extra tokens.
   *
   * <p>The stream is closed even when an assertion fails, so a failing test
   * does not leak it.
   *
   * @param analyzer analyzer under test
   * @param input    text to analyze
   * @param output   expected term texts, in order
   * @throws Exception if the token stream fails while being read or closed
   */
  public static void assertAnalyzesTo(Analyzer analyzer, String input,
                                      String[] output) throws Exception {
    TokenStream stream =
        analyzer.tokenStream("field", new StringReader(input));
    try {
      TermAttribute termAttr = stream.addAttribute(TermAttribute.class);
      for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAttr.term());
      }
      // No tokens may remain once every expected term has been matched.
      Assert.assertFalse(stream.incrementToken());
    } finally {
      stream.close();
    }
  }

  /**
   * Analyzes {@code text} as field {@code "contents"} and prints one
   * {@code posIncr=N} line per token.
   *
   * @param analyzer analyzer used to tokenize the text
   * @param text     text to analyze
   * @throws IOException if the token stream fails while being read or closed
   */
  public static void displayPositionIncrements(Analyzer analyzer, String text)
      throws IOException {
    TokenStream stream =
        analyzer.tokenStream("contents", new StringReader(text));
    try {
      PositionIncrementAttribute posIncr =
          stream.addAttribute(PositionIncrementAttribute.class);
      while (stream.incrementToken()) {
        System.out.println("posIncr=" + posIncr.getPositionIncrement());
      }
    } finally {
      stream.close();
    }
  }

  /**
   * Prints full token details for one sample sentence run through
   * {@link SimpleAnalyzer} and another run through {@link StandardAnalyzer}.
   *
   * @param args ignored
   * @throws IOException if analysis fails
   */
  public static void main(String[] args) throws IOException {
    System.out.println("SimpleAnalyzer");
    displayTokensWithFullDetails(new SimpleAnalyzer(),
        "The quick brown fox....");

    System.out.println("\n----");
    System.out.println("StandardAnalyzer");
    displayTokensWithFullDetails(new StandardAnalyzer(Version.LUCENE_30),
        "I'll email you at xyz@example.com");
  }
}

/*
#1 Invoke analysis process
#2 Output token text surrounded by brackets
*/