/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.benchmark.byTask.tasks; import org.apache.lucene.analysis.util.AbstractAnalysisFactory; import org.apache.lucene.analysis.util.CharFilterFactory; import org.apache.lucene.analysis.util.FilesystemResourceLoader; import org.apache.lucene.analysis.util.ResourceLoaderAware; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.analysis.util.TokenizerFactory; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory; import org.apache.lucene.util.Version; import java.io.StreamTokenizer; import java.io.StringReader; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; /** * Analyzer factory construction task. The name given to the constructed factory may * be given to NewAnalyzerTask, which will call AnalyzerFactory.create(). * * Params are in the form argname:argvalue or argname:"argvalue" or argname:'argvalue'; * use backslashes to escape '"' or "'" inside a quoted value when it's used as the enclosing * quotation mark, * * Specify params in a comma separated list of the following, in order: * <ol> * <li>Analyzer args: * <ul> * <li><b>Required</b>: <code>name:<i>analyzer-factory-name</i></code></li> * <li>Optional: <tt>positionIncrementGap:<i>int value</i></tt> (default: 0)</li> * <li>Optional: <tt>offsetGap:<i>int value</i></tt> (default: 1)</li> * </ul> * </li> * <li>zero or more CharFilterFactory's, followed by</li> * <li>exactly one TokenizerFactory, followed by</li> * <li>zero or more TokenFilterFactory's</li> * </ol> * * Each component analysis factory may specify <tt>luceneMatchVersion</tt> (defaults to * {@link Version#LATEST}) and any of the args understood by the specified * *Factory class, in the above-describe param format. * <p> * Example: * <pre> * -AnalyzerFactory(name:'strip html, fold to ascii, whitespace tokenize, max 10k tokens', * positionIncrementGap:100, * HTMLStripCharFilter, * MappingCharFilter(mapping:'mapping-FoldToASCII.txt'), * WhitespaceTokenizer(luceneMatchVersion:LUCENE_5_0_0), * TokenLimitFilter(maxTokenCount:10000, consumeAllTokens:false)) * [...] * -NewAnalyzer('strip html, fold to ascii, whitespace tokenize, max 10k tokens') * </pre> * <p> * AnalyzerFactory will direct analysis component factories to look for resources * under the directory specified in the "work.dir" property. */ public class AnalyzerFactoryTask extends PerfTask { private static final String LUCENE_ANALYSIS_PACKAGE_PREFIX = "org.apache.lucene.analysis."; private static final Pattern ANALYSIS_COMPONENT_SUFFIX_PATTERN = Pattern.compile("(?s:(?:(?:Token|Char)?Filter|Tokenizer)(?:Factory)?)$"); private static final Pattern TRAILING_DOT_ZERO_PATTERN = Pattern.compile("\\.0$"); private enum ArgType {ANALYZER_ARG, ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER, TOKENFILTER } String factoryName = null; Integer positionIncrementGap = null; Integer offsetGap = null; private List<CharFilterFactory> charFilterFactories = new ArrayList<>(); private TokenizerFactory tokenizerFactory = null; private List<TokenFilterFactory> tokenFilterFactories = new ArrayList<>(); public AnalyzerFactoryTask(PerfRunData runData) { super(runData); } @Override public int doLogic() { return 1; } /** * Sets the params. * Analysis component factory names may optionally include the "Factory" suffix. * * @param params analysis pipeline specification: name, (optional) positionIncrementGap, * (optional) offsetGap, 0+ CharFilterFactory's, 1 TokenizerFactory, * and 0+ TokenFilterFactory's */ @Override @SuppressWarnings("fallthrough") public void setParams(String params) { super.setParams(params); ArgType expectedArgType = ArgType.ANALYZER_ARG; final StreamTokenizer stok = new StreamTokenizer(new StringReader(params)); stok.commentChar('#'); stok.quoteChar('"'); stok.quoteChar('\''); stok.eolIsSignificant(false); stok.ordinaryChar('('); stok.ordinaryChar(')'); stok.ordinaryChar(':'); stok.ordinaryChar(','); try { while (stok.nextToken() != StreamTokenizer.TT_EOF) { switch (stok.ttype) { case ',': { // Do nothing break; } case StreamTokenizer.TT_WORD: { if (expectedArgType.equals(ArgType.ANALYZER_ARG)) { final String argName = stok.sval; if ( ! argName.equalsIgnoreCase("name") && ! argName.equalsIgnoreCase("positionIncrementGap") && ! argName.equalsIgnoreCase("offsetGap")) { throw new RuntimeException ("Line #" + lineno(stok) + ": Missing 'name' param to AnalyzerFactory: '" + params + "'"); } stok.nextToken(); if (stok.ttype != ':') { throw new RuntimeException ("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to AnalyzerFactory"); } stok.nextToken(); String argValue = stok.sval; switch (stok.ttype) { case StreamTokenizer.TT_NUMBER: { argValue = Double.toString(stok.nval); // Drop the ".0" from numbers, for integer arguments argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst(""); // Intentional fallthrough } case '"': case '\'': case StreamTokenizer.TT_WORD: { if (argName.equalsIgnoreCase("name")) { factoryName = argValue; expectedArgType = ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER; } else { int intArgValue = 0; try { intArgValue = Integer.parseInt(argValue); } catch (NumberFormatException e) { throw new RuntimeException ("Line #" + lineno(stok) + ": Exception parsing " + argName + " value '" + argValue + "'", e); } if (argName.equalsIgnoreCase("positionIncrementGap")) { positionIncrementGap = intArgValue; } else if (argName.equalsIgnoreCase("offsetGap")) { offsetGap = intArgValue; } } break; } case StreamTokenizer.TT_EOF: { throw new RuntimeException("Unexpected EOF: " + stok.toString()); } default: { throw new RuntimeException ("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString()); } } } else if (expectedArgType.equals(ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER)) { final String argName = stok.sval; if (argName.equalsIgnoreCase("positionIncrementGap") || argName.equalsIgnoreCase("offsetGap")) { stok.nextToken(); if (stok.ttype != ':') { throw new RuntimeException ("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to AnalyzerFactory"); } stok.nextToken(); int intArgValue = (int)stok.nval; switch (stok.ttype) { case '"': case '\'': case StreamTokenizer.TT_WORD: { intArgValue = 0; try { intArgValue = Integer.parseInt(stok.sval.trim()); } catch (NumberFormatException e) { throw new RuntimeException ("Line #" + lineno(stok) + ": Exception parsing " + argName + " value '" + stok.sval + "'", e); } // Intentional fall-through } case StreamTokenizer.TT_NUMBER: { if (argName.equalsIgnoreCase("positionIncrementGap")) { positionIncrementGap = intArgValue; } else if (argName.equalsIgnoreCase("offsetGap")) { offsetGap = intArgValue; } break; } case StreamTokenizer.TT_EOF: { throw new RuntimeException("Unexpected EOF: " + stok.toString()); } default: { throw new RuntimeException ("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString()); } } break; } try { final Class<? extends CharFilterFactory> clazz; clazz = lookupAnalysisClass(argName, CharFilterFactory.class); createAnalysisPipelineComponent(stok, clazz); } catch (IllegalArgumentException e) { try { final Class<? extends TokenizerFactory> clazz; clazz = lookupAnalysisClass(argName, TokenizerFactory.class); createAnalysisPipelineComponent(stok, clazz); expectedArgType = ArgType.TOKENFILTER; } catch (IllegalArgumentException e2) { throw new RuntimeException("Line #" + lineno(stok) + ": Can't find class '" + argName + "' as CharFilterFactory or TokenizerFactory"); } } } else { // expectedArgType = ArgType.TOKENFILTER final String className = stok.sval; final Class<? extends TokenFilterFactory> clazz; try { clazz = lookupAnalysisClass(className, TokenFilterFactory.class); } catch (IllegalArgumentException e) { throw new RuntimeException ("Line #" + lineno(stok) + ": Can't find class '" + className + "' as TokenFilterFactory"); } createAnalysisPipelineComponent(stok, clazz); } break; } default: { throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString()); } } } } catch (RuntimeException e) { if (e.getMessage().startsWith("Line #")) { throw e; } else { throw new RuntimeException("Line #" + lineno(stok) + ": ", e); } } catch (Throwable t) { throw new RuntimeException("Line #" + lineno(stok) + ": ", t); } final AnalyzerFactory analyzerFactory = new AnalyzerFactory (charFilterFactories, tokenizerFactory, tokenFilterFactories); analyzerFactory.setPositionIncrementGap(positionIncrementGap); analyzerFactory.setOffsetGap(offsetGap); getRunData().getAnalyzerFactories().put(factoryName, analyzerFactory); } /** * Instantiates the given analysis factory class after pulling params from * the given stream tokenizer, then stores the result in the appropriate * pipeline component list. * * @param stok stream tokenizer from which to draw analysis factory params * @param clazz analysis factory class to instantiate */ @SuppressWarnings("fallthrough") private void createAnalysisPipelineComponent (StreamTokenizer stok, Class<? extends AbstractAnalysisFactory> clazz) { Map<String,String> argMap = new HashMap<>(); boolean parenthetical = false; try { WHILE_LOOP: while (stok.nextToken() != StreamTokenizer.TT_EOF) { switch (stok.ttype) { case ',': { if (parenthetical) { // Do nothing break; } else { // Finished reading this analysis factory configuration break WHILE_LOOP; } } case '(': { if (parenthetical) { throw new RuntimeException ("Line #" + lineno(stok) + ": Unexpected opening parenthesis."); } parenthetical = true; break; } case ')': { if (parenthetical) { parenthetical = false; } else { throw new RuntimeException ("Line #" + lineno(stok) + ": Unexpected closing parenthesis."); } break; } case StreamTokenizer.TT_WORD: { if ( ! parenthetical) { throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token '" + stok.sval + "'"); } String argName = stok.sval; stok.nextToken(); if (stok.ttype != ':') { throw new RuntimeException ("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to " + clazz.getSimpleName()); } stok.nextToken(); String argValue = stok.sval; switch (stok.ttype) { case StreamTokenizer.TT_NUMBER: { argValue = Double.toString(stok.nval); // Drop the ".0" from numbers, for integer arguments argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst(""); // Intentional fall-through } case '"': case '\'': case StreamTokenizer.TT_WORD: { argMap.put(argName, argValue); break; } case StreamTokenizer.TT_EOF: { throw new RuntimeException("Unexpected EOF: " + stok.toString()); } default: { throw new RuntimeException ("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString()); } } } } } if (!argMap.containsKey("luceneMatchVersion")) { argMap.put("luceneMatchVersion", Version.LATEST.toString()); } final AbstractAnalysisFactory instance; try { instance = clazz.getConstructor(Map.class).newInstance(argMap); } catch (Exception e) { throw new RuntimeException("Line #" + lineno(stok) + ": ", e); } if (instance instanceof ResourceLoaderAware) { Path baseDir = Paths.get(getRunData().getConfig().get("work.dir", "work")); if (!Files.isDirectory(baseDir)) { baseDir = Paths.get("."); } ((ResourceLoaderAware)instance).inform(new FilesystemResourceLoader(baseDir)); } if (CharFilterFactory.class.isAssignableFrom(clazz)) { charFilterFactories.add((CharFilterFactory)instance); } else if (TokenizerFactory.class.isAssignableFrom(clazz)) { tokenizerFactory = (TokenizerFactory)instance; } else if (TokenFilterFactory.class.isAssignableFrom(clazz)) { tokenFilterFactories.add((TokenFilterFactory)instance); } } catch (RuntimeException e) { if (e.getMessage().startsWith("Line #")) { throw (e); } else { throw new RuntimeException("Line #" + lineno(stok) + ": ", e); } } catch (Throwable t) { throw new RuntimeException("Line #" + lineno(stok) + ": ", t); } } /** * This method looks up a class with its fully qualified name (FQN), or a short-name * class-simplename, or with a package suffix, assuming "org.apache.lucene.analysis." * as the package prefix (e.g. "standard.ClassicTokenizerFactory" -> * "org.apache.lucene.analysis.standard.ClassicTokenizerFactory"). * * If className contains a period, the class is first looked up as-is, assuming that it * is an FQN. If this fails, lookup is retried after prepending the Lucene analysis * package prefix to the class name. * * If className does not contain a period, the analysis SPI *Factory.lookupClass() * methods are used to find the class. * * @param className The name or the short name of the class. * @param expectedType The superclass className is expected to extend * @return the loaded class. * @throws ClassNotFoundException if lookup fails */ public <T> Class<? extends T> lookupAnalysisClass(String className, Class<T> expectedType) throws ClassNotFoundException { if (className.contains(".")) { try { // First, try className == FQN return Class.forName(className).asSubclass(expectedType); } catch (ClassNotFoundException e) { try { // Second, retry lookup after prepending the Lucene analysis package prefix return Class.forName(LUCENE_ANALYSIS_PACKAGE_PREFIX + className).asSubclass(expectedType); } catch (ClassNotFoundException e1) { throw new ClassNotFoundException("Can't find class '" + className + "' or '" + LUCENE_ANALYSIS_PACKAGE_PREFIX + className + "'"); } } } // No dot - use analysis SPI lookup final String analysisComponentName = ANALYSIS_COMPONENT_SUFFIX_PATTERN.matcher(className).replaceFirst(""); if (CharFilterFactory.class.isAssignableFrom(expectedType)) { return CharFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType); } else if (TokenizerFactory.class.isAssignableFrom(expectedType)) { return TokenizerFactory.lookupClass(analysisComponentName).asSubclass(expectedType); } else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) { return TokenFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType); } throw new ClassNotFoundException("Can't find class '" + className + "'"); } /* (non-Javadoc) * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams() */ @Override public boolean supportsParams() { return true; } /** Returns the current line in the algorithm file */ public int lineno(StreamTokenizer stok) { return getAlgLineNum() + stok.lineno(); } }