/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.tokens;

import java.io.IOException;
import java.io.Reader;
import java.lang.reflect.Type;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import com.google.gson.JsonArray;
import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParseException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;

/**
 * Gson deserializer that builds a map of analyzer name -&gt; Lucene
 * {@link Analyzer} from a JSON configuration of the form:
 * <pre>
 * { "analyzers" : {
 *     "myAnalyzer" : {
 *       "charfilters"  : [ { "factory" : "...", "params" : {...} }, ... ],
 *       "tokenizer"    :   { "factory" : "...", "params" : {...} },
 *       "tokenfilters" : [ { "factory" : "...", "params" : {...} }, ... ]
 *     }, ...
 * } }
 * </pre>
 * Factory class names may use the abbreviation prefix {@code oala.} for
 * {@code org.apache.lucene.analysis.}.  Factories are resolved through
 * Lucene's SPI registries and, when {@link ResourceLoaderAware}, are informed
 * with a {@link ClasspathResourceLoader}.
 */
class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {

    private static final String ANALYZERS = "analyzers";
    private static final String CHAR_FILTERS = "charfilters";
    private static final String TOKEN_FILTERS = "tokenfilters";
    private static final String TOKENIZER = "tokenizer";
    private static final String FACTORY = "factory";
    private static final String PARAMS = "params";
    private static final String COMMENT = "_comment";

    //abbreviation allowed at the start of factory class names
    private static final String OALA_PREFIX = "oala.";
    private static final String LUCENE_ANALYSIS_PACKAGE = "org.apache.lucene.analysis.";

    //maximum number of tokens an analyzer may emit; -1 means no limit
    private final int maxTokens;

    AnalyzerDeserializer(int maxTokens) {
        this.maxTokens = maxTokens;
    }

    /**
     * Deserializes the top-level {@code "analyzers"} object.
     *
     * @throws IllegalArgumentException if the top-level structure is not
     *                                  {@code analyzers:{}}
     * @throws JsonParseException       if building an analyzer fails with an
     *                                  {@link IOException}
     */
    @Override
    public Map<String, Analyzer> deserialize(JsonElement element, Type type,
                                             JsonDeserializationContext jsonDeserializationContext)
            throws JsonParseException {
        if (!element.isJsonObject()) {
            throw new IllegalArgumentException("Expecting top level 'analyzers:{}'");
        }
        JsonElement root = element.getAsJsonObject().get(ANALYZERS);
        if (root == null) {
            throw new IllegalArgumentException("Expecting top level 'analyzers:{}'");
        }
        try {
            return buildAnalyzers(root, maxTokens);
        } catch (IOException e) {
            //JsonParseException is the exception type this contract declares;
            //it is a RuntimeException, so existing callers are unaffected
            throw new JsonParseException(e);
        }
    }

    /**
     * Builds all analyzers from a map of analyzer name -&gt; analyzer definition.
     *
     * @param value     json object whose keys are analyzer names
     * @param maxTokens max tokens per analyzer; -1 for no limit
     */
    public static Map<String, Analyzer> buildAnalyzers(JsonElement value, int maxTokens) throws IOException {
        if (!value.isJsonObject()) {
            throw new IllegalArgumentException("Expecting map with analyzer names/analyzer definitions");
        }
        Map<String, Analyzer> analyzers = new HashMap<>();
        JsonObject root = (JsonObject) value;
        for (Map.Entry<String, JsonElement> e : root.entrySet()) {
            String analyzerName = e.getKey();
            analyzers.put(analyzerName, buildAnalyzer(analyzerName, e.getValue(), maxTokens));
        }
        return analyzers;
    }

    /**
     * Builds a single analyzer from a map of charfilters/tokenizer/tokenfilters.
     * A {@code "tokenizer"} entry is required; the filter entries are optional.
     * Keys named {@code "_comment"} are ignored.
     *
     * @throws IllegalArgumentException on an unrecognized key or a missing tokenizer
     */
    public static Analyzer buildAnalyzer(String analyzerName, JsonElement value, int maxTokens) throws IOException {
        if (!value.isJsonObject()) {
            throw new IllegalArgumentException("Expecting map of charfilter, tokenizer, tokenfilters");
        }
        JsonObject aRoot = (JsonObject) value;
        CharFilterFactory[] charFilters = new CharFilterFactory[0];
        TokenizerFactory tokenizerFactory = null;
        TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
        for (Map.Entry<String, JsonElement> e : aRoot.entrySet()) {
            String k = e.getKey();
            if (k.equals(CHAR_FILTERS)) {
                charFilters = buildCharFilters(e.getValue(), analyzerName);
            } else if (k.equals(TOKEN_FILTERS)) {
                tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName, maxTokens);
            } else if (k.equals(TOKENIZER)) {
                tokenizerFactory = buildTokenizerFactory(e.getValue(), analyzerName);
            } else if (!k.equals(COMMENT)) {
                throw new IllegalArgumentException(
                        "Should have one of three values here:" + CHAR_FILTERS + ", " + TOKENIZER + ", "
                                + TOKEN_FILTERS + ". I don't recognize: " + k);
            }
        }
        if (tokenizerFactory == null) {
            throw new IllegalArgumentException("Must specify at least a tokenizer factory for an analyzer!");
        }
        return new MyTokenizerChain(charFilters, tokenizerFactory, tokenFilterFactories);
    }

    /**
     * Builds a {@link TokenizerFactory} from
     * {@code { "factory" : className, "params" : {...} }}.
     */
    private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException {
        if (!(map instanceof JsonObject)) {
            throw new IllegalArgumentException("Expecting a map with \"factory\" string and "
                    + "\"params\" map in tokenizer factory;"
                    + " not: " + map.toString() + " in " + analyzerName);
        }
        JsonElement factoryEl = ((JsonObject) map).get(FACTORY);
        if (factoryEl == null || !factoryEl.isJsonPrimitive()) {
            throw new IllegalArgumentException(
                    "Expecting value for factory in tokenizer factory builder in:" + analyzerName);
        }
        String factoryName = expandFactoryName(factoryEl.getAsString());
        JsonElement paramsEl = ((JsonObject) map).get(PARAMS);
        Map<String, String> params = mapify(paramsEl);
        //Lucene's SPI registry is keyed by short name; map the class name back
        //to its SPI name before calling forName()
        String spiName = "";
        for (String s : TokenizerFactory.availableTokenizers()) {
            Class<?> clazz = TokenizerFactory.lookupClass(s);
            if (clazz.getName().equals(factoryName)) {
                spiName = s;
                break;
            }
        }
        if (spiName.isEmpty()) {
            throw new IllegalArgumentException(
                    "A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name"
                            + "'" + factoryName + "' does not exist.");
        }
        try {
            TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params);
            if (tokenizerFactory instanceof ResourceLoaderAware) {
                ((ResourceLoaderAware) tokenizerFactory)
                        .inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
            }
            return tokenizerFactory;
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException("While working on " + analyzerName, e);
        }
    }

    /**
     * Builds the char filter chain from a json array of factory definitions.
     * Returns an empty array (never null) when the element is absent/null.
     */
    private static CharFilterFactory[] buildCharFilters(JsonElement el, String analyzerName) throws IOException {
        if (el == null || el.isJsonNull()) {
            return new CharFilterFactory[0];
        }
        if (!el.isJsonArray()) {
            throw new IllegalArgumentException(
                    "Expecting array for charfilters, but got:" + el.toString() + " for " + analyzerName);
        }
        JsonArray jsonArray = (JsonArray) el;
        List<CharFilterFactory> ret = new LinkedList<CharFilterFactory>();
        for (JsonElement filterMap : jsonArray) {
            if (!(filterMap instanceof JsonObject)) {
                throw new IllegalArgumentException(
                        "Expecting a map with \"factory\" string and \"params\" map in char filter factory;"
                                + " not: " + filterMap.toString() + " in " + analyzerName);
            }
            JsonElement factoryEl = ((JsonObject) filterMap).get(FACTORY);
            if (factoryEl == null || !factoryEl.isJsonPrimitive()) {
                throw new IllegalArgumentException(
                        "Expecting value for factory in char filter factory builder in:" + analyzerName);
            }
            //NOTE: previously used replaceAll("oala.", ...) which treated the
            //dot as a regex wildcard and replaced every occurrence; the prefix
            //expansion is now literal and consistent with the other builders
            String factoryName = expandFactoryName(factoryEl.getAsString());
            JsonElement paramsEl = ((JsonObject) filterMap).get(PARAMS);
            Map<String, String> params = mapify(paramsEl);
            String spiName = "";
            for (String s : CharFilterFactory.availableCharFilters()) {
                Class<?> clazz = CharFilterFactory.lookupClass(s);
                if (clazz.getName().equals(factoryName)) {
                    spiName = s;
                    break;
                }
            }
            if (spiName.isEmpty()) {
                throw new IllegalArgumentException(
                        "A SPI class of type org.apache.lucene.analysis.util.CharFilterFactory with name"
                                + "'" + factoryName + "' does not exist.");
            }
            try {
                CharFilterFactory charFilterFactory = CharFilterFactory.forName(spiName, params);
                if (charFilterFactory instanceof ResourceLoaderAware) {
                    ((ResourceLoaderAware) charFilterFactory)
                            .inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
                }
                ret.add(charFilterFactory);
            } catch (IllegalArgumentException e) {
                throw new IllegalArgumentException(
                        "While trying to load " + analyzerName + ": " + e.getMessage(), e);
            }
        }
        return ret.toArray(new CharFilterFactory[ret.size()]);
    }

    /**
     * Builds the token filter chain from a json array of factory definitions,
     * appending a {@link LimitTokenCountFilterFactory} when {@code maxTokens > -1}.
     * Returns an empty array (never null) when the element is absent/null —
     * a null here would NPE in {@link MyTokenizerChain#createComponents(String)}.
     */
    private static TokenFilterFactory[] buildTokenFilterFactories(JsonElement el, String analyzerName,
                                                                  int maxTokens) throws IOException {
        if (el == null || el.isJsonNull()) {
            return new TokenFilterFactory[0];
        }
        if (!el.isJsonArray()) {
            throw new IllegalArgumentException(
                    "Expecting array for tokenfilters, but got:" + el.toString() + " in " + analyzerName);
        }
        JsonArray jsonArray = (JsonArray) el;
        List<TokenFilterFactory> ret = new LinkedList<>();
        for (JsonElement filterMap : jsonArray) {
            if (!(filterMap instanceof JsonObject)) {
                throw new IllegalArgumentException(
                        "Expecting a map with \"factory\" string and \"params\" map in token filter factory;"
                                + " not: " + filterMap.toString() + " in " + analyzerName);
            }
            JsonElement factoryEl = ((JsonObject) filterMap).get(FACTORY);
            if (factoryEl == null || !factoryEl.isJsonPrimitive()) {
                throw new IllegalArgumentException(
                        "Expecting value for factory in token filter factory builder in " + analyzerName);
            }
            String factoryName = expandFactoryName(factoryEl.getAsString());
            JsonElement paramsEl = ((JsonObject) filterMap).get(PARAMS);
            Map<String, String> params = mapify(paramsEl);
            String spiName = "";
            for (String s : TokenFilterFactory.availableTokenFilters()) {
                Class<?> clazz = TokenFilterFactory.lookupClass(s);
                if (clazz.getName().equals(factoryName)) {
                    spiName = s;
                    break;
                }
            }
            if (spiName.isEmpty()) {
                throw new IllegalArgumentException(
                        "A SPI class of type org.apache.lucene.analysis.util.TokenFilterFactory with name"
                                + "'" + factoryName + "' does not exist.");
            }
            try {
                TokenFilterFactory tokenFilterFactory = TokenFilterFactory.forName(spiName, params);
                if (tokenFilterFactory instanceof ResourceLoaderAware) {
                    ((ResourceLoaderAware) tokenFilterFactory)
                            .inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
                }
                ret.add(tokenFilterFactory);
            } catch (IllegalArgumentException e) {
                throw new IllegalArgumentException("While loading " + analyzerName, e);
            }
        }
        if (maxTokens > -1) {
            Map<String, String> m = new HashMap<>();
            m.put("maxTokenCount", Integer.toString(maxTokens));
            ret.add(new LimitTokenCountFilterFactory(m));
        }
        return ret.toArray(new TokenFilterFactory[ret.size()]);
    }

    /**
     * Expands a leading {@code oala.} to {@code org.apache.lucene.analysis.};
     * returns other names unchanged.  Literal prefix substitution only —
     * no regex, no mid-string replacement.
     */
    private static String expandFactoryName(String factoryName) {
        if (factoryName.startsWith(OALA_PREFIX)) {
            return LUCENE_ANALYSIS_PACKAGE + factoryName.substring(OALA_PREFIX.length());
        }
        return factoryName;
    }

    /**
     * Converts an optional {@code "params"} json object into a String-&gt;String
     * map for the Lucene factory constructors.
     *
     * @throws IllegalArgumentException if the element is not a map or any
     *                                  value is not a json primitive
     */
    private static Map<String, String> mapify(JsonElement paramsEl) {
        if (paramsEl == null || paramsEl.isJsonNull()) {
            return Collections.emptyMap();
        }
        if (!paramsEl.isJsonObject()) {
            throw new IllegalArgumentException("Expecting map, not: " + paramsEl.toString());
        }
        Map<String, String> params = new HashMap<>();
        for (Map.Entry<String, JsonElement> e : ((JsonObject) paramsEl).entrySet()) {
            JsonElement value = e.getValue();
            if (!value.isJsonPrimitive()) {
                throw new IllegalArgumentException(
                        "Expecting parameter to have primitive value: " + value.toString());
            }
            params.put(e.getKey(), value.getAsString());
        }
        return params;
    }

    /**
     * Analyzer assembled from char filter / tokenizer / token filter factories.
     * Plagiarized verbatim from Solr!
     */
    private static class MyTokenizerChain extends Analyzer {

        private final CharFilterFactory[] charFilters;
        private final TokenizerFactory tokenizer;
        private final TokenFilterFactory[] filters;

        public MyTokenizerChain(TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
            this(null, tokenizer, filters);
        }

        public MyTokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer,
                                TokenFilterFactory[] filters) {
            this.charFilters = charFilters;
            this.tokenizer = tokenizer;
            this.filters = filters;
        }

        public CharFilterFactory[] getCharFilterFactories() {
            return charFilters;
        }

        public TokenizerFactory getTokenizerFactory() {
            return tokenizer;
        }

        public TokenFilterFactory[] getTokenFilterFactories() {
            return filters;
        }

        @Override
        public Reader initReader(String fieldName, Reader reader) {
            //wrap the raw reader in each char filter, in declaration order
            if (charFilters != null && charFilters.length > 0) {
                Reader cs = reader;
                for (CharFilterFactory charFilter : charFilters) {
                    cs = charFilter.create(cs);
                }
                reader = cs;
            }
            return reader;
        }

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tk = tokenizer.create();
            TokenStream ts = tk;
            for (TokenFilterFactory filter : filters) {
                ts = filter.create(ts);
            }
            return new TokenStreamComponents(tk, ts);
        }
    }
}