/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.eval.tokens; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import org.apache.commons.lang3.mutable.MutableInt; import org.apache.tika.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class CommonTokenCountManager { private static final Logger LOG = LoggerFactory.getLogger(CommonTokenCountManager.class); private final static Charset COMMON_TOKENS_CHARSET = StandardCharsets.UTF_8; private final Path commonTokensDir; Map<String, Set<String>> commonTokenMap = new ConcurrentHashMap<>(); Set<String> alreadyTriedToLoad = new HashSet<>(); //if we have no model or if no langid is passed in //make this configurable private final String defaultLangCode; public CommonTokenCountManager(Path commonTokensDir, String defaultLangCode) throws IOException { this.defaultLangCode = defaultLangCode; this.commonTokensDir = commonTokensDir; tryToLoad(defaultLangCode); //if you couldn't load it, make sure to add an empty //set to prevent npes later Set<String> set = commonTokenMap.get(defaultLangCode); if (set == null) { LOG.warn("No common tokens for default language: '"+defaultLangCode+"'"); commonTokenMap.put(defaultLangCode, new HashSet<String>()); } } public CommonTokenResult countTokenOverlaps(String langCode, Map<String, MutableInt> tokens) throws IOException { String actualLangCode = getActualLangCode(langCode); int overlap = 0; int alphabeticTokens = 0; Set<String> commonTokens = commonTokenMap.get(actualLangCode); for (Map.Entry<String, MutableInt> e : tokens.entrySet()) { String token = e.getKey(); int count = e.getValue().intValue(); if (AlphaIdeographFilterFactory.isAlphabetic(token.toCharArray())) { alphabeticTokens += count; } if (commonTokens.contains(token)) { overlap += count; } } return new CommonTokenResult(actualLangCode, overlap, alphabeticTokens); } //return langcode for lang that you are actually using //lazily load the appropriate model private String getActualLangCode(String langCode) { if (langCode == null || "".equals(langCode)) { return defaultLangCode; } if (commonTokenMap.containsKey(langCode)) { return langCode; } tryToLoad(langCode); Set<String> set = commonTokenMap.get(langCode); if (set == null) { return defaultLangCode; } return langCode; } public void close() throws IOException { commonTokenMap.clear(); } private synchronized void tryToLoad(String langCode) { if (alreadyTriedToLoad.contains(langCode)) { return; } //check once more now that we're in a //synchronized block if (commonTokenMap.get(langCode) != null) { return; } InputStream is = null; Path p = null; if (commonTokensDir != null) { p = commonTokensDir.resolve(langCode); } try { if (p == null || !Files.isRegularFile(p)) { is = this.getClass().getResourceAsStream("/common_tokens/" + langCode); } else { is = Files.newInputStream(p); } if (is == null) { LOG.warn("Couldn't find common tokens file for: '" + langCode + "': " + p.toAbsolutePath()); alreadyTriedToLoad.add(langCode); return; } Set<String> set = commonTokenMap.get(langCode); if (set == null) { set = new HashSet<>(); commonTokenMap.put(langCode, set); } try (BufferedReader reader = new BufferedReader( new InputStreamReader(is, COMMON_TOKENS_CHARSET))) { alreadyTriedToLoad.add(langCode); String line = reader.readLine(); while (line != null) { line = line.trim(); if (line.startsWith("#")) { line = reader.readLine(); continue; } //allow language models with, e.g. tab-delimited counts after the term String[] cols = line.split("\t"); String t = cols[0].trim(); if (t.length() > 0) { set.add(t); } line = reader.readLine(); } } } catch (IOException e) { LOG.warn("IOException trying to read: '" + langCode + "'"); } finally { IOUtils.closeQuietly(is); } } }