CommonTokenCountManager.java example

Explorer
tika-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.eval.tokens;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.tika.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class CommonTokenCountManager {
    private static final Logger LOG = LoggerFactory.getLogger(CommonTokenCountManager.class);

    private final static Charset COMMON_TOKENS_CHARSET = StandardCharsets.UTF_8;

    private final Path commonTokensDir;

    Map<String, Set<String>> commonTokenMap = new ConcurrentHashMap<>();
    Set<String> alreadyTriedToLoad = new HashSet<>();

    //if we have no model or if no langid is passed in
    //make this configurable
    private final String defaultLangCode;

    public CommonTokenCountManager(Path commonTokensDir, String defaultLangCode) throws IOException {
        this.defaultLangCode = defaultLangCode;
        this.commonTokensDir = commonTokensDir;
        tryToLoad(defaultLangCode);
        //if you couldn't load it, make sure to add an empty
        //set to prevent npes later
        Set<String> set = commonTokenMap.get(defaultLangCode);
        if (set == null) {
            LOG.warn("No common tokens for default language: '"+defaultLangCode+"'");
            commonTokenMap.put(defaultLangCode, new HashSet<String>());
        }
    }

    public CommonTokenResult countTokenOverlaps(String langCode,
                                                Map<String, MutableInt> tokens) throws IOException {
        String actualLangCode = getActualLangCode(langCode);
        int overlap = 0;
        int alphabeticTokens = 0;
        Set<String> commonTokens = commonTokenMap.get(actualLangCode);
        for (Map.Entry<String, MutableInt> e : tokens.entrySet()) {
            String token = e.getKey();
            int count = e.getValue().intValue();
            if (AlphaIdeographFilterFactory.isAlphabetic(token.toCharArray())) {
                alphabeticTokens += count;
            }
            if (commonTokens.contains(token)) {
                overlap += count;
            }

        }
        return new CommonTokenResult(actualLangCode, overlap, alphabeticTokens);
    }


    //return langcode for lang that you are actually using
    //lazily load the appropriate model
    private String getActualLangCode(String langCode) {
        if (langCode == null || "".equals(langCode)) {
            return defaultLangCode;
        }
        if (commonTokenMap.containsKey(langCode)) {
            return langCode;
        }
        tryToLoad(langCode);
        Set<String> set = commonTokenMap.get(langCode);
        if (set == null) {
            return defaultLangCode;
        }
        return langCode;

    }

    public void close() throws IOException {
        commonTokenMap.clear();
    }

    private synchronized void tryToLoad(String langCode) {
        if (alreadyTriedToLoad.contains(langCode)) {
            return;
        }
        //check once more now that we're in a
        //synchronized block
        if (commonTokenMap.get(langCode) != null) {
            return;
        }
        InputStream is = null;
        Path p = null;
        if (commonTokensDir != null) {
            p = commonTokensDir.resolve(langCode);
        }

        try {
            if (p == null || !Files.isRegularFile(p)) {
                is = this.getClass().getResourceAsStream("/common_tokens/" + langCode);
            } else {
                is = Files.newInputStream(p);
            }


            if (is == null) {
                LOG.warn("Couldn't find common tokens file for: '" + langCode + "': " +
                        p.toAbsolutePath());
                alreadyTriedToLoad.add(langCode);
                return;
            }


            Set<String> set = commonTokenMap.get(langCode);
            if (set == null) {
                set = new HashSet<>();
                commonTokenMap.put(langCode, set);
            }
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(is, COMMON_TOKENS_CHARSET))) {
                alreadyTriedToLoad.add(langCode);
                String line = reader.readLine();
                while (line != null) {
                    line = line.trim();
                    if (line.startsWith("#")) {
                        line = reader.readLine();
                        continue;
                    }
                    //allow language models with, e.g. tab-delimited counts after the term
                    String[] cols = line.split("\t");
                    String t = cols[0].trim();
                    if (t.length() > 0) {
                        set.add(t);
                    }

                    line = reader.readLine();
                }
            }
        } catch (IOException e) {
            LOG.warn("IOException trying to read: '" + langCode + "'");
        } finally {
            IOUtils.closeQuietly(is);
        }
    }

}