/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.tokens;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.commons.math3.util.FastMath;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BytesRef;

/**
 * Experimental class that uses Lucene's {@link MemoryIndex} to build
 * per-field token statistics: total and unique token counts, the top-N
 * most frequent tokens, entropy, and summary statistics of token lengths.
 */
public class LuceneTokenCounter {

    private static final String ALPHA_IDEOGRAPH_SUFFIX = "_a";

    private final LeafReader leafReader;
    private final MemoryIndex memoryIndex;
    private final Analyzer generalAnalyzer;

    private int topN = 10;

    Map<String, TokenStatistics> fieldStats = new HashMap<>();

    public LuceneTokenCounter(Analyzer generalAnalyzer) throws IOException {
        memoryIndex = new MemoryIndex();
        IndexSearcher searcher = memoryIndex.createSearcher();
        leafReader = (LeafReader) searcher.getIndexReader();
        this.generalAnalyzer = generalAnalyzer;
    }

    public void add(String field, String content) throws IOException {
        memoryIndex.addField(field, content, generalAnalyzer);
        //memoryIndex.addField(field + ALPHA_IDEOGRAPH_SUFFIX,
        //        content, alphaIdeographAnalyzer);
        count(field);
        //count(field + ALPHA_IDEOGRAPH_SUFFIX);
    }
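    /*
     * A minimal usage sketch (not part of the original class). The choice of
     * analyzer is an assumption; any Lucene Analyzer works here:
     *
     *   LuceneTokenCounter counter = new LuceneTokenCounter(new StandardAnalyzer());
     *   counter.add("content", "the quick brown fox jumps over the lazy dog");
     *   TokenStatistics stats = counter.getTokenStatistics("content");
     *   counter.clear(); //reset the MemoryIndex before the next document
     */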
    /**
     * Counts tokens for the given field and caches a {@link TokenStatistics}
     * in {@link #fieldStats}.
     */
    void count(String field) throws IOException {
        long tokenCount = leafReader.getSumTotalTermFreq(field);
        if (tokenCount > Integer.MAX_VALUE) {
            throw new IllegalArgumentException(
                    "total token count exceeds Integer.MAX_VALUE: " + tokenCount);
        }
        int tokenCountInt = (int) tokenCount;
        int uniqueTokenCount = 0;
        SummaryStatistics summStats = new SummaryStatistics();
        double ent = 0.0d;
        double p = 0.0d;
        double base = 2.0;
        Terms terms = leafReader.terms(field);
        if (terms == null) {
            //if there were no terms
            fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt,
                    new TokenIntPair[0], ent, summStats));
            return;
        }
        TermsEnum termsEnum = terms.iterator();
        BytesRef bytesRef = termsEnum.next();
        TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);

        while (bytesRef != null) {
            long termFreq = termsEnum.totalTermFreq();
            if (termFreq > Integer.MAX_VALUE) {
                throw new IllegalArgumentException(
                        "term frequency exceeds Integer.MAX_VALUE: " + termFreq);
            }
            int tf = (int) termFreq;
            //TODO: figure out how to avoid Stringifying this
            //to get codepoint count
            String t = bytesRef.utf8ToString();
            int len = t.codePointCount(0, t.length());
            //each occurrence of the token contributes its codepoint length
            for (int i = 0; i < tf; i++) {
                summStats.addValue(len);
            }
            //accumulate p*log2(p); sign and normalization are applied below
            p = (double) tf / (double) tokenCount;
            ent += p * FastMath.log(base, p);

            //keep only the topN most frequent tokens
            if (queue.top() == null || queue.size() < topN ||
                    tf >= queue.top().getValue()) {
                queue.insertWithOverflow(new TokenIntPair(t, tf));
            }
            uniqueTokenCount++;
            bytesRef = termsEnum.next();
        }
        //negate and normalize the entropy by the total token count
        if (tokenCountInt > 0) {
            ent = (-1.0d / (double) tokenCountInt) * ent;
        }

        fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt,
                queue.getArray(), ent, summStats));
    }

    public void setTopN(int topN) {
        this.topN = topN;
    }

    public TokenStatistics getTokenStatistics(String field) {
        return fieldStats.get(field);
    }

    public Terms getTerms(String field) throws IOException {
        return leafReader.terms(field);
    }

    public void clear() {
        memoryIndex.reset();
        fieldStats.clear();
    }

/*
    public ContrastStatistics contrast(String fieldA, String fieldB) throws IOException {

        long diceDenom = getUniqueTokenCount(fieldA) +
                getUniqueTokenCount(fieldB);

        long diceNum = 0;
        long overlapNum = 0;

        Terms termsA = getTerms(fieldA);
        Terms termsB = getTerms(fieldB);

        TermsEnum termsEnumA = termsA.iterator();
        TermsEnum termsEnumB = termsB.iterator();

        BytesRef bytesRefA = termsEnumA.next();
        BytesRef bytesRefB = termsEnumB.next();

        while (bytesRefA != null) {
            int compare = bytesRefA.compareTo(bytesRefB);
            while (compare > 0) {
                if (bytesRefB == null) {
                    break;
                }
                //handle term in B, but not A
                compare = bytesRefA.compareTo(bytesRefB);
                bytesRefB = termsEnumB.next();
            }
            if (compare == 0) {
                diceNum += 2;
                overlapNum += 2 * Math.min(termsEnumA.totalTermFreq(),
                        termsEnumB.totalTermFreq());
            }
            bytesRefA = termsEnumA.next();
        }

        for (PairCount p : tokens.values()) {
            if (p.a > 0 && p.b > 0) {
                diceNum += 2;
                overlapNum += 2 * Math.min(p.a, p.b);
            }
        }

        float dice = (float) diceNum / (float) diceDenom;
        float overlap = (float) overlapNum /
                (float) (theseTokens.getTokenCount() + thoseTokens.getTokenCount());
    }
*/
}
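/*
 * Note on the commented-out contrast() above: it merges two sorted TermsEnums
 * to compute Dice and overlap similarity, but as written it dereferences
 * bytesRefB (via compareTo) before checking it for null, and the trailing
 * PairCount/theseTokens/thoseTokens references are leftovers from an earlier,
 * non-Lucene implementation. A hedged sketch of the intended merge loop,
 * assuming the surrounding method is completed:
 *
 *   while (bytesRefA != null) {
 *       //advance B past terms that sort before the current A term
 *       while (bytesRefB != null && bytesRefA.compareTo(bytesRefB) > 0) {
 *           bytesRefB = termsEnumB.next();
 *       }
 *       if (bytesRefB != null && bytesRefA.compareTo(bytesRefB) == 0) {
 *           diceNum += 2;
 *           overlapNum += 2 * Math.min(termsEnumA.totalTermFreq(),
 *                   termsEnumB.totalTermFreq());
 *       }
 *       bytesRefA = termsEnumA.next();
 *   }
 */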