/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.tokens;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.commons.math3.util.FastMath;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BytesRef;

/**
 * Experimental class that uses Lucene's {@link MemoryIndex} to build
 * per-field token statistics: total and unique token counts, the top-N
 * most frequent tokens, entropy, and summary statistics of token lengths.
 */
public class LuceneTokenCounter {

    private static final String ALPHA_IDEOGRAPH_SUFFIX = "_a";

    private final LeafReader leafReader;
    private final MemoryIndex memoryIndex;
    private final Analyzer generalAnalyzer;

    private int topN = 10;

    Map<String, TokenStatistics> fieldStats = new HashMap<>();

    public LuceneTokenCounter(Analyzer generalAnalyzer) throws IOException {
        memoryIndex = new MemoryIndex();
        IndexSearcher searcher = memoryIndex.createSearcher();
        leafReader = (LeafReader) searcher.getIndexReader();
        this.generalAnalyzer = generalAnalyzer;
    }

    public void add(String field, String content) throws IOException {
        memoryIndex.addField(field, content, generalAnalyzer);
        //memoryIndex.addField(field + ALPHA_IDEOGRAPH_SUFFIX,
        //        content, alphaIdeographAnalyzer);
        count(field);
        //count(field + ALPHA_IDEOGRAPH_SUFFIX);
    }
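    /*
     * A minimal usage sketch (not part of the original class). The choice of
     * analyzer is an assumption; any Lucene Analyzer works here:
     *
     *   LuceneTokenCounter counter = new LuceneTokenCounter(new StandardAnalyzer());
     *   counter.add("content", "the quick brown fox jumps over the lazy dog");
     *   TokenStatistics stats = counter.getTokenStatistics("content");
     *   counter.clear(); //reset the MemoryIndex before the next document
     */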
    /**
     * Counts tokens for the given field and caches a {@link TokenStatistics}
     * in {@link #fieldStats}.
     */
    void count(String field) throws IOException {
        long tokenCount = leafReader.getSumTotalTermFreq(field);
        if (tokenCount > Integer.MAX_VALUE) {
            throw new IllegalArgumentException(
                    "total token count exceeds Integer.MAX_VALUE: " + tokenCount);
        }
        int tokenCountInt = (int) tokenCount;
        int uniqueTokenCount = 0;
        SummaryStatistics summStats = new SummaryStatistics();
        double ent = 0.0d;
        double p = 0.0d;
        double base = 2.0;
        Terms terms = leafReader.terms(field);
        if (terms == null) {
            //if there were no terms
            fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt,
                    new TokenIntPair[0], ent, summStats));
            return;
        }
        TermsEnum termsEnum = terms.iterator();
        BytesRef bytesRef = termsEnum.next();
        TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);

        while (bytesRef != null) {
            long termFreq = termsEnum.totalTermFreq();
            if (termFreq > Integer.MAX_VALUE) {
                throw new IllegalArgumentException(
                        "term frequency exceeds Integer.MAX_VALUE: " + termFreq);
            }
            int tf = (int) termFreq;
            //TODO: figure out how to avoid Stringifying this
            //to get codepoint count
            String t = bytesRef.utf8ToString();
            int len = t.codePointCount(0, t.length());
            //each occurrence of the token contributes its codepoint length
            for (int i = 0; i < tf; i++) {
                summStats.addValue(len);
            }
            //accumulate p*log2(p); sign and normalization are applied below
            p = (double) tf / (double) tokenCount;
            ent += p * FastMath.log(base, p);

            //keep only the topN most frequent tokens
            if (queue.top() == null || queue.size() < topN ||
                    tf >= queue.top().getValue()) {
                queue.insertWithOverflow(new TokenIntPair(t, tf));
            }
            uniqueTokenCount++;
            bytesRef = termsEnum.next();
        }
        //negate and normalize the entropy by the total token count
        if (tokenCountInt > 0) {
            ent = (-1.0d / (double) tokenCountInt) * ent;
        }

        fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt,
                queue.getArray(), ent, summStats));
    }

    public void setTopN(int topN) {
        this.topN = topN;
    }

    public TokenStatistics getTokenStatistics(String field) {
        return fieldStats.get(field);
    }

    public Terms getTerms(String field) throws IOException {
        return leafReader.terms(field);
    }

    public void clear() {
        memoryIndex.reset();
        fieldStats.clear();
    }

/*
    public ContrastStatistics contrast(String fieldA, String fieldB) throws IOException {

        long diceDenom = getUniqueTokenCount(fieldA) +
                getUniqueTokenCount(fieldB);

        long diceNum = 0;
        long overlapNum = 0;

        Terms termsA = getTerms(fieldA);
        Terms termsB = getTerms(fieldB);

        TermsEnum termsEnumA = termsA.iterator();
        TermsEnum termsEnumB = termsB.iterator();

        BytesRef bytesRefA = termsEnumA.next();
        BytesRef bytesRefB = termsEnumB.next();

        while (bytesRefA != null) {
            int compare = bytesRefA.compareTo(bytesRefB);
            while (compare > 0) {
                if (bytesRefB == null) {
                    break;
                }
                //handle term in B, but not A
                compare = bytesRefA.compareTo(bytesRefB);
                bytesRefB = termsEnumB.next();
            }
            if (compare == 0) {
                diceNum += 2;
                overlapNum += 2 * Math.min(termsEnumA.totalTermFreq(),
                        termsEnumB.totalTermFreq());
            }
            bytesRefA = termsEnumA.next();
        }

        for (PairCount p : tokens.values()) {
            if (p.a > 0 && p.b > 0) {
                diceNum += 2;
                overlapNum += 2 * Math.min(p.a, p.b);
            }
        }

        float dice = (float) diceNum / (float) diceDenom;
        float overlap = (float) overlapNum /
                (float) (theseTokens.getTokenCount() + thoseTokens.getTokenCount());
    }
*/
}
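/*
 * Note on the commented-out contrast() above: it merges two sorted TermsEnums
 * to compute Dice and overlap similarity, but as written it dereferences
 * bytesRefB (via compareTo) before checking it for null, and the trailing
 * PairCount/theseTokens/thoseTokens references are leftovers from an earlier,
 * non-Lucene implementation. A hedged sketch of the intended merge loop,
 * assuming the surrounding method is completed:
 *
 *   while (bytesRefA != null) {
 *       //advance B past terms that sort before the current A term
 *       while (bytesRefB != null && bytesRefA.compareTo(bytesRefB) > 0) {
 *           bytesRefB = termsEnumB.next();
 *       }
 *       if (bytesRefB != null && bytesRefA.compareTo(bytesRefB) == 0) {
 *           diceNum += 2;
 *           overlapNum += 2 * Math.min(termsEnumA.totalTermFreq(),
 *                   termsEnumB.totalTermFreq());
 *       }
 *       bytesRefA = termsEnumA.next();
 *   }
 */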