package org.apache.lucene.search.concordance.windowvisitor; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.lucene.corpus.stats.TermDFTF; import org.apache.lucene.util.mutable.MutableValueInt; /** * The ArrayWindowSearcher must visit all windows in a document before * moving on to a new document. If (for some unforeseen reason...multithreading?), * the Searcher visits two windows in doc1, a window in doc2 and then another window in doc1, * the doc frequency counts will double count the targets in doc1. */ public class TargetVisitor extends ArrayWindowVisitor<List<TermDFTF>> { private final static String JOINER = " "; private final int numResults; private final Map<String, MutableValueInt> tf = new HashMap<>(); private final Map<String, MutableValueInt> df = new HashMap<>(); private String lastDocId = null; //cache of terms seen in current doc //this is reset with each new doc private final Set<String> seenInThisDoc = new HashSet<>(); public TargetVisitor(String fieldName, int numResults, boolean analyzeTarget, int maxWindows) { super(fieldName, 0, 0, true, analyzeTarget, maxWindows); this.numResults = numResults; } @Override public void visit(String docId, ConcordanceArrayWindow window) throws IOException { if (getNumWindowsVisited() >= getMaxWindows()) { setHitMax(true); return; } //will throw NPE if docId is null if (lastDocId != null && !lastDocId.equals(docId)) { seenInThisDoc.clear(); } String targ = ""; StringBuilder sb = new StringBuilder(); List<String> parts = window.getRawTargList(); if (parts.size() == 0) { targ = ""; } else { sb.append(ConcordanceArrayWindow.tokenToString(parts.get(0))); for (int i = 1; i < parts.size(); i++) { sb.append(JOINER).append(ConcordanceArrayWindow.tokenToString(parts.get(i))); } targ = sb.toString(); } MutableValueInt cnt = tf.get(targ); if (cnt == null) { cnt = new MutableValueInt(); cnt.value = 1; } else { cnt.value++; } tf.put(targ, cnt); if (!seenInThisDoc.contains(targ)) { cnt = df.get(targ); if (cnt == null) { cnt = new MutableValueInt(); cnt.value = 1; } else { cnt.value++; } df.put(targ, cnt); } seenInThisDoc.add(targ); lastDocId = docId; finishedVisit(docId, true); } @Override public List<TermDFTF> getResults() { List<TermDFTF> list = new ArrayList<>(); for (Map.Entry<String, MutableValueInt> entry : df.entrySet()) { String key = entry.getKey(); int docFreq = entry.getValue().value; MutableValueInt mutTF = tf.get(key); int termFreq = (mutTF == null) ? 0 : mutTF.value; list.add(new TermDFTF(key, docFreq, termFreq)); } Collections.sort(list); //if list is short enough, return now if (list.size() <= numResults) { return list; } //copy over only the required results List<TermDFTF> ret = new ArrayList<>(); int i = 0; for (TermDFTF t : list) { if (i++ >= numResults) { break; } ret.add(t); } return ret; } public int getUniqTermCounts() { return tf.keySet().size(); } }