/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.request;

import org.apache.lucene.search.FieldCache;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.PagedBytes;
import org.apache.noggit.CharArr;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.SolrException;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.*;
import org.apache.solr.util.LongPriorityQueue;
import org.apache.solr.util.PrimUtils;
import org.apache.solr.util.BoundedTreeSet;
import org.apache.solr.handler.component.StatsValues;
import org.apache.solr.handler.component.FieldFacetStats;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Bits;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Comparator;
import java.util.concurrent.atomic.AtomicLong;

/**
 * Final form of the un-inverted field:
 *   Each document points to a list of term numbers that are contained in that document.
 *
 *   Term numbers are in sorted order, and are encoded as variable-length deltas from the
 *   previous term number.  Real term numbers start at 2 since 0 and 1 are reserved.  A
 *   term number of 0 signals the end of the termNumber list.
 *
 *   There is a single int[maxDoc()] which either contains a pointer into a byte[] for
 *   the termNumber lists, or directly contains the termNumber list if it fits in the 4
 *   bytes of an integer.  If the first byte in the integer is 1, the next 3 bytes
 *   are a pointer into a byte[] where the termNumber list starts.
 *
 *   There are actually 256 byte arrays, to compensate for the fact that the pointers
 *   into the byte arrays are only 3 bytes long.  The correct byte array for a document
 *   is a function of its id.
 *
 *   To save space and speed up faceting, any term that matches enough documents will
 *   not be un-inverted... it will be skipped while building the un-inverted field structure,
 *   and will use a set intersection method during faceting.
 *
 *   To further save memory, the terms (the actual string values) are not all stored in
 *   memory, but a TermIndex is used to convert term numbers to term values only
 *   for the terms needed after faceting has completed.
 *   Only every 128th term value is stored, along with its corresponding term number, and
 *   this is used as an index to find the closest term and iterate until the desired
 *   number is hit (very much like Lucene's own internal term index).
 */
public class UnInvertedField {
  private static int TNUM_OFFSET=2;

  static class TopTerm {
    BytesRef term;
    int termNum;

    long memSize() {
      return 8 +   // obj header
          8 + 8 + term.length +  // term
          4;  // int
    }
  }

  String field;
  int numTermsInField;
  int termsInverted;  // number of unique terms that were un-inverted
  long termInstances; // total number of references to term numbers
  final TermIndex ti;
  long memsz;
  int total_time;   // total time to uninvert the field
  int phase1_time;  // time for phase1 of the uninvert process
  final AtomicLong use = new AtomicLong(); // number of uses

  int[] index;
  byte[][] tnums = new byte[256][];
  int[] maxTermCounts;
  final Map<Integer,TopTerm> bigTerms = new LinkedHashMap<Integer,TopTerm>();

  public long memSize() {
    // can cache the mem size since it shouldn't change
    if (memsz!=0) return memsz;
    long sz = 8*8 + 32; // local fields
    sz += bigTerms.size() * 64;
    for (TopTerm tt : bigTerms.values()) {
      sz += tt.memSize();
    }
    if (index != null) sz += index.length * 4;
    if (tnums!=null) {
      for (byte[] arr : tnums)
        if (arr != null) sz += arr.length;
    }
    if (maxTermCounts != null)
      sz += maxTermCounts.length * 4;
    sz += ti.memSize();
    memsz = sz;
    return sz;
  }

  /** Number of bytes to represent an unsigned int as a vint. */
  static int vIntSize(int x) {
    if ((x & (0xffffffff << (7*1))) == 0 ) {
      return 1;
    }
    if ((x & (0xffffffff << (7*2))) == 0 ) {
      return 2;
    }
    if ((x & (0xffffffff << (7*3))) == 0 ) {
      return 3;
    }
    if ((x & (0xffffffff << (7*4))) == 0 ) {
      return 4;
    }
    return 5;
  }

  // todo: if we know the size of the vInt already, we could do
  // a single switch on the size
  static int writeInt(int x, byte[] arr, int pos) {
    int a;
    a = (x >>> (7*4));
    if (a != 0) {
      arr[pos++] = (byte)(a | 0x80);
    }
    a = (x >>> (7*3));
    if (a != 0) {
      arr[pos++] = (byte)(a | 0x80);
    }
    a = (x >>> (7*2));
    if (a != 0) {
      arr[pos++] = (byte)(a | 0x80);
    }
    a = (x >>> (7*1));
    if (a != 0) {
      arr[pos++] = (byte)(a | 0x80);
    }
    arr[pos++] = (byte)(x & 0x7f);
    return pos;
  }

  public UnInvertedField(String field, SolrIndexSearcher searcher) throws IOException {
    this.field = field;
    this.ti = new TermIndex(field,
        TrieField.getMainValuePrefix(searcher.getSchema().getFieldType(field)));
    uninvert(searcher);
  }

  private void uninvert(SolrIndexSearcher searcher) throws IOException {
    long startTime = System.currentTimeMillis();

    IndexReader reader = searcher.getReader();
    int maxDoc = reader.maxDoc();

    int[] index = new int[maxDoc];       // immediate term numbers, or the index into the byte[] representing the last number
    this.index = index;
    final int[] lastTerm = new int[maxDoc];    // last term we saw for this document
    final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)
    maxTermCounts = new int[1024];

    NumberedTermsEnum te = ti.getEnumerator(reader);

    // threshold, over which we use set intersections instead of counting
    // to (1) save memory, and (2) speed up faceting.
    // Add 2 for testing purposes so that there will always be some terms under
    // the threshold even when the index is very small.
    int threshold = maxDoc / 20 + 2;
    // threshold = 2000000000; //////////////////////////////// USE FOR TESTING

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
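    // Descriptive note (added): tempArr is a scratch buffer used when an inlined int value
    // overflows into a per-document byte[].  Nine bytes is the worst case: up to 4 bytes of
    // previously inlined vInt data plus up to 5 bytes for the newly written vInt delta.
    // As a rough illustration (hypothetical values): a document containing term numbers
    // 3, 10 and 12 is stored as the vInt deltas 5, 9 and 4 (each delta is offset by
    // TNUM_OFFSET=2 so that 0 and 1 stay reserved), followed by a 0 terminator in the
    // final form.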
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.

    SolrIndexSearcher.DocsEnumState deState = null;

    for (;;) {
      BytesRef t = te.term();
      if (t==null) break;

      int termNum = te.getTermNumber();

      if (termNum >= maxTermCounts.length) {
        // resize by doubling - for very large number of unique terms, expanding
        // by 4K and resultant GC will dominate uninvert times.  Resize at end if material
        int[] newMaxTermCounts = new int[maxTermCounts.length*2];
        System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, termNum);
        maxTermCounts = newMaxTermCounts;
      }

      int df = te.docFreq();
      if (df >= threshold) {
        TopTerm topTerm = new TopTerm();
        topTerm.term = new BytesRef(t);
        topTerm.termNum = termNum;
        bigTerms.put(topTerm.termNum, topTerm);

        if (deState == null) {
          deState = new SolrIndexSearcher.DocsEnumState();
          deState.termsEnum = te.tenum;
          deState.reuse = te.docsEnum;
        }
        DocSet set = searcher.getDocSet(new TermQuery(new Term(ti.field, topTerm.term)), deState);
        te.docsEnum = deState.reuse;

        maxTermCounts[termNum] = set.size();

        te.next();
        continue;
      }

      termsInverted++;

      DocsEnum docsEnum = te.getDocsEnum();

      DocsEnum.BulkReadResult bulkResult = docsEnum.getBulkResult();

      for(;;) {
        int n = docsEnum.read();
        if (n <= 0) break;

        maxTermCounts[termNum] += n;

        for (int i=0; i<n; i++) {
          termInstances++;
          int doc = bulkResult.docs.ints[i];
          // add 2 to the term number to make room for special reserved values:
          // 0 (end term) and 1 (index into byte array follows)
          int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
          lastTerm[doc] = termNum;
          int val = index[doc];

          if ((val & 0xff)==1) {
            // index into byte array (actually the end of
            // the doc-specific byte[] when building)
            int pos = val >>> 8;
            int ilen = vIntSize(delta);
            byte[] arr = bytes[doc];
            int newend = pos+ilen;
            if (newend > arr.length) {
              // We avoid a doubling strategy to lower memory usage.
              // this faceting method isn't for docs with many terms.
              // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
              // TODO: figure out what array lengths we can round up to w/o actually using more memory
              // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
              // It should be safe to round up to the nearest 32 bits in any case.
              int newLen = (newend + 3) & 0xfffffffc;  // 4 byte alignment
              byte[] newarr = new byte[newLen];
              System.arraycopy(arr, 0, newarr, 0, pos);
              arr = newarr;
              bytes[doc] = newarr;
            }
            pos = writeInt(delta, arr, pos);
            index[doc] = (pos<<8) | 1;  // update pointer to end index in byte[]
          } else {
            // OK, this int has data in it... find the end (a zero starting byte - not
            // part of another number, hence not following a byte with the high bit set).
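            // Descriptive note (added): ipos below is the number of bytes already occupied
            // by inlined vInt data.  For example (hypothetical value), val == 0x00000005
            // holds a single one-byte vInt, so ipos becomes 1 and the next delta is
            // appended starting at byte offset 1.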
            int ipos;
            if (val==0) {
              ipos=0;
            } else if ((val & 0x0000ff80)==0) {
              ipos=1;
            } else if ((val & 0x00ff8000)==0) {
              ipos=2;
            } else if ((val & 0xff800000)==0) {
              ipos=3;
            } else {
              ipos=4;
            }

            int endPos = writeInt(delta, tempArr, ipos);
            if (endPos <= 4) {
              // value will fit in the integer... move bytes back
              for (int j=ipos; j<endPos; j++) {
                val |= (tempArr[j] & 0xff) << (j<<3);
              }
              index[doc] = val;
            } else {
              // value won't fit... move integer into byte[]
              for (int j=0; j<ipos; j++) {
                tempArr[j] = (byte)val;
                val >>>=8;
              }
              // point at the end index in the byte[]
              index[doc] = (endPos<<8) | 1;
              bytes[doc] = tempArr;
              tempArr = new byte[12];
            }
          }
        }
      }

      te.next();
    }

    numTermsInField = te.getTermNumber();
    te.close();

    // free space if outrageously wasteful (tradeoff memory/cpu)
    if ((maxTermCounts.length - numTermsInField) > 1024) { // too much waste!
      int[] newMaxTermCounts = new int[numTermsInField];
      System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, numTermsInField);
      maxTermCounts = newMaxTermCounts;
    }

    long midPoint = System.currentTimeMillis();

    if (termInstances == 0) {
      // we didn't invert anything
      // lower memory consumption.
      index = this.index = null;
      tnums = null;
    } else {

      //
      // transform intermediate form into the final form, building a single byte[]
      // at a time, and releasing the intermediate byte[]s as we go to avoid
      // increasing the memory footprint.
      //
      for (int pass = 0; pass<256; pass++) {
        byte[] target = tnums[pass];
        int pos=0;  // end in target;
        if (target != null) {
          pos = target.length;
        } else {
          target = new byte[4096];
        }

        // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
        // where pp is the pass (which array we are building), and xx is all values.
        // each pass shares the same byte[] for termNumber lists.
        for (int docbase = pass<<16; docbase<maxDoc; docbase+=(1<<24)) {
          int lim = Math.min(docbase + (1<<16), maxDoc);
          for (int doc=docbase; doc<lim; doc++) {
            int val = index[doc];
            if ((val&0xff) == 1) {
              int len = val >>> 8;
              index[doc] = (pos<<8)|1; // change index to point to start of array
              if ((pos & 0xff000000) != 0) {
                // we only have 24 bits for the array index
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "Too many values for UnInvertedField faceting on field "+field);
              }
              byte[] arr = bytes[doc];
              bytes[doc] = null;        // IMPORTANT: allow GC to avoid OOM
              if (target.length <= pos + len) {
                int newlen = target.length;
                /*** we don't have to worry about the array getting too large
                 * since the "pos" param will overflow first (only 24 bits available)
                if ((newlen<<1) <= 0) {
                  // overflow...
                  newlen = Integer.MAX_VALUE;
                  if (newlen <= pos + len) {
                    throw new SolrException(400,"Too many terms to uninvert field!");
                  }
                } else {
                  while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                }
                ****/
                while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                byte[] newtarget = new byte[newlen];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
              }
              System.arraycopy(arr, 0, target, pos, len);
              pos += len + 1;  // skip single byte at end and leave it 0 for terminator
            }
          }
        }

        // shrink array
        if (pos < target.length) {
          byte[] newtarget = new byte[pos];
          System.arraycopy(target, 0, newtarget, 0, pos);
          target = newtarget;
          if (target.length > (1<<24)*.9) {
            SolrCore.log.warn("Approaching too many values for UnInvertedField faceting on field '"+field+"' : bucket size=" + target.length);
          }
        }

        tnums[pass] = target;

        if ((pass << 16) > maxDoc)
          break;
      }
    }

    long endTime = System.currentTimeMillis();

    total_time = (int)(endTime-startTime);
    phase1_time = (int)(midPoint-startTime);

    SolrCore.log.info("UnInverted multi-valued field " + toString());
  }

  public NamedList getCounts(SolrIndexSearcher searcher, DocSet baseDocs, int offset, int limit, Integer mincount, boolean missing, String sort, String prefix) throws IOException {
    use.incrementAndGet();

    FieldType ft = searcher.getSchema().getFieldType(field);

    NamedList res = new NamedList();  // order is important

    DocSet docs = baseDocs;
    int baseSize = docs.size();
    int maxDoc = searcher.maxDoc();

    if (baseSize >= mincount) {

      final int[] index = this.index;
      // tricky: we add one more element than we need because we will reuse this array later
      // for ordering term ords before converting to term labels.
      final int[] counts = new int[numTermsInField + 1];

      //
      // If there is a prefix, find its start and end term numbers
      //
      int startTerm = 0;
      int endTerm = numTermsInField;  // one past the end

      NumberedTermsEnum te = ti.getEnumerator(searcher.getReader());
      if (prefix != null && prefix.length() > 0) {
        te.skipTo(new BytesRef(prefix));
        startTerm = te.getTermNumber();
        te.skipTo(new BytesRef(prefix + "\uffff\uffff\uffff\uffff"));
        endTerm = te.getTermNumber();
      }

      /***********
      // Alternative 2: get the docSet of the prefix (could take a while) and
      // then do the intersection with the baseDocSet first.
      if (prefix != null && prefix.length() > 0) {
        docs = searcher.getDocSet(new ConstantScorePrefixQuery(new Term(field, ft.toInternal(prefix))), docs);
        // The issue with this method is the problem of returning 0 counts for terms w/o
        // the prefix.  We can't just filter out those terms later because it may
        // mean that we didn't collect enough terms in the queue (in the sorted case).
      }
      ***********/

      boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0
          && startTerm==0 && endTerm==numTermsInField
          && docs instanceof BitDocSet;

      if (doNegative) {
        OpenBitSet bs = (OpenBitSet)((BitDocSet)docs).getBits().clone();
        bs.flip(0, maxDoc);
        // TODO: when iterator across negative elements is available, use that
        // instead of creating a new bitset and inverting.
        docs = new BitDocSet(bs, maxDoc - baseSize);
        // simply negating will mean that we have deleted docs in the set.
        // that should be OK, as their entries in our table should be empty.
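        // Descriptive note (added): with the doc set inverted, counts are later reported as
        // maxTermCounts[i] - counts[i], i.e. (docs containing term i) minus (matching docs
        // in the complement), which equals the count within the original base set.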
      }

      // For the biggest terms, do straight set intersections
      for (TopTerm tt : bigTerms.values()) {
        // TODO: counts could be deferred if sorted==false
        if (tt.termNum >= startTerm && tt.termNum < endTerm) {
          counts[tt.termNum] = searcher.numDocs(new TermQuery(new Term(ti.field, tt.term)), docs);
        }
      }

      // TODO: we could short-circuit counting altogether for sorted faceting
      // where we already have enough terms from the bigTerms

      // TODO: we could shrink the size of the collection array, and
      // additionally break when the termNumber got above endTerm, but
      // it would require two extra conditionals in the inner loop (although
      // they would be predictable for the non-prefix case).
      // Perhaps a different copy of the code would be warranted.

      if (termInstances > 0) {
        DocIterator iter = docs.iterator();
        while (iter.hasNext()) {
          int doc = iter.nextDoc();
          int code = index[doc];

          if ((code & 0xff)==1) {
            int pos = code>>>8;
            int whichArray = (doc >>> 16) & 0xff;
            byte[] arr = tnums[whichArray];
            int tnum = 0;
            for(;;) {
              int delta = 0;
              for(;;) {
                byte b = arr[pos++];
                delta = (delta << 7) | (b & 0x7f);
                if ((b & 0x80) == 0) break;
              }
              if (delta == 0) break;
              tnum += delta - TNUM_OFFSET;
              counts[tnum]++;
            }
          } else {
            int tnum = 0;
            int delta = 0;
            for (;;) {
              delta = (delta << 7) | (code & 0x7f);
              if ((code & 0x80)==0) {
                if (delta==0) break;
                tnum += delta - TNUM_OFFSET;
                counts[tnum]++;
                delta = 0;
              }
              code >>>= 8;
            }
          }
        }
      }

      CharArr spare = new CharArr();

      int off=offset;
      int lim=limit>=0 ? limit : Integer.MAX_VALUE;

      if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
        int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
        maxsize = Math.min(maxsize, numTermsInField);
        LongPriorityQueue queue = new LongPriorityQueue(Math.min(maxsize,1000), maxsize, Long.MIN_VALUE);

        int min=mincount-1;  // the smallest value in the top 'N' values
        for (int i=startTerm; i<endTerm; i++) {
          int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
          if (c>min) {
            // NOTE: we use c>min rather than c>=min as an optimization because we are going in
            // index order, so we already know that the keys are ordered.  This can be very
            // important if a lot of the counts are repeated (like zero counts would be).

            // smaller term numbers sort higher, so subtract the term number instead
            long pair = (((long)c)<<32) + (Integer.MAX_VALUE - i);
            boolean displaced = queue.insert(pair);
            if (displaced) min=(int)(queue.top() >>> 32);
          }
        }

        // now select the right page from the results

        // if we are deep paging, we don't have to order the highest "offset" counts.
        int collectCount = Math.max(0, queue.size() - off);
        assert collectCount <= lim;

        // the start and end indexes of our list "sorted" (starting with the highest value)
        int sortedIdxStart = queue.size() - (collectCount - 1);
        int sortedIdxEnd = queue.size() + 1;
        final long[] sorted = queue.sort(collectCount);

        final int[] indirect = counts;  // reuse the counts array for the index into the tnums array
        assert indirect.length >= sortedIdxEnd;

        for (int i=sortedIdxStart; i<sortedIdxEnd; i++) {
          long pair = sorted[i];
          int c = (int)(pair >>> 32);
          int tnum = Integer.MAX_VALUE - (int)pair;

          indirect[i] = i;   // store the index for indirect sorting
          sorted[i] = tnum;  // reuse the "sorted" array to store the term numbers for indirect sorting

          // add a null label for now... we'll fill it in later.
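          // Descriptive note (added): each queue entry packs the count into the upper 32 bits
          // and (Integer.MAX_VALUE - termNum) into the lower 32 bits, so entries with equal
          // counts order by ascending term number.  The label is left null here and filled in
          // below, once the surviving term numbers have been sorted so the TermIndex can be
          // walked in increasing term order.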
          res.add(null, c);
        }

        // now sort the indexes by the term numbers
        PrimUtils.sort(sortedIdxStart, sortedIdxEnd, indirect, new PrimUtils.IntComparator() {
          @Override
          public int compare(int a, int b) {
            return (int)sorted[a] - (int)sorted[b];
          }

          @Override
          public boolean lessThan(int a, int b) {
            return sorted[a] < sorted[b];
          }

          @Override
          public boolean equals(int a, int b) {
            return sorted[a] == sorted[b];
          }
        });

        // convert the term numbers to term values and set as the label
        for (int i=sortedIdxStart; i<sortedIdxEnd; i++) {
          int idx = indirect[i];
          int tnum = (int)sorted[idx];
          String label = getReadableValue(getTermValue(te, tnum), ft, spare);
          res.setName(idx - sortedIdxStart, label);
        }

      } else {
        // add results in index order
        int i=startTerm;
        if (mincount<=0) {
          // if mincount<=0, then we won't discard any terms and we know exactly
          // where to start.
          i=startTerm+off;
          off=0;
        }

        for (; i<endTerm; i++) {
          int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
          if (c<mincount || --off>=0) continue;
          if (--lim<0) break;

          String label = getReadableValue(getTermValue(te, i), ft, spare);
          res.add(label, c);
        }
      }

      te.close();
    }

    if (missing) {
      // TODO: a faster solution for this?
      res.add(null, SimpleFacets.getFieldMissingCount(searcher, baseDocs, field));
    }

    return res;
  }

  /**
   * Collect statistics about the UninvertedField.  Code is very similar to
   * {@link #getCounts(org.apache.solr.search.SolrIndexSearcher, org.apache.solr.search.DocSet, int, int, Integer, boolean, String, String)}.
   * It can be used to calculate stats on multivalued fields.
   * <p/>
   * This method is mainly used by the {@link org.apache.solr.handler.component.StatsComponent}.
   *
   * @param searcher The Searcher to use to gather the statistics
   * @param baseDocs The {@link org.apache.solr.search.DocSet} to gather the stats on
   * @param facet One or more fields to facet on.
   * @return The {@link org.apache.solr.handler.component.StatsValues} collected
   * @throws IOException
   */
  public StatsValues getStats(SolrIndexSearcher searcher, DocSet baseDocs, String[] facet) throws IOException {
    // This function is ripped off nearly wholesale from the getCounts function, for use
    // with multiValued fields within the StatsComponent.
    // It may be useful to find common functionality between the two and refactor the code somewhat.
    use.incrementAndGet();

    StatsValues allstats = new StatsValues();

    DocSet docs = baseDocs;
    int baseSize = docs.size();
    int maxDoc = searcher.maxDoc();

    if (baseSize <= 0) return allstats;

    FieldType ft = searcher.getSchema().getFieldType(field);

    DocSet missing = docs.andNot( searcher.getDocSet(new TermRangeQuery(field, null, null, false, false)) );

    int i = 0;
    final FieldFacetStats[] finfo = new FieldFacetStats[facet.length];
    // Initialize facetstats, if facets have been passed in
    FieldCache.DocTermsIndex si;
    for (String f : facet) {
      FieldType facet_ft = searcher.getSchema().getFieldType(f);
      try {
        si = FieldCache.DEFAULT.getTermsIndex(searcher.getReader(), f);
      } catch (IOException e) {
        throw new RuntimeException("failed to open field cache for: " + f, e);
      }
      finfo[i] = new FieldFacetStats(f, si, facet_ft, numTermsInField);
      i++;
    }

    final int[] index = this.index;
    final int[] counts = new int[numTermsInField]; // keep track of the number of times we see each word in the field for all the documents in the docset

    NumberedTermsEnum te = ti.getEnumerator(searcher.getReader());

    boolean doNegative = false;
    if (finfo.length == 0) {
      // if we're collecting statistics with a facet field, we can't do inverted counting
      doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && docs instanceof BitDocSet;
    }

    if (doNegative) {
      OpenBitSet bs = (OpenBitSet) ((BitDocSet) docs).getBits().clone();
      bs.flip(0, maxDoc);
      // TODO: when iterator across negative elements is available, use that
      // instead of creating a new bitset and inverting.
      docs = new BitDocSet(bs, maxDoc - baseSize);
      // simply negating will mean that we have deleted docs in the set.
      // that should be OK, as their entries in our table should be empty.
    }

    // For the biggest terms, do straight set intersections
    for (TopTerm tt : bigTerms.values()) {
      // TODO: counts could be deferred if sorted==false
      if (tt.termNum >= 0 && tt.termNum < numTermsInField) {
        final Term t = new Term(ti.field, tt.term);
        if (finfo.length == 0) {
          counts[tt.termNum] = searcher.numDocs(new TermQuery(t), docs);
        } else {
          // COULD BE VERY SLOW
          // if we're collecting stats for facet fields, we need to iterate on all matching documents
          DocSet bigTermDocSet = searcher.getDocSet(new TermQuery(t)).intersection(docs);
          DocIterator iter = bigTermDocSet.iterator();
          while (iter.hasNext()) {
            int doc = iter.nextDoc();
            counts[tt.termNum]++;
            for (FieldFacetStats f : finfo) {
              f.facetTermNum(doc, tt.termNum);
            }
          }
        }
      }
    }

    if (termInstances > 0) {
      DocIterator iter = docs.iterator();
      while (iter.hasNext()) {
        int doc = iter.nextDoc();
        int code = index[doc];

        if ((code & 0xff) == 1) {
          int pos = code >>> 8;
          int whichArray = (doc >>> 16) & 0xff;
          byte[] arr = tnums[whichArray];
          int tnum = 0;
          for (;;) {
            int delta = 0;
            for (;;) {
              byte b = arr[pos++];
              delta = (delta << 7) | (b & 0x7f);
              if ((b & 0x80) == 0) break;
            }
            if (delta == 0) break;
            tnum += delta - TNUM_OFFSET;
            counts[tnum]++;
            for (FieldFacetStats f : finfo) {
              f.facetTermNum(doc, tnum);
            }
          }
        } else {
          int tnum = 0;
          int delta = 0;
          for (;;) {
            delta = (delta << 7) | (code & 0x7f);
            if ((code & 0x80) == 0) {
              if (delta == 0) break;
              tnum += delta - TNUM_OFFSET;
              counts[tnum]++;
              for (FieldFacetStats f : finfo) {
                f.facetTermNum(doc, tnum);
              }
              delta = 0;
            }
            code >>>= 8;
          }
        }
      }
    }

    // add results in index order
    CharArr spare = new CharArr();
    for (i = 0; i < numTermsInField; i++) {
      int c = doNegative ?
          maxTermCounts[i] - counts[i] : counts[i];
      if (c == 0) continue;
      String label = getReadableValue(getTermValue(te, i), ft, spare);
      // TODO: we should avoid this re-parse
      Double value = Double.parseDouble(label);
      allstats.accumulate(value, c);
      // as we've parsed the termnum into a value, let's also accumulate fieldfacet statistics
      for (FieldFacetStats f : finfo) {
        f.accumulateTermNum(i, value);
      }
    }
    te.close();

    int c = missing.size();
    allstats.addMissing(c);

    if (finfo.length > 0) {
      allstats.facets = new HashMap<String, Map<String, StatsValues>>();
      for (FieldFacetStats f : finfo) {
        Map<String, StatsValues> facetStatsValues = f.facetStatsValues;
        FieldType facetType = searcher.getSchema().getFieldType(f.name);
        for (Map.Entry<String,StatsValues> entry : facetStatsValues.entrySet()) {
          String termLabel = entry.getKey();
          int missingCount = searcher.numDocs(new TermQuery(new Term(f.name, facetType.toInternal(termLabel))), missing);
          entry.getValue().addMissing(missingCount);
        }
        allstats.facets.put(f.name, facetStatsValues);
      }
    }

    return allstats;
  }

  String getReadableValue(BytesRef termval, FieldType ft, CharArr spare) {
    if (spare == null) {
      spare = new CharArr();
    } else {
      spare.reset();
    }
    ft.indexedToReadable(termval, spare);
    return spare.toString();
  }

  /** may return a reused BytesRef */
  BytesRef getTermValue(NumberedTermsEnum te, int termNum) throws IOException {
    if (bigTerms.size() > 0) {
      // see if the term is one of our big terms.
      TopTerm tt = bigTerms.get(termNum);
      if (tt != null) {
        return tt.term;
      }
    }

    return te.skipTo(termNum);
  }

  public String toString() {
    return "{field=" + field
        + ",memSize="+memSize()
        + ",tindexSize="+ti.memSize()
        + ",time="+total_time
        + ",phase1="+phase1_time
        + ",nTerms="+numTermsInField
        + ",bigTerms="+bigTerms.size()
        + ",termInstances="+termInstances
        + ",uses="+use.get()
        + "}";
  }

  //////////////////////////////////////////////////////////////////
  //////////////////////////// caching /////////////////////////////
  //////////////////////////////////////////////////////////////////

  public static UnInvertedField getUnInvertedField(String field, SolrIndexSearcher searcher) throws IOException {
    SolrCache cache = searcher.getFieldValueCache();
    if (cache == null) {
      return new UnInvertedField(field, searcher);
    }

    UnInvertedField uif = (UnInvertedField)cache.get(field);
    if (uif == null) {
      synchronized (cache) {
        uif = (UnInvertedField)cache.get(field);
        if (uif == null) {
          uif = new UnInvertedField(field, searcher);
          cache.put(field, uif);
        }
      }
    }

    return uif;
  }
}


// How to share TermDocs (int[] score[])???
// How to share TermPositions?
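// Usage sketch (added, not part of the original API docs): a typical caller obtains the
// cached structure and facets with it, e.g.
//
//   UnInvertedField uif = UnInvertedField.getUnInvertedField("category", searcher);
//   NamedList counts = uif.getCounts(searcher, matchingDocs, 0 /*offset*/, 10 /*limit*/,
//                                    1 /*mincount*/, false /*missing*/,
//                                    FacetParams.FACET_SORT_COUNT, null /*prefix*/);
//
// where "category" and matchingDocs are hypothetical; the field name and DocSet come from
// the request being faceted.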
/***
class TermEnumListener {
  void doTerm(Term t) {
  }
  void done() {
  }
}
***/


class NumberedTermsEnum extends TermsEnum {
  protected final IndexReader reader;
  protected final TermIndex tindex;
  protected TermsEnum tenum;
  protected int pos=-1;
  protected BytesRef termText;
  protected DocsEnum docsEnum;
  protected Bits deletedDocs;


  NumberedTermsEnum(IndexReader reader, TermIndex tindex) throws IOException {
    this.reader = reader;
    this.tindex = tindex;
  }


  NumberedTermsEnum(IndexReader reader, TermIndex tindex, BytesRef termValue, int pos) throws IOException {
    this.reader = reader;
    this.tindex = tindex;
    this.pos = pos;
    Terms terms = MultiFields.getTerms(reader, tindex.field);
    deletedDocs = MultiFields.getDeletedDocs(reader);
    if (terms != null) {
      tenum = terms.iterator();
      tenum.seek(termValue);
      setTerm();
    }
  }

  @Override
  public Comparator<BytesRef> getComparator() throws IOException {
    return tenum.getComparator();
  }

  public DocsEnum getDocsEnum() throws IOException {
    docsEnum = tenum.docs(deletedDocs, docsEnum);
    return docsEnum;
  }

  protected BytesRef setTerm() throws IOException {
    termText = tenum.term();
    if (tindex.prefix != null && !termText.startsWith(tindex.prefix)) {
      termText = null;
    }
    return termText;
  }

  @Override
  public BytesRef next() throws IOException {
    pos++;
    if (tenum.next() == null) {
      termText = null;
      return null;
    }
    return setTerm();  // this is extra work if we know we are in bounds...
  }

  @Override
  public BytesRef term() {
    return termText;
  }

  @Override
  public int docFreq() {
    return tenum.docFreq();
  }

  public BytesRef skipTo(BytesRef target) throws IOException {

    // already here
    if (termText != null && termText.equals(target)) return termText;

    if (tenum == null) {
      return null;
    }

    int startIdx = Arrays.binarySearch(tindex.index,target);

    if (startIdx >= 0) {
      // we hit the term exactly... lucky us!
      TermsEnum.SeekStatus seekStatus = tenum.seek(target);
      assert seekStatus == TermsEnum.SeekStatus.FOUND;
      pos = startIdx << tindex.intervalBits;
      return setTerm();
    }

    // we didn't hit the term exactly
    startIdx=-startIdx-1;

    if (startIdx == 0) {
      // our target occurs *before* the first term
      TermsEnum.SeekStatus seekStatus = tenum.seek(target);
      assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND;
      pos = 0;
      return setTerm();
    }

    // back up to the start of the block
    startIdx--;

    if ((pos >> tindex.intervalBits) == startIdx && termText != null && termText.compareTo(target)<=0) {
      // we are already in the right block and the current term is before the term we want,
      // so we don't need to seek.
    } else {
      // seek to the right block
      TermsEnum.SeekStatus seekStatus = tenum.seek(tindex.index[startIdx]);
      assert seekStatus == TermsEnum.SeekStatus.FOUND;
      pos = startIdx << tindex.intervalBits;
      setTerm();  // should be non-null since it's in the index
    }

    while (termText != null && termText.compareTo(target) < 0) {
      next();
    }

    return termText;
  }

  public BytesRef skipTo(int termNumber) throws IOException {
    int delta = termNumber - pos;
    if (delta < 0 || delta > tindex.interval || tenum==null) {
      int idx = termNumber >>> tindex.intervalBits;
      BytesRef base = tindex.index[idx];
      pos = idx << tindex.intervalBits;
      delta = termNumber - pos;
      TermsEnum.SeekStatus seekStatus = tenum.seek(base);
      assert seekStatus == TermsEnum.SeekStatus.FOUND;
    }
    while (--delta >= 0) {
      BytesRef br = tenum.next();
      if (br == null) {
        termText = null;
        return null;
      }
      ++pos;
    }
    return setTerm();
  }

  protected void close() throws IOException {
    // no-op, needed so the anon subclass that does indexing
    // can build its index
  }

  /** The current term number, starting at 0.
   * Only valid if the previous call to next() or skipTo() returned true.
   */
  public int getTermNumber() {
    return pos;
  }

  @Override
  public long ord() {
    throw new UnsupportedOperationException();
  }

  @Override
  public SeekStatus seek(long ord) {
    throw new UnsupportedOperationException();
  }

  @Override
  public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
    throw new UnsupportedOperationException();
  }

  @Override
  public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) {
    throw new UnsupportedOperationException();
  }

  @Override
  public SeekStatus seek(BytesRef target, boolean useCache) {
    throw new UnsupportedOperationException();
  }
}


/**
 * Class to save memory by only storing every nth term (for random access), while
 * numbering the terms, allowing them to be retrieved later by number.
 * This is only valid when used with the IndexReader it was created with.
 * The IndexReader is not actually stored to facilitate caching by using it as a key in
 * a weak hash map.
 */
class TermIndex {
  final static int intervalBits = 7;  // decrease to a low number like 2 for testing
  final static int intervalMask = 0xffffffff >>> (32-intervalBits);
  final static int interval = 1 << intervalBits;

  final String field;
  final BytesRef prefix;
  BytesRef[] index;
  int nTerms;
  long sizeOfStrings;

  TermIndex(String field) {
    this(field, null);
  }

  TermIndex(String field, String prefix) {
    this.field = field;
    this.prefix = prefix == null ? null : new BytesRef(prefix);
  }

  NumberedTermsEnum getEnumerator(IndexReader reader, int termNumber) throws IOException {
    NumberedTermsEnum te = new NumberedTermsEnum(reader, this);
    te.skipTo(termNumber);
    return te;
  }

  /* The first time an enumerator is requested, it should be used
     with next() to fully traverse all of the terms so the index
     will be built.
   */
  NumberedTermsEnum getEnumerator(IndexReader reader) throws IOException {
    if (index==null) return new NumberedTermsEnum(reader, this, prefix==null ? new BytesRef() : prefix, 0) {
      ArrayList<BytesRef> lst;
      PagedBytes bytes;

      protected BytesRef setTerm() throws IOException {
        BytesRef br = super.setTerm();
        if (br != null && (pos & intervalMask)==0) {
          sizeOfStrings += br.length;
          if (lst==null) {
            lst = new ArrayList<BytesRef>();
            bytes = new PagedBytes(15);
          }
          BytesRef out = new BytesRef();
          bytes.copy(br, out);
          lst.add(out);
        }
        return br;
      }

      public BytesRef skipTo(int termNumber) throws IOException {
        throw new UnsupportedOperationException();
      }

      public void close() throws IOException {
        nTerms=pos;
        super.close();
        index = lst!=null ? lst.toArray(new BytesRef[lst.size()]) : new BytesRef[0];
      }
    };
    else return new NumberedTermsEnum(reader, this, new BytesRef(), 0);
  }

  /**
   * Returns the approximate amount of memory taken by this TermIndex.
   * This is only an approximation and doesn't take into account java object overhead.
   *
   * @return the approximate memory consumption in bytes
   */
  public long memSize() {
    // assume 8 byte references?
    return 8+8+8+8+(index.length<<3)+sizeOfStrings;
  }
}
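// Descriptive note (added): with intervalBits = 7 the TermIndex stores every 128th term.
// Looking up term number n therefore seeks to index[n >>> 7] (i.e. term number
// (n >>> 7) << 7) and then calls next() at most 127 times, mirroring skipTo(int) above.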