package org.apache.solr.request.uninverted; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.PriorityQueue; import java.util.zip.CRC32; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.solr.request.mdrill.MdrillUtils; import org.apache.solr.request.uninverted.UnInvertedFieldUtils.FieldDatatype; import org.apache.solr.request.uninverted.UnInvertedFieldUtils.MixTermInfo; import org.apache.solr.schema.IndexSchema; import org.apache.solr.search.BitDocSet; import org.apache.solr.search.DocIterator; import org.apache.solr.search.DocSet; import org.apache.solr.search.DocSetCollector; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * 历史的旧方法,通过遍历倒排表来实现,效率不高 * @author yannian.mu * */ public class MakeUnivertedFieldByIndex { public static Logger log = LoggerFactory.getLogger(MakeUnivertedFieldByIndex.class); private static int SKIP_STEP = 32; private static int SKIP_MIN = 64; private static int MAX_SKIP_COUNT = 102400; private static long TD_MAX_CMP_COUNT = 64000000l; private static int TD_LIMIT_MAX = 10240; private static int TD_LIMIT_MIN = 8; private int[] docs = new int[1000]; private int[] freqs = new int[1000]; private UnInvertedField uni; public MakeUnivertedFieldByIndex(UnInvertedField uni) throws IOException { this.uni=uni; } public void makeInit(BitDocSet baseAdvanceDocs,String field, IndexSchema schema,IndexReader reader) throws IOException { uni.init(field, reader, schema); uni.baseAdvanceDocs=UnInvertedField.ajustBase(48,baseAdvanceDocs, reader); if(this.uni.checkEmpty()) { return ; } log.info(" makeInit begin " + this.uni.field + " field " +",baseAdvanceDocs="+(this.uni.baseAdvanceDocs==null?"null":this.uni.baseAdvanceDocs.size())+"@"+(baseAdvanceDocs==null?"null":baseAdvanceDocs.size())); TermNumEnumerator te = uni.ti.getEnumerator(reader); int maxDoc = reader.maxDoc(); this.uni.startRamDocValue(maxDoc, reader, true); int limitsize=this.getLimitSize(); PriorityQueue<MixTermInfo> termDocslist=new PriorityQueue<MixTermInfo>(limitsize,Collections.reverseOrder(UnInvertedFieldUtils.TD_CMP)); TermDocs tdreader=reader.termDocs(1024); int maxTermNum=0; for (;;) { Term t = te.term(); if (t == null) { break; } int termNum = te.getTermNumber(); if(termNum%100000==0) { log.info("termsInverted " +termNum+"@"+ this.uni.field + ",limitsize=" + limitsize); } int df=te.docFreq(); if(!this.isFinish()) { TermDocs td = te.getTermDocs(); if(df<=SKIP_MIN||this.uni.baseAdvanceDocs==null) { td.seek(te); this.set_Doc2TermNum_NonSkip(td, docs, freqs, termNum,true,maxDoc); } else { MixTermInfo cl = new MixTermInfo(df, termNum, tdreader, new Term(t.field(), t.text())); if (termDocslist.size() < limitsize) { termDocslist.add(cl); } else { MixTermInfo peek = termDocslist.peek(); if (UnInvertedFieldUtils.TD_CMP.compare(peek, cl) > 0 && cl.getCount() / (peek.getCount() + 1) > 1.5) { termDocslist.add(cl); MixTermInfo cl_old = termDocslist.poll(); this.set_Doc2TermNum_NonSkip(cl_old.getTd(), docs,freqs, cl_old.getTermNum(), true, maxDoc); } else { td.seek(te); this.set_Doc2TermNum_NonSkip(td, docs, freqs,termNum, true, maxDoc); } } } } maxTermNum=Math.max(maxTermNum, termNum); this.setTermNumValue(t, termNum); te.next(); } this.PriorityQueue_skip_set(termDocslist, maxDoc,true); this.uni.endRamDocValue(true,maxTermNum); tdreader.close(); te.close(); this.uni.tnr = this.uni.ramDocValue.getDocReader(); this.setTdIndex_NULL(); } public void setTdIndex_NULL() throws IOException { if (this.uni.baseAdvanceDocs == null) { return; } log.info("setTdIndex_NULL :"+this.uni.baseAdvanceDocs.size()); DocIterator iter = this.uni.baseAdvanceDocs.iterator(); while (iter.hasNext()) { int doc = iter.nextDoc(); this.uni.bits.add(doc); this.uni.markDocTm(doc, this.uni.getNullTm(),false); } this.uni.baseAdvanceDocs=null; } public void addDoclist(BitDocSet baseAdvanceDocs,String field, IndexReader reader) throws IOException { if (uni.checkEmpty()) { return ; } BitDocSet tmp=null; if(baseAdvanceDocs!=null) { tmp=(BitDocSet) baseAdvanceDocs.andNot(this.uni.bits); if(tmp!=null&&tmp.size()<=0) { return ; } } this.uni.baseAdvanceDocs=UnInvertedField.ajustBase(48,tmp, reader); TermNumEnumerator te = uni.ti.getEnumerator(reader); log.info("addDoclist start " + this.uni.field +",baseAdvanceDocs="+(this.uni.baseAdvanceDocs==null?"null":this.uni.baseAdvanceDocs.size())+"@"+(baseAdvanceDocs==null?"null":baseAdvanceDocs.size())); int maxDoc=reader.maxDoc(); int limitsize=this.getLimitSize(); PriorityQueue<MixTermInfo> termDocslist=new PriorityQueue<MixTermInfo>(limitsize,Collections.reverseOrder(UnInvertedFieldUtils.TD_CMP)); TermDocs tdreader=reader.termDocs(1024); for (;;) { Term t = te.term(); if (t == null) { break; } if (this.isFinish()) { break; } int termNum = te.getTermNumber(); if (termNum%10000==0) { log.info("termsInverted " +termNum+"@"+ this.uni.field + ",limitsize=" + limitsize); } TermDocs td = te.getTermDocs(); int df = te.docFreq(); if (df <= SKIP_MIN || this.uni.baseAdvanceDocs == null) { td.seek(te); this.set_Doc2TermNum_NonSkip(td, docs, freqs, termNum, false, maxDoc); } else { MixTermInfo cl = new MixTermInfo(df, termNum, tdreader, new Term(t.field(), t.text())); if (termDocslist.size() < limitsize) { termDocslist.add(cl); } else { MixTermInfo peek = termDocslist.peek(); if (UnInvertedFieldUtils.TD_CMP.compare(peek, cl) > 0 && cl.getCount() / (peek.getCount() + 1) > 1.5) { termDocslist.add(cl); MixTermInfo cl_old = termDocslist.poll(); this.set_Doc2TermNum_NonSkip(cl_old.getTd(), docs, freqs, cl_old.getTermNum(), false, maxDoc); } else { td.seek(te); this.set_Doc2TermNum_NonSkip(td, docs, freqs, termNum, false, maxDoc); } } } te.next(); } this.PriorityQueue_skip_set(termDocslist, maxDoc,false); te.close(); this.setTdIndex_NULL(); } private boolean isFinish() { if(this.uni.baseAdvanceDocs==null) { return false; } return this.uni.baseAdvanceDocs.size()<=0; } private void cleanBase(DocSetCollector collect) { DocSet docsit=collect.getDocSet(); DocIterator toremove = docsit.iterator(); while (toremove.hasNext()) { int doc = toremove.nextDoc(); this.uni.baseAdvanceDocs.clear(doc); } } private int getLimitSize() { int limitsize=TD_LIMIT_MAX; if(this.uni.baseAdvanceDocs!=null) { limitsize=(int) Math.min(limitsize, TD_MAX_CMP_COUNT/(1+this.uni.baseAdvanceDocs.size())); } limitsize=Math.max(limitsize, TD_LIMIT_MIN); return limitsize; } private void set_Doc2TermNum_NonSkip(TermDocs td,int[] docs,int[] freqs,int termNum,boolean isinit,int maxDoc) throws IOException { if(this.uni.baseAdvanceDocs==null) { for (;;) { int n = td.read(docs, freqs); if (n <= 0) { break; } for (int i = 0; i < n; i++) { int docid=docs[i]; this.uni.bits.add(docid); this.uni.markDocTm(docid, termNum, isinit); } } return ; } DocSetCollector collect=new DocSetCollector(10240, maxDoc); for (;;) { int n = td.read(docs, freqs); if (n <= 0) { break; } for (int i = 0; i < n; i++) { int docid=docs[i]; collect.collect(docid); this.uni.bits.add(docid); this.uni.markDocTm(docid, termNum, isinit); } } this.cleanBase(collect); } private int set_Doc2TermNum_Skip(TermDocs td,int[] docs,int[] freqs,int termNum,boolean isinit,int maxDoc) throws IOException { int skipcount=0; DocSetCollector collect=new DocSetCollector(10240, maxDoc); DocIterator iter = this.uni.baseAdvanceDocs.iterator(); int doc=-1; int baseDoc=-1; while (iter.hasNext()) { doc = iter.nextDoc(); if(doc<baseDoc) { continue; } if(doc>baseDoc) { int diff=doc-baseDoc; if(diff>=SKIP_STEP) { if(baseDoc>=0) { skipcount++; } baseDoc=UnInvertedFieldUtils.advance(td, doc); if(UnInvertedFieldUtils.NO_MORE_DOCS==baseDoc) { break; } }else{ boolean is_no_more_docs=false; for(int i=0;i<=SKIP_STEP;i++) { if(td.next()) { baseDoc=td.doc(); if(baseDoc>=doc) { break; } collect.collect(baseDoc); this.uni.bits.add(baseDoc); this.uni.markDocTm(baseDoc, termNum, isinit); }else{ is_no_more_docs=true; break; } } if(is_no_more_docs) { break; } } } if(baseDoc==doc) { this.uni.bits.add(doc); collect.collect(doc); this.uni.markDocTm(doc, termNum, isinit); } } this.cleanBase(collect); return skipcount; } private int PriorityQueue_skip_set(PriorityQueue<MixTermInfo> termDocslist,int maxDoc,boolean isinit) throws IOException { int skipcount=0; ArrayList<MixTermInfo> sorted=new ArrayList<MixTermInfo>(termDocslist.size()); sorted.addAll(termDocslist); Collections.sort(sorted, UnInvertedFieldUtils.TD_CMP_TM); for(MixTermInfo cl:sorted) { if(this.isFinish()) { break; } if(skipcount>MAX_SKIP_COUNT) { this.set_Doc2TermNum_NonSkip(cl.getTd(), docs, freqs,cl.getTermNum(),isinit,maxDoc); }else{ skipcount+=this.set_Doc2TermNum_Skip(cl.getTd(), docs, freqs, cl.getTermNum(),isinit,maxDoc); } } return skipcount; } private void setTermNumValue(Term t,int termNum) { if (this.uni.fieldDataType == FieldDatatype.d_long) { this.uni.setTmValueLong(termNum,Long.parseLong(this.uni.ft.indexedToReadable(t.text()))) ; } else if (this.uni.fieldDataType == FieldDatatype.d_double) { this.uni.setTmValueDouble(termNum, MdrillUtils.ParseDouble(this.uni.ft.indexedToReadable(t.text()))); } else if (this.uni.fieldDataType == FieldDatatype.d_string) {// for dist CRC32 crc32 = new CRC32(); crc32.update(new String(this.uni.ft.indexedToReadable(t.text())).getBytes()); this.uni.setTmValueLong(termNum, crc32.getValue()); } } }