package com.senseidb.clue.commands; import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; import java.util.List; import java.util.Map.Entry; import java.util.TreeMap; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import com.senseidb.clue.ClueContext; public class ReconstructCommand extends ClueCommand { public ReconstructCommand(ClueContext ctx) { super(ctx); } @Override public String getName() { return "reconstruct"; } @Override public String help() { return "reconstructs an indexed field for a document"; } public String reconstructWithPositions(TermsEnum te, int docid, Bits liveDocs) throws IOException{ TreeMap<Integer,List<String>> docTextMap = new TreeMap<Integer,List<String>>(); BytesRef text; PostingsEnum postings = null; while ((text = te.next()) != null) { postings = te.postings(postings, PostingsEnum.FREQS | PostingsEnum.POSITIONS); int iterDoc = postings.advance(docid); if (iterDoc == docid) { int freq = postings.freq(); for (int i = 0; i < freq; ++i) { int pos = postings.nextPosition(); List<String> textList = docTextMap.get(pos); if (textList == null) { textList = new ArrayList<String>(); docTextMap.put(pos, textList); } textList.add(text.utf8ToString()); } } } StringBuilder buf = new StringBuilder(); for (Entry<Integer, List<String>> entry : docTextMap.entrySet()) { Integer pos = entry.getKey(); List<String> terms = entry.getValue(); for (String term : terms) { buf.append(term+"("+pos+") "); } } return buf.toString(); } public String reconstructNoPositions(TermsEnum te, int docid, Bits liveDocs) throws IOException{ List<String> textList = new ArrayList<String>(); BytesRef text; PostingsEnum postings = null; while ((text = te.next()) != null) { postings = te.postings(postings, PostingsEnum.FREQS); int iterDoc = postings.advance(docid); if (iterDoc == docid) { textList.add(text.utf8ToString()); } } StringBuilder buf = new StringBuilder(); for (String s : textList) { buf.append(s+" "); } return buf.toString(); } @Override public void execute(String[] args, PrintStream out) throws Exception { if (args.length != 2) { out.println("usage: field doc"); return; } String field = args[0]; int doc = Integer.parseInt(args[1]); IndexReader reader = ctx.getIndexReader(); List<LeafReaderContext> leaves = reader.leaves(); boolean found = false; for (LeafReaderContext ctx : leaves) { LeafReader atomicReader = ctx.reader(); FieldInfo finfo = atomicReader.getFieldInfos().fieldInfo(field); if (finfo == null) continue; // if (!finfo.isIndexed()) { // out.println(field+" is not an indexed field"); // return; // } int docID = doc - ctx.docBase; if (docID >= 0) { Terms terms = atomicReader.terms(field); boolean hasPositions = terms.hasPositions(); TermsEnum te = terms.iterator(); if (hasPositions) { out.println(reconstructWithPositions(te, docID, atomicReader.getLiveDocs())); } else { out.println(reconstructNoPositions(te, docID, atomicReader.getLiveDocs())); } found = true; break; } } if (!found) { out.println(doc + " not found"); return; } } }