package org.xbib.elasticsearch.skywalker.reconstruct;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.CompositeReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.xbib.elasticsearch.action.skywalker.support.IndexableFieldToXContent;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

/**
 * This class attempts to reconstruct all fields from a document existing in a
 * Lucene index. This operation may be (and usually is) lossy - e.g. unstored
 * fields are rebuilt from terms present in the index, and these terms may have
 * been changed (e.g. lowercased, stemmed), and many other input tokens may have
 * been skipped altogether by the Analyzer, when fields were originally added to
 * the index.
 */
public class DocumentReconstructor {

    private final AtomicReader reader;

    /**
     * Prepare a document reconstructor.
     *
     * @param indexReader IndexReader to read from; a {@link CompositeReader} is
     *                    wrapped into a single atomic view, an {@link AtomicReader}
     *                    is used directly
     * @throws ElasticsearchIllegalArgumentException if the reader is null, is of an
     *                    unsupported class, or cannot be wrapped
     */
    public DocumentReconstructor(IndexReader indexReader) {
        if (indexReader == null) {
            throw new ElasticsearchIllegalArgumentException("reader cannot be null");
        }
        AtomicReader atomicReader;
        try {
            if (indexReader instanceof CompositeReader) {
                atomicReader = SlowCompositeReaderWrapper.wrap(indexReader);
            } else if (indexReader instanceof AtomicReader) {
                atomicReader = (AtomicReader) indexReader;
            } else {
                throw new ElasticsearchIllegalArgumentException("unsupported IndexReader class "
                        + indexReader.getClass().getName());
            }
        } catch (IOException e) {
            // keep the original I/O failure as the cause so the stack trace is not lost
            throw new ElasticsearchIllegalArgumentException(e.getMessage(), e);
        }
        this.reader = atomicReader;
    }

    /**
     * Reconstruct an index shard. For every live (not deleted) document the
     * output contains the stored fields and the indexed terms, with positions
     * and offsets, that occur in that document.
     *
     * @param shardId the shard identifier, written verbatim into the output
     * @return a JSON builder holding the reconstructed documents of the shard
     * @throws IOException if reading the index or writing the output fails
     */
    public XContentBuilder reconstruct(int shardId) throws IOException {
        XContentBuilder builder = jsonBuilder();
        builder.startObject()
                .field("shardId", shardId)
                .field("numDeletions", reader.numDeletedDocs());
        builder.startArray("docs");
        FieldInfos fieldInfos = reader.getFieldInfos();
        // a null Bits means the reader has no deletions; a set bit marks a live document
        Bits live = MultiFields.getLiveDocs(reader);
        int maxDoc = reader.maxDoc();
        for (int docNum = 0; docNum < maxDoc; docNum++) {
            if (live != null && !live.get(docNum)) {
                continue; // deleted document, nothing to reconstruct
            }
            // load stored fields only after the deletion check
            Document doc = reader.document(docNum);
            builder.startObject();
            builder.startArray("fields");
            if (fieldInfos != null) {
                writeStoredFields(builder, fieldInfos, doc);
            }
            builder.endArray();
            builder.startArray("terms");
            if (fieldInfos != null) {
                writeTerms(builder, fieldInfos, live, docNum);
            }
            builder.endArray();
            builder.endObject();
        }
        builder.endArray();
        builder.endObject();
        return builder;
    }

    /**
     * Write every stored field instance of the document, in field info order.
     */
    private void writeStoredFields(XContentBuilder builder, FieldInfos fieldInfos, Document doc)
            throws IOException {
        for (FieldInfo fi : fieldInfos) {
            IndexableField[] storedFields = doc.getFields(fi.name);
            if (storedFields == null) {
                continue;
            }
            for (IndexableField f : storedFields) {
                new IndexableFieldToXContent().field(f).toXContent(builder, ToXContent.EMPTY_PARAMS);
            }
        }
    }

    /**
     * Write every indexed term that occurs in the given document, together
     * with its positions, offsets and frequency.
     */
    private void writeTerms(XContentBuilder builder, FieldInfos fieldInfos, Bits live, int docNum)
            throws IOException {
        TermsEnum te = null;
        DocsAndPositionsEnum dpe = null;
        for (FieldInfo fi : fieldInfos) {
            Terms terms = MultiFields.getTerms(reader, fi.name);
            if (terms == null) {
                continue; // no terms in this field
            }
            te = terms.iterator(te);
            while (te.next() != null) {
                // offsets must be requested explicitly, otherwise startOffset()/endOffset()
                // may yield -1 even when the field was indexed with offsets
                DocsAndPositionsEnum newDpe =
                        te.docsAndPositions(live, dpe, DocsAndPositionsEnum.FLAG_OFFSETS);
                if (newDpe == null) {
                    break; // no position info for this field
                }
                dpe = newDpe;
                if (dpe.advance(docNum) != docNum) {
                    // either a greater doc id or NO_MORE_DOCS: term is absent from this doc
                    continue;
                }
                String text = te.term().utf8ToString();
                int freq = dpe.freq();
                List<Integer> positions = new ArrayList<Integer>(freq);
                List<Integer> starts = new ArrayList<Integer>(freq);
                List<Integer> ends = new ArrayList<Integer>(freq);
                for (int k = 0; k < freq; k++) {
                    // nextPosition() must be called before the offsets are read
                    positions.add(dpe.nextPosition());
                    starts.add(dpe.startOffset());
                    ends.add(dpe.endOffset());
                }
                builder.startObject()
                        .field("text", text)
                        .field("positions", positions)
                        .field("starts", starts)
                        .field("ends", ends)
                        .field("count", freq)
                        .endObject();
            }
        }
    }
}