package org.apache.lucene.index.codecs.simpletext;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.fst.Builder;
import org.apache.lucene.util.automaton.fst.BytesRefFSTEnum;
import org.apache.lucene.util.automaton.fst.FST;
import org.apache.lucene.util.automaton.fst.PositiveIntOutputs;
import org.apache.lucene.util.automaton.fst.PairOutputs;
import java.io.IOException;
import java.util.Comparator;
import java.util.Map;
import java.util.HashMap;
class SimpleTextFieldsReader extends FieldsProducer {
// Postings file of the segment. The enums and term loaders in this class
// read through their own clones of this input rather than through it directly.
private final IndexInput in;
// Per-field metadata; consulted for omitTermFreqAndPositions when a field's terms are loaded.
private final FieldInfos fieldInfos;
// Line-format markers, taken from SimpleTextFieldsWriter so that the reader
// and the writer agree on them. NEWLINE terminates a line and ESCAPE marks
// the following byte as literal (see readLine); the BytesRef constants are
// the prefixes (or, for END, the whole line) that identify each line kind.
final static byte NEWLINE = SimpleTextFieldsWriter.NEWLINE;
final static byte ESCAPE = SimpleTextFieldsWriter.ESCAPE;
final static BytesRef END = SimpleTextFieldsWriter.END;
final static BytesRef FIELD = SimpleTextFieldsWriter.FIELD;
final static BytesRef TERM = SimpleTextFieldsWriter.TERM;
final static BytesRef DOC = SimpleTextFieldsWriter.DOC;
final static BytesRef POS = SimpleTextFieldsWriter.POS;
final static BytesRef PAYLOAD = SimpleTextFieldsWriter.PAYLOAD;
public SimpleTextFieldsReader(SegmentReadState state) throws IOException {
in = state.dir.openInput(SimpleTextCodec.getPostingsFileName(state.segmentInfo.name, ""+state.codecId));
fieldInfos = state.fieldInfos;
}
/**
 * Reads one line from {@code in} into {@code scratch}, starting at index 0
 * of {@code scratch.bytes}.
 *
 * <p>Reading stops at the first unescaped {@code NEWLINE}, which is consumed
 * but not stored. A byte that follows {@code ESCAPE} is stored literally and
 * the {@code ESCAPE} byte itself is dropped. On return {@code scratch.offset}
 * is 0 and {@code scratch.length} is the number of bytes stored.
 *
 * @param in      input positioned at the start of a line
 * @param scratch buffer that receives the unescaped line; grown as needed
 * @throws IOException if reading from {@code in} fails
 */
static void readLine(IndexInput in, BytesRef scratch) throws IOException {
  int count = 0;
  boolean endOfLine = false;
  while (!endOfLine) {
    final byte current = in.readByte();
    // Make room before deciding what to do with the byte, so that both
    // storing branches below can write without a further check.
    if (count == scratch.bytes.length) {
      scratch.grow(count + 1);
    }
    if (current == ESCAPE) {
      scratch.bytes[count++] = in.readByte();
    } else if (current == NEWLINE) {
      endOfLine = true;
    } else {
      scratch.bytes[count++] = current;
    }
  }
  scratch.offset = 0;
  scratch.length = count;
}
/**
 * Enumerates field names by scanning a private clone of the postings input
 * for lines that start with {@code FIELD}.
 */
private class SimpleTextFieldsEnum extends FieldsEnum {
  private final IndexInput in;
  private final BytesRef scratch = new BytesRef(10);
  // Name returned by the most recent next(); null once END has been reached.
  private String current;

  public SimpleTextFieldsEnum() {
    this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
  }

  /**
   * Advances to the next {@code FIELD} line, skipping every other line.
   *
   * @return the interned field name, or {@code null} when the {@code END}
   *         line is reached
   */
  @Override
  public String next() throws IOException {
    for (readLine(in, scratch); !scratch.equals(END); readLine(in, scratch)) {
      if (scratch.startsWith(FIELD)) {
        final int nameStart = scratch.offset + FIELD.length;
        final int nameLength = scratch.length - FIELD.length;
        final String name =
            StringHelper.intern(new String(scratch.bytes, nameStart, nameLength, "UTF-8"));
        current = name;
        return name;
      }
    }
    current = null;
    return null;
  }

  /** Returns a terms enumerator for the field most recently returned by {@link #next()}. */
  @Override
  public TermsEnum terms() throws IOException {
    final Terms fieldTerms = SimpleTextFieldsReader.this.terms(current);
    return fieldTerms.iterator();
  }
}
/**
 * Enumerates the terms of one field by walking the field's FST. Each FST
 * output carries the file pointer of the term's postings together with its
 * docFreq and totalTermFreq.
 */
private class SimpleTextTermsEnum extends TermsEnum {
  private final IndexInput in;
  private final boolean omitTF;
  // Statistics of the term the enum is currently positioned on.
  private int docFreq;
  private long totalTermFreq;
  private long docsStart;
  private final BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstEnum;

  public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst, boolean omitTF) throws IOException {
    this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
    this.omitTF = omitTF;
    fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(fst);
  }

  /** Copies the postings pointer and the statistics out of an FST output. */
  private void loadTermStats(PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> output) {
    final PairOutputs.Pair<Long,Long> stats = output.output2;
    docsStart = output.output1;
    docFreq = stats.output1.intValue();
    totalTermFreq = stats.output2;
  }

  /**
   * Positions the enum on the smallest term that is greater than or equal to
   * {@code text}.
   *
   * @param text     term to look for
   * @param useCache ignored by this implementation
   * @return {@code FOUND} on an exact match, {@code NOT_FOUND} when
   *         positioned on a later term, {@code END} when no such term exists
   */
  @Override
  public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
    final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.seekCeil(text);
    if (result == null) {
      return SeekStatus.END;
    }
    loadTermStats(result.output);
    return result.input.equals(text) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
  }

  /** Moves to the next term; returns {@code null} when the FST is exhausted. */
  @Override
  public BytesRef next() throws IOException {
    final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.next();
    if (result == null) {
      return null;
    }
    loadTermStats(result.output);
    return result.input;
  }

  @Override
  public BytesRef term() {
    return fstEnum.current().input;
  }

  /** Term ordinals are not supported by this enum. */
  @Override
  public long ord() throws IOException {
    throw new UnsupportedOperationException();
  }

  /** Seeking by ordinal is not supported by this enum. */
  @Override
  public SeekStatus seek(long ord) {
    throw new UnsupportedOperationException();
  }

  @Override
  public int docFreq() {
    return docFreq;
  }

  @Override
  public long totalTermFreq() {
    return totalTermFreq;
  }

  /**
   * Returns a docs enum positioned at the start of the current term's
   * postings.
   *
   * <p>NOTE(review): {@code in} is this enum's private clone, whereas the
   * docs enums remember the reader's shared input and {@code canReuse}
   * compares by identity, so the check below does not appear to ever
   * succeed and a new enum is allocated on every call. Passing the shared
   * input instead would turn reuse on; only do that after verifying that
   * {@code reset()} fully reinitialises a recycled enum.
   */
  @Override
  public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
    final SimpleTextDocsEnum docsEnum;
    if (reuse instanceof SimpleTextDocsEnum && ((SimpleTextDocsEnum) reuse).canReuse(in)) {
      docsEnum = (SimpleTextDocsEnum) reuse;
    } else {
      docsEnum = new SimpleTextDocsEnum();
    }
    return docsEnum.reset(docsStart, skipDocs, omitTF);
  }

  /**
   * Returns a docs-and-positions enum for the current term, or {@code null}
   * when the field omits term frequencies and positions. The review note on
   * {@link #docs} about the reuse check applies here as well.
   */
  @Override
  public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
    if (omitTF) {
      return null;
    }
    final SimpleTextDocsAndPositionsEnum docsAndPositionsEnum;
    if (reuse instanceof SimpleTextDocsAndPositionsEnum && ((SimpleTextDocsAndPositionsEnum) reuse).canReuse(in)) {
      docsAndPositionsEnum = (SimpleTextDocsAndPositionsEnum) reuse;
    } else {
      docsAndPositionsEnum = new SimpleTextDocsAndPositionsEnum();
    }
    return docsAndPositionsEnum.reset(docsStart, skipDocs);
  }

  @Override
  public Comparator<BytesRef> getComparator() {
    return BytesRef.getUTF8SortedAsUnicodeComparator();
  }
}
/**
 * Iterates the documents of a single term by scanning the text postings
 * line by line. The within-document frequency is obtained by counting the
 * {@code POS} lines that follow each {@code DOC} line.
 */
private class SimpleTextDocsEnum extends DocsEnum {
  // Shared input this enum was cloned from; only used for the identity check in canReuse.
  private final IndexInput inStart;
  private final IndexInput in;
  private boolean omitTF;
  private int docID;
  private int tf;
  private Bits skipDocs;
  private final BytesRef scratch = new BytesRef(10);
  private final UnicodeUtil.UTF16Result scratchUTF16 = new UnicodeUtil.UTF16Result();

  public SimpleTextDocsEnum() {
    this.inStart = SimpleTextFieldsReader.this.in;
    this.in = (IndexInput) this.inStart.clone();
  }

  /** Returns true if this enum was created over the very same input instance as {@code in}. */
  public boolean canReuse(IndexInput in) {
    return in == inStart;
  }

  /**
   * Positions this enum at the start of a term's postings.
   *
   * @param fp       file pointer of the first line after the term line
   * @param skipDocs documents to skip, or {@code null} to return all
   * @param omitTF   when true, {@link #freq()} always reports 1
   * @return this enum
   */
  public SimpleTextDocsEnum reset(long fp, Bits skipDocs, boolean omitTF) throws IOException {
    this.skipDocs = skipDocs;
    in.seek(fp);
    this.omitTF = omitTF;
    // Clear the iteration state: without this, an instance that previously
    // ran to NO_MORE_DOCS would short-circuit in nextDoc() and never
    // return any document of the new term.
    docID = -1;
    if (omitTF) {
      tf = 1;
    }
    return this;
  }

  @Override
  public int docID() {
    return docID;
  }

  @Override
  public int freq() {
    return tf;
  }

  /**
   * Advances to the next document that is not marked in {@code skipDocs}.
   *
   * @return the document id, or {@code NO_MORE_DOCS} when the term's
   *         postings are exhausted
   */
  @Override
  public int nextDoc() throws IOException {
    if (docID == NO_MORE_DOCS) {
      return docID;
    }
    // 'first' stays true until a DOC line has been parsed in this call.
    boolean first = true;
    int termFreq = 0;
    while (true) {
      final long lineStart = in.getFilePointer();
      readLine(in, scratch);
      final boolean isDocLine = scratch.startsWith(DOC);
      if (!isDocLine) {
        if (scratch.startsWith(POS)) {
          termFreq++;
          continue;
        }
        if (scratch.startsWith(PAYLOAD)) {
          // Payloads are not exposed by this enum.
          continue;
        }
        assert scratch.startsWith(TERM) || scratch.startsWith(FIELD) || scratch.startsWith(END): "scratch=" + scratch.utf8ToString();
      }
      // Reaching here means the line is either the next DOC line or the
      // first line past this term's postings. Both end the document that
      // was being collected, if there was one.
      if (!first && (skipDocs == null || !skipDocs.get(docID))) {
        // Rewind so that the next call sees this line again.
        in.seek(lineStart);
        if (!omitTF) {
          tf = termFreq;
        }
        return docID;
      }
      if (!isDocLine) {
        return docID = NO_MORE_DOCS;
      }
      UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+DOC.length, scratch.length-DOC.length, scratchUTF16);
      docID = ArrayUtil.parseInt(scratchUTF16.result, 0, scratchUTF16.length);
      termFreq = 0;
      first = false;
    }
  }

  /** Linear scan via {@link #nextDoc()}; there is no skip data to jump with. */
  @Override
  public int advance(int target) throws IOException {
    // Naive -- better to index skip data
    while (nextDoc() < target) {
      // keep scanning
    }
    return docID;
  }
}
/**
 * Iterates the documents, positions and payloads of a single term by
 * scanning the text postings line by line.
 */
private class SimpleTextDocsAndPositionsEnum extends DocsAndPositionsEnum {
  // Shared input this enum was cloned from; only used for the identity check in canReuse.
  private final IndexInput inStart;
  private final IndexInput in;
  private int docID;
  private int tf;
  private Bits skipDocs;
  private final BytesRef scratch = new BytesRef(10);
  // Holds the bytes of the current payload; 'payload' points here when one is available.
  private final BytesRef scratch2 = new BytesRef(10);
  private final UnicodeUtil.UTF16Result scratchUTF16 = new UnicodeUtil.UTF16Result();
  private final UnicodeUtil.UTF16Result scratchUTF16_2 = new UnicodeUtil.UTF16Result();
  private BytesRef payload;
  // File pointer at which the next nextDoc() call resumes scanning.
  private long nextDocStart;

  public SimpleTextDocsAndPositionsEnum() {
    this.inStart = SimpleTextFieldsReader.this.in;
    this.in = (IndexInput) inStart.clone();
  }

  /** Returns true if this enum was created over the very same input instance as {@code in}. */
  public boolean canReuse(IndexInput in) {
    return in == inStart;
  }

  /**
   * Positions this enum at the start of a term's postings.
   *
   * @param fp       file pointer of the first line after the term line
   * @param skipDocs documents to skip, or {@code null} to return all
   * @return this enum
   */
  public SimpleTextDocsAndPositionsEnum reset(long fp, Bits skipDocs) {
    this.skipDocs = skipDocs;
    nextDocStart = fp;
    // Drop state left over from a previous term so that docID() and
    // hasPayload() do not report stale values before the first nextDoc().
    docID = -1;
    payload = null;
    return this;
  }

  @Override
  public int docID() {
    return docID;
  }

  @Override
  public int freq() {
    return tf;
  }

  /**
   * Advances to the next document that is not marked in {@code skipDocs}
   * and leaves the input positioned on that document's first position line.
   *
   * @return the document id, or {@code NO_MORE_DOCS} when the term's
   *         postings are exhausted
   */
  @Override
  public int nextDoc() throws IOException {
    // 'first' stays true until a DOC line has been parsed in this call.
    boolean first = true;
    in.seek(nextDocStart);
    long posStart = 0;
    while (true) {
      final long lineStart = in.getFilePointer();
      readLine(in, scratch);
      final boolean isDocLine = scratch.startsWith(DOC);
      if (!isDocLine) {
        if (scratch.startsWith(POS)) {
          tf++;
          continue;
        }
        if (scratch.startsWith(PAYLOAD)) {
          // Payload lines are read later, by nextPosition().
          continue;
        }
        assert scratch.startsWith(TERM) || scratch.startsWith(FIELD) || scratch.startsWith(END);
      }
      // Reaching here means the line is either the next DOC line or the
      // first line past this term's postings. Both end the document that
      // was being collected, if there was one.
      if (!first && (skipDocs == null || !skipDocs.get(docID))) {
        nextDocStart = lineStart;
        // Rewind to just after the DOC line so nextPosition() can read the positions.
        in.seek(posStart);
        return docID;
      }
      if (!isDocLine) {
        return docID = NO_MORE_DOCS;
      }
      UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+DOC.length, scratch.length-DOC.length, scratchUTF16);
      docID = ArrayUtil.parseInt(scratchUTF16.result, 0, scratchUTF16.length);
      tf = 0;
      posStart = in.getFilePointer();
      first = false;
    }
  }

  /** Linear scan via {@link #nextDoc()}; there is no skip data to jump with. */
  @Override
  public int advance(int target) throws IOException {
    // Naive -- better to index skip data
    while (nextDoc() < target) {
      // keep scanning
    }
    return docID;
  }

  /**
   * Reads the next {@code POS} line of the current document and, if a
   * {@code PAYLOAD} line follows it, captures that payload.
   *
   * @return the position parsed from the {@code POS} line
   */
  @Override
  public int nextPosition() throws IOException {
    readLine(in, scratch);
    assert scratch.startsWith(POS): "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+POS.length, scratch.length-POS.length, scratchUTF16_2);
    final int pos = ArrayUtil.parseInt(scratchUTF16_2.result, 0, scratchUTF16_2.length);
    // Peek at the following line; rewind if it is not a payload.
    final long fp = in.getFilePointer();
    readLine(in, scratch);
    if (scratch.startsWith(PAYLOAD)) {
      final int len = scratch.length - PAYLOAD.length;
      if (scratch2.bytes.length < len) {
        scratch2.grow(len);
      }
      System.arraycopy(scratch.bytes, PAYLOAD.length, scratch2.bytes, 0, len);
      scratch2.length = len;
      payload = scratch2;
    } else {
      payload = null;
      in.seek(fp);
    }
    return pos;
  }

  /**
   * Returns the payload captured by the last {@link #nextPosition()} and
   * clears it, so a second call returns {@code null}.
   */
  @Override
  public BytesRef getPayload() {
    // Some tests rely on only being able to retrieve the
    // payload once
    final BytesRef result = payload;
    payload = null;
    return result;
  }

  @Override
  public boolean hasPayload() {
    return payload != null;
  }
}
/**
 * Plain mutable holder for two per-term values: the file pointer at which a
 * term's postings begin and the number of documents containing the term.
 */
static class TermData {
  public long docsStart;
  public int docFreq;

  /**
   * @param docsStart value stored in {@link #docsStart}
   * @param docFreq   value stored in {@link #docFreq}
   */
  public TermData(long docsStart, int docFreq) {
    this.docFreq = docFreq;
    this.docsStart = docsStart;
  }
}
/**
 * Terms of a single field. On construction the field's section of the
 * postings file is scanned once and an FST is built that maps every term to
 * the file pointer of its postings, its docFreq and its totalTermFreq.
 */
private class SimpleTextTerms extends Terms {
  private final long termsStart;
  private final boolean omitTF;
  private long sumTotalTermFreq;
  private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst;
  private int termCount;
  private final BytesRef scratch = new BytesRef(10);

  /**
   * @param field      name of the field; used to look up omitTermFreqAndPositions
   * @param termsStart file pointer from which the field's term lines are read
   * @throws IOException if scanning the postings file fails
   */
  public SimpleTextTerms(String field, long termsStart) throws IOException {
    this.termsStart = termsStart;
    omitTF = fieldInfos.fieldInfo(field).omitTermFreqAndPositions;
    loadTerms();
  }

  /** Adds one finished term, with its postings pointer and statistics, to the FST builder. */
  private void addTerm(Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> builder,
                       PositiveIntOutputs posIntOutputs,
                       BytesRef term,
                       long docsStart,
                       int docFreq,
                       long totalTermFreq) throws IOException {
    final PairOutputs.Pair<Long,Long> stats =
        new PairOutputs.Pair<Long,Long>((long) docFreq, posIntOutputs.get(totalTermFreq));
    builder.add(term, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(docsStart, stats));
  }

  /**
   * Scans from {@code termsStart} up to the next {@code FIELD} line or the
   * {@code END} line, counting {@code DOC} and {@code POS} lines per term,
   * and builds the FST from the result.
   */
  private void loadTerms() throws IOException {
    final PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
    final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
    b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
                                                                        0,
                                                                        0,
                                                                        true,
                                                                        new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
                                                                                                                          new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
    final IndexInput input = (IndexInput) SimpleTextFieldsReader.this.in.clone();
    input.seek(termsStart);
    final BytesRef lastTerm = new BytesRef(10);
    // -1 means no TERM line has been seen yet, so there is nothing to flush.
    long lastDocsStart = -1;
    int docFreq = 0;
    long totalTermFreq = 0;
    while (true) {
      readLine(input, scratch);
      if (scratch.equals(END) || scratch.startsWith(FIELD)) {
        // End of this field: flush the term that was still being counted.
        if (lastDocsStart != -1) {
          addTerm(b, posIntOutputs, lastTerm, lastDocsStart, docFreq, totalTermFreq);
          sumTotalTermFreq += totalTermFreq;
        }
        break;
      } else if (scratch.startsWith(DOC)) {
        docFreq++;
      } else if (scratch.startsWith(POS)) {
        totalTermFreq++;
      } else if (scratch.startsWith(TERM)) {
        // A new term begins: flush the previous one, then start counting afresh.
        if (lastDocsStart != -1) {
          addTerm(b, posIntOutputs, lastTerm, lastDocsStart, docFreq, totalTermFreq);
        }
        lastDocsStart = input.getFilePointer();
        final int len = scratch.length - TERM.length;
        // Grow on capacity rather than on the previous term's length.
        if (lastTerm.bytes.length < len) {
          lastTerm.grow(len);
        }
        System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
        lastTerm.length = len;
        docFreq = 0;
        sumTotalTermFreq += totalTermFreq;
        totalTermFreq = 0;
        termCount++;
      }
    }
    fst = b.finish();
  }

  /** Returns a new enumerator over this field's terms, or an empty one if no FST was built. */
  @Override
  public TermsEnum iterator() throws IOException {
    if (fst == null) {
      return TermsEnum.EMPTY;
    }
    return new SimpleTextTermsEnum(fst, omitTF);
  }

  @Override
  public Comparator<BytesRef> getComparator() {
    return BytesRef.getUTF8SortedAsUnicodeComparator();
  }

  /** Returns the number of {@code TERM} lines seen while loading this field. */
  @Override
  public long getUniqueTermCount() {
    return (long) termCount;
  }

  /** Returns the number of {@code POS} lines counted across all terms of this field. */
  @Override
  public long getSumTotalTermFreq() {
    return sumTotalTermFreq;
  }
}
/**
 * Returns a new enumerator over the field names in the postings file. Each
 * enumerator reads through its own clone of the postings input.
 */
@Override
public FieldsEnum iterator() throws IOException {
return new SimpleTextFieldsEnum();
}
// Terms per field name. A null value records that the field was looked up
// and is not present in the postings file.
private final Map<String,Terms> termsCache = new HashMap<String,Terms>();

/**
 * Returns the terms of {@code field}, loading them on first request by
 * scanning the postings file for the matching field line.
 *
 * <p>Both hits and misses are cached, so the file is scanned at most once
 * per distinct field name.
 *
 * @param field name of the field to look up
 * @return the field's terms, or {@code null} if the postings file has no such field
 * @throws IOException if reading the postings file fails
 */
@Override
synchronized public Terms terms(String field) throws IOException {
  // containsKey, not get() != null: a cached miss is stored as a null
  // value and must not trigger another full scan of the file.
  if (termsCache.containsKey(field)) {
    return termsCache.get(field);
  }
  Terms terms = null;
  final SimpleTextFieldsEnum fe = (SimpleTextFieldsEnum) iterator();
  String fieldUpto;
  while ((fieldUpto = fe.next()) != null) {
    if (fieldUpto.equals(field)) {
      // The enum's input now sits just past the field line, which is where
      // the field's term lines begin.
      terms = new SimpleTextTerms(field, fe.in.getFilePointer());
      break;
    }
  }
  termsCache.put(field, terms);
  return terms;
}
/**
 * Does nothing in this reader; {@code indexDivisor} is ignored. Term data
 * is loaded when a field's SimpleTextTerms is constructed instead.
 */
@Override
public void loadTermsIndex(int indexDivisor) {
}
/**
 * Closes the postings input opened by the constructor.
 *
 * @throws IOException if closing the input fails
 */
@Override
public void close() throws IOException {
in.close();
}
}