package folioxml.lucene;
import folioxml.core.InvalidMarkupException;
import folioxml.core.TokenUtils;
import folioxml.slx.SlxContextStack;
import folioxml.slx.SlxRecord;
import folioxml.slx.SlxToken;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import java.util.List;
import java.util.TreeMap;
/**
 * Gathers the text that falls inside {@code span} tokens carrying a {@code type}
 * attribute and adds it to a Lucene {@link Document} as stored {@link TextField}s.
 *
 * <p>Text is accumulated per field name (compared case-insensitively) in
 * {@link #fields}. A buffer is written to the document either when its span
 * closes, or, for fields whose options set {@code mergeTouchingApplications},
 * once visible text arrives while that field is no longer open. Call
 * {@link #flush()} after the last token to write out whatever is still buffered.
 */
public class FieldCollector {

    /** Document that receives the flushed field values. */
    public Document d;

    /** Supplies the per-field indexing options consulted by {@link #collect}. */
    public IndexFieldOptsProvider c;

    /** Pending text per field name; keys are compared case-insensitively. */
    public TreeMap<String, StringBuilder> fields = new TreeMap<String, StringBuilder>(String.CASE_INSENSITIVE_ORDER);

    /**
     * Names of merge-enabled fields whose closing span has been seen but whose
     * buffered text has not yet been written to the document. Only the keys matter;
     * the value stored is always {@code true}.
     */
    public TreeMap<String, Boolean> unflushed = new TreeMap<String, Boolean>(String.CASE_INSENSITIVE_ORDER);

    /**
     * @param d document to add field values to
     * @param c provider of the indexing options for each field name
     */
    public FieldCollector(Document d, IndexFieldOptsProvider c) {
        this.d = d;
        this.c = c;
    }

    /**
     * Appends {@code text} to the buffer of {@code field}, creating the buffer on first use.
     *
     * @param field field name the text belongs to
     * @param text  text to append
     */
    protected void send(String field, String text) {
        StringBuilder buffer;
        if (fields.containsKey(field)) {
            buffer = fields.get(field);
        } else {
            buffer = new StringBuilder();
            fields.put(field, buffer);
        }
        buffer.append(text);
    }

    /**
     * Writes the buffered text of {@code field} to the document as a stored
     * {@link TextField} and empties the buffer. Nothing is added when the field has
     * no buffer or the buffer is empty.
     *
     * @param field               field name to flush
     * @param removeFromUnflushed when {@code true}, the name is also dropped from {@link #unflushed}
     */
    protected void flush(String field, boolean removeFromUnflushed) {
        if (removeFromUnflushed) {
            // Removing an absent key is a no-op, so no membership test is needed first.
            unflushed.remove(field);
        }
        if (!fields.containsKey(field)) {
            return;
        }
        StringBuilder buffer = fields.get(field);
        if (buffer == null || buffer.length() == 0) {
            return;
        }
        d.add(new TextField(field, buffer.toString(), Field.Store.YES));
        buffer.setLength(0);
    }

    /**
     * Flushes every field that has a buffer, clearing each from {@link #unflushed}.
     * Call this after all the tokens have been processed.
     */
    public void flush() {
        for (String name : fields.keySet()) {
            flush(name, true);
        }
    }

    /**
     * Routes one token to the field buffers.
     *
     * <p>Span tokens with a {@code type} attribute update the open/closed bookkeeping;
     * text and entity tokens are appended to the buffers of the fields returned by
     * {@code stack.getOpenTags("span", false, false)}. Call this method after
     * {@code stack.process()}.
     *
     * @param t     token being processed
     * @param stack context stack queried for the currently open span tags
     * @param r     record the token belongs to (not read by this method)
     * @return {@code true} if the token's text was 'eaten' by a field that does not
     *         allow others to index it and should not be added to the main text;
     *         {@code false} otherwise, including for every non-text token
     * @throws InvalidMarkupException declared for the token and stack calls made here
     */
    public boolean collect(SlxToken t, SlxContextStack stack, SlxRecord r) throws InvalidMarkupException {
        trackSpanBoundary(t);

        // Certain types of tokens break all fields, and should cause all fields to flush.
        // A closing paragraph token breaks all fields; a br token indexes as whitespace.
        // TODO: Test what td, th, and note do to phrase field terms.

        // Everything below applies to text and entity tokens only.
        if (!t.isTextOrEntity()) {
            return false;
        }

        // NOTE(review): the original author suspected this query may not return all the
        // open span tags and that a different query might be needed - confirm.
        List<SlxToken> openSpans = stack.getOpenTags("span", false, false);
        String hidingField = findHidingField(openSpans);

        // Entities are decoded before being stored; plain text is used as-is.
        String content = t.markup;
        if (t.isEntity()) {
            content = TokenUtils.entityDecodeString(content);
        }

        if (hidingField != null) {
            // A hiding field takes the text exclusively and keeps it out of the main text.
            send(hidingField, content);
            return true;
        }

        // Visible text: every open typed span receives a copy.
        for (SlxToken span : openSpans) {
            String fieldName = span.get("type");
            if (fieldName == null) {
                continue;
            }
            send(fieldName, content);
            // A field that is open again must not be flushed below.
            unflushed.remove(fieldName);
        }

        // Whatever is still listed in 'unflushed' is not among the open spans, so its
        // merged value is complete and can be written out now.
        for (String pending : unflushed.keySet()) {
            flush(pending, false);
        }
        unflushed.clear();
        return false;
    }

    /**
     * Updates {@link #unflushed} for an opening or closing typed span. Opening a
     * merge-enabled field cancels its pending flush. Closing a merge-enabled field
     * defers its flush, and closing any other field flushes it immediately.
     */
    private void trackSpanBoundary(SlxToken t) throws InvalidMarkupException {
        if (!t.matches("span")) {
            return;
        }
        String fieldName = t.get("type");
        if (fieldName == null) {
            return;
        }
        IndexFieldOpts options = c.getFieldOptions(fieldName);
        if (t.isOpening()) {
            if (options.mergeTouchingApplications) {
                unflushed.remove(fieldName);
            }
            return;
        }
        if (!t.isClosing()) {
            return;
        }
        if (options.mergeTouchingApplications) {
            unflushed.put(fieldName, true);
        } else {
            flush(fieldName, true);
        }
    }

    /**
     * Returns the field name of the first span in {@code openSpans} whose options
     * have {@code allowOthersToIndex} unset, or {@code null} when there is none.
     * Spans without a {@code type} attribute are skipped.
     */
    private String findHidingField(List<SlxToken> openSpans) throws InvalidMarkupException {
        for (SlxToken span : openSpans) {
            String fieldName = span.get("type");
            if (fieldName == null) {
                continue;
            }
            if (!c.getFieldOptions(fieldName).allowOthersToIndex) {
                return fieldName;
            }
        }
        return null;
    }
}