package folioxml.lucene;

import folioxml.config.*;
import folioxml.core.InvalidMarkupException;
import folioxml.core.TokenUtils;
import folioxml.export.FileNode;
import folioxml.export.InfobaseSetPlugin;
import folioxml.export.LogStreamProvider;
import folioxml.lucene.analysis.AnalyzerPicker;
import folioxml.lucene.analysis.DynamicAnalyzer;
import folioxml.slx.ISlxTokenReader;
import folioxml.slx.SlxContextStack;
import folioxml.slx.SlxRecord;
import folioxml.slx.SlxToken;
import folioxml.xml.XmlRecord;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
 * Indexes each record of an infobase set into a Lucene index as the export pipeline runs.
 */
public class InfobaseSetIndexer implements InfobaseSetPlugin, AnalyzerPicker {

    public InfobaseSetIndexer() {
    }

    InfobaseFieldOptsSet conf;
    IndexWriter w;
    InfobaseConfig currentInfobase;
    Document doc = null;

    @Override
    public void beginInfobaseSet(InfobaseSet set, ExportLocations export, LogStreamProvider logs) throws IOException {
        Path folder = export.getLocalPath("lucene_index", AssetType.LuceneIndex, FolderCreation.None);
        w = new IndexWriter(FSDirectory.open(folder),
                new IndexWriterConfig(new DynamicAnalyzer(this)).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    }

    @Override
    public void beginInfobase(InfobaseConfig infobase) {
        conf = null;
        currentInfobase = infobase;
    }

    @Override
    public ISlxTokenReader wrapSlxReader(ISlxTokenReader reader) {
        return reader;
    }

    @Override
    public void onSlxRecordParsed(SlxRecord r) throws InvalidMarkupException {
        boolean isRoot = r.isRootRecord();

        //Create the Lucene document and add some fields.
        doc = new Document();

        //Add the level, groups, and infobase fields.
        if (r.getLevelType() == null) {
            doc.add(addNonTokenizedField("level", "Normal"));
        } else {
            doc.add(addNonTokenizedField("level", r.getLevelType()));
        }
        doc.add(addAnalyzedField("groups", r.get("groups")));
        doc.add(addNonTokenizedField("infobase", currentInfobase.getId()));

        if (!isRoot) {
            //Iterate all tokens and stream them to the applicable fields so a query can be evaluated later.
            FieldCollector coll = new FieldCollector(doc, conf);
            StringBuilder contentSb = new StringBuilder();
            SlxContextStack stack = new SlxContextStack(false, false);
            List<String> destinations = new ArrayList<String>();
            stack.process(r);
            String spacing = TokenUtils.entityDecodeString("   ");

            for (SlxToken t : r.getTokens()) {
                stack.process(t); //Call this on each token.

                //Hidden from indexing, not from view. This is entirely separate from what ExportHiddenText does.
                boolean hidden = coll.collect(t, stack, r);

                if (!hidden && t.isTextOrEntity()) {
                    //Changed Dec 17 to include whitespace; omitting it was causing indexing errors where fields separated by whitespace were joined.
                    String s = t.markup;
                    if (t.isEntity()) s = TokenUtils.entityDecodeString(s);
                    contentSb.append(s);
                }
                if (t.matches("p|br|td|th|note") && !t.isOpening()) {
                    contentSb.append(spacing);
                }
                if (t.isTag() && t.matches("bookmark")) {
                    //Add bookmarks as-is.
                    doc.add(new StringField("destinations", t.get("name"), Field.Store.YES));
                }
            }
            doc.add(new TextField(conf.getDefaultField(), contentSb.toString(), Field.Store.YES));

            String folioSectionHeading = TokenUtils.entityDecodeString(r.getFullHeading(",", false, 20)).trim();
            doc.add(new TextField("folioSectionHeading", folioSectionHeading, Field.Store.YES));
            doc.add(new StoredField("title", r.getFullHeading(" - ", true, 2)));
            doc.add(new StoredField("heading", r.get("heading")));
            coll.flush();
        }
    }

    @Override
    public FileNode assignFileNode(XmlRecord xr, SlxRecord dirty_slx) throws InvalidMarkupException, IOException {
        return null;
    }

    @Override
    public void onRecordComplete(XmlRecord xr, FileNode file) throws InvalidMarkupException, IOException {
        //Add the URI.
        if (xr.get("uri") != null) doc.add(new StoredField("uri", xr.get("uri")));

        //Add the relative path and URI fragment.
        String relative_path = file.getAttributes().get("relative_path");
        String uri_fragment = file.getAttributes().get("uri_fragment");
        if (relative_path == null || uri_fragment == null) {
            throw new InvalidMarkupException("Both relative_path and uri_fragment must be defined on the FileNode for indexing");
        }
        doc.add(new StoredField("relative_path", relative_path));
        doc.add(new StoredField("uri_fragment", uri_fragment));

        if (xr.isRootRecord()) {
            //Configure field indexing based on the .DEF file.
            conf = new InfobaseFieldOptsSet(xr);
            doc.add(new StoredField("xml", xr.toXmlString(false)));
        }
        w.addDocument(doc);
    }

    @Override
    public void onRecordTransformed(XmlRecord r, SlxRecord dirty_slx) throws InvalidMarkupException, IOException {
    }

    @Override
    public void endInfobase(InfobaseConfig infobase) {
    }

    @Override
    public void endInfobaseSet(InfobaseSet set) throws IOException {
        try {
            w.commit();
        } finally {
            w.close();
        }
    }

    private Field addNonTokenizedField(String name, String value) {
        return new StringField(name, value.toLowerCase(Locale.ENGLISH).trim(), Field.Store.YES);
    }

    private Field addAnalyzedField(String name, String value) {
        if (value == null) value = ""; //Some records have no groups, which causes a null value.
        return new TextField(name, value, Field.Store.YES);
    }

    @Override
    public Analyzer getAnalyzer(String fieldName) {
        return conf.getAnalyzer(fieldName);
    }
}