package folioxml.lucene.folioQueryParser; import folioxml.core.InvalidMarkupException; import folioxml.core.TokenUtils; import folioxml.lucene.folioQueryParser.QueryToken.TokenType; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.*; import java.io.IOException; import java.io.StringReader; import java.util.List; import java.util.Locale; /* * Supports field and group searches. Boolean Does not support wild */ public class QueryParser { public QueryParser(Analyzer analyzer, String defaultField) { this.analyzer = analyzer; this.defaultField = defaultField; } public Query parse(String s) throws IOException, InvalidMarkupException { return parse(new QueryTokenReader(s)); } public Query parse(QueryTokenReader r) throws IOException, InvalidMarkupException { return parse(r.readAll()); } public Query parse(List<QueryToken> tokens) throws InvalidMarkupException, IOException { QueryToken t = new QueryToken(TokenType.None, ""); t.children = tokens; t.ParseChildrenIntoTree(); return Convert(t); } Analyzer analyzer; String defaultField = "contents"; protected Query Convert(QueryToken t) throws InvalidMarkupException, IOException { if (t.type == TokenType.None || t.type == TokenType.OpenGroup) { if (t.children == null || t.children.size() == 0) return null; if (t.children.size() == 1) return Convert(t.children.get(0)); //Otherwise, make a boolean query. BooleanQuery.Builder q = new BooleanQuery.Builder(); for (int i = 0; i < t.children.size(); i++) { Query c = Convert(t.children.get(i)); if (c != null) q.add(c, Occur.MUST); } if (q.build().clauses().size() > 0) return q.build(); else return null; } if (t.type == TokenType.OpenField) { //Parse the field type and name out. String type = t.headers.get(0).text; if (TokenUtils.fastMatches("headings|partition|rank|weight|server", type)) throw new InvalidMarkupException("Support for [" + type + " ...] in queries is not yet implemented.", t.headers.get(0)); if (TokenUtils.fastMatches("contents|field|group|highlighter|level|popup|note", type)) { t.headers.remove(0); } else { type = "Field"; } if (TokenUtils.fastMatches("contents|field|highlighter|level|popup|note", type)) { //Concatenate the text from the tokens to find the field/highlighter/level/popup/note 'field name' to search on. String header = TokenUtils.fastMatches("popup|note", type) ? (type + "s") : ""; for (int i = 0; i < t.headers.size(); i++) { QueryToken h = t.headers.get(i); header += h.text; if (h.children != null && h.children.size() > 0) throw new InvalidMarkupException("Invalid character in field, level, or highlighter name - #, @ or /"); } //So, now we have a field name. Pass it down to all children so they can be created properly. if (!TokenUtils.fastMatches("level|contents", type)) t.setFieldNameRecursive(header.trim()); //Piggyback off the () query creation QueryToken n = new QueryToken(TokenType.OpenGroup, "("); n.children = t.children; Query q = Convert(n); if (!TokenUtils.fastMatches("level|contents", type)) { //Now, we have to do something special if there are no children. if (q == null) return new PrefixQuery(new Term(t.fieldName, "*")); else return q; } else if (TokenUtils.fastMatches("level", type)) { //Levelqueries are really an AND query, they don't change the field name. if (q == null) return null; BooleanQuery.Builder bq = new BooleanQuery.Builder(); bq.add(q, Occur.MUST); bq.add(new TermQuery(new Term("level", header.trim())), Occur.MUST); return bq.build(); } else if (TokenUtils.fastMatches("contents", type)) { //We need to drop apostrophes around headings //contents queries don't change the field name. TermQuery tocQuery = new TermQuery(new Term("folioSectionHeading", header.replace("'", "").trim().toLowerCase(Locale.ENGLISH))); if (q == null) return tocQuery; BooleanQuery.Builder bq = new BooleanQuery.Builder(); bq.add(q, Occur.MUST); bq.add(tocQuery, Occur.MUST); return bq.build(); } } else if (TokenUtils.fastMatches("group", type)) { t.setFieldNameRecursive("groups"); String header = ""; for (int i = 0; i < t.headers.size(); i++) { QueryToken h = t.headers.get(i); header += h.text; if (h.children != null && h.children.size() > 0) throw new InvalidMarkupException("Invalid character in field header - #, @ or /"); } return parseSimpleQuery("groups", header.trim()); } } if (t.type == TokenType.Term) { //+ - && || ! ( ) { } [ ] ^ " ~ * ? : \ return parseSimpleQuery(t.fieldName != null ? t.fieldName : defaultField, t.text); } if (t.type == TokenType.TermSuffix) { //For proximity searches //TODO: Implement proximity searches return Convert(t.children.get(0)); } if (t.type == TokenType.Not) { Query c = Convert(t.children.get(0)); if (c == null) return null; BooleanQuery.Builder q = new BooleanQuery.Builder(); q.add(c, Occur.MUST_NOT); return q.build(); } if (t.type == TokenType.Or) { BooleanQuery.Builder q = new BooleanQuery.Builder(); for (int i = 0; i < t.children.size(); i++) { Query c = Convert(t.children.get(i)); if (c != null) q.add(c, Occur.SHOULD); } if (q.build().clauses().size() > 0) return q.build(); else return null; } if (t.type == TokenType.Xor) { Query c1 = Convert(t.children.get(0)); Query c2 = Convert(t.children.get(1)); if (c1 == null && c2 != null) return c2; if (c1 != null && c2 == null) return c1; if (c1 == null && c2 == null) return null; BooleanQuery.Builder qa = new BooleanQuery.Builder(); qa.add(c1, Occur.MUST); qa.add(c2, Occur.MUST_NOT); BooleanQuery.Builder qb = new BooleanQuery.Builder(); qb.add(c2, Occur.MUST); qb.add(c1, Occur.MUST_NOT); BooleanQuery.Builder q = new BooleanQuery.Builder(); q.add(qa.build(), Occur.SHOULD); q.add(qb.build(), Occur.SHOULD); return q.build(); } return null; } public Query parseSimpleQuery(String fieldName, String text) throws IOException { //Fix doubled single quotes, strip outer single quotes. if (text.startsWith("'") && (text.endsWith("'"))) { text = text.replace("''", "'"); //Fix doubled apostrohes text = text.substring(1, text.length() - 1); //Remove the apostrophes. } boolean phraseQuery = text.startsWith("\""); if (phraseQuery) text = text.substring(1, text.length() - 1); //Remove the quotes, we don't need them anymore. TokenStream s = analyzer.tokenStream(fieldName, new StringReader(text)); s.reset(); try { if (phraseQuery) { PhraseQuery.Builder q = new PhraseQuery.Builder(); while (s.incrementToken()) { String term = s.getAttribute(CharTermAttribute.class).toString(); if (term != null && term.length() > 0) q.add(new Term(fieldName, term)); } return q.build(); } else { BooleanQuery.Builder q = new BooleanQuery.Builder(); while (s.incrementToken()) { String term = s.getAttribute(CharTermAttribute.class).toString(); if (term != null && term.length() > 0) q.add(new TermQuery(new Term(fieldName, term)), Occur.MUST); } return q.build(); } } finally { s.end(); s.close(); } } }