/* * eXist Open Source Native XML Database * Copyright (C) 2001-2015 The eXist Project * http://exist-db.org * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ package org.exist.indexing.lucene; import java.io.IOException; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.*; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.exist.dom.QName; import org.exist.dom.persistent.SymbolTable; import org.exist.numbering.NodeId; import org.exist.storage.BrokerPool; import org.exist.util.ByteConversion; import javax.xml.XMLConstants; public class LuceneUtil { public static final String FIELD_NODE_ID = "nodeId"; public static final String FIELD_DOC_ID = "docId"; public static final String FIELD_DOC_URI = "docUri"; public static byte[] createId(int docId, NodeId nodeId) { // build id from nodeId and docId byte[] data = new byte[nodeId.size() + 4]; ByteConversion.intToByteH(docId, data, 0); nodeId.serialize(data, 4); return data; } public static byte[] createId(NodeId nodeId) { byte[] data = new byte[nodeId.size()]; nodeId.serialize(data, 0); return data; } public static NodeId readNodeId(int doc, BinaryDocValues nodeIdValues, BrokerPool pool) { final BytesRef ref = nodeIdValues.get(doc); final int units = ByteConversion.byteToShort(ref.bytes, ref.offset); return pool.getNodeFactory().createFromData(units, ref.bytes, ref.offset + 2); } /** * Encode an element or attribute qname into a lucene field name using the * internal ids for namespace and local name. * * @param qname * @return encoded qname */ public static String encodeQName(QName qname, SymbolTable symbols) { short namespaceId = symbols.getNSSymbol(qname.getNamespaceURI()); short localNameId = symbols.getSymbol(qname.getLocalPart()); long nameId = qname.getNameType() | (namespaceId & 0xFFFF) << 16 | (localNameId & 0xFFFFFFFFL) << 32; return Long.toHexString(nameId); } /** * Decode the lucene field name into an element or attribute qname. * * @param s * @return the qname */ public static QName decodeQName(String s, SymbolTable symbols) { try { long l = Long.parseLong(s, 16); short namespaceId = (short) ((l >>> 16) & 0xFFFFL); short localNameId = (short) ((l >>> 32) & 0xFFFFL); byte type = (byte) (l & 0xFFL); String namespaceURI = symbols.getNamespace(namespaceId); String localName = symbols.getName(localNameId); return new QName(localName, namespaceURI, XMLConstants.DEFAULT_NS_PREFIX, type); } catch (NumberFormatException e) { return null; } } public static String[] extractFields(Query query, IndexReader reader) throws IOException { Map<Object, Query> map = new TreeMap<>(); extractTerms(query, map, reader, true); Set<String> fields = new TreeSet<>(); for (Object term : map.keySet()) { fields.add(((Term)term).field()); } String[] fieldArray = new String[fields.size()]; return fields.toArray(fieldArray); } /** * Extract all terms which would be matched by a given query. * The terms are put into a map with the term as key and the * corresponding query object as value. * * This method is used by {@link LuceneMatchListener} * to highlight matches in the search results. * * @param query * @param terms * @throws IOException in case of an error * @throws UnsupportedOperationException in case of an error */ public static void extractTerms(Query query, Map<Object, Query> terms, IndexReader reader, boolean includeFields) throws IOException, UnsupportedOperationException { if (query instanceof BooleanQuery) extractTermsFromBoolean((BooleanQuery)query, terms, reader, includeFields); else if (query instanceof TermQuery) extractTermsFromTerm((TermQuery) query, terms, includeFields); else if (query instanceof WildcardQuery) extractTermsFromWildcard((WildcardQuery) query,terms, reader, includeFields); else if (query instanceof RegexpQuery) extractTermsFromRegex((RegexpQuery) query, terms, reader, includeFields); else if (query instanceof FuzzyQuery) extractTermsFromFuzzy((FuzzyQuery) query, terms, reader, includeFields); else if (query instanceof PrefixQuery) extractTermsFromPrefix((PrefixQuery) query, terms, reader, includeFields); else if (query instanceof PhraseQuery) extractTermsFromPhrase((PhraseQuery) query, terms, includeFields); else { // fallback to Lucene's Query.extractTerms if none of the // above matches Set<Term> tempSet = new TreeSet<>(); query.extractTerms(tempSet); for (Term t : tempSet) { if (includeFields) terms.put(t, query); else terms.put(t.text(), query); } } } private static void extractTermsFromBoolean(BooleanQuery query, Map<Object, Query> terms, IndexReader reader, boolean includeFields) throws IOException { BooleanClause clauses[] = query.getClauses(); for (BooleanClause clause : clauses) { extractTerms(clause.getQuery(), terms, reader, includeFields); } } private static void extractTermsFromTerm(TermQuery query, Map<Object, Query> terms, boolean includeFields) { if (includeFields) terms.put(query.getTerm(), query); else terms.put(query.getTerm().text(), query); } private static void extractTermsFromWildcard(WildcardQuery query, Map<Object, Query> terms, IndexReader reader, boolean includeFields) throws IOException { extractTermsFromMultiTerm(query, terms, reader, includeFields); } private static void extractTermsFromRegex(RegexpQuery query, Map<Object, Query> terms, IndexReader reader, boolean includeFields) throws IOException { extractTermsFromMultiTerm(query, terms, reader, includeFields); } private static void extractTermsFromFuzzy(FuzzyQuery query, Map<Object, Query> terms, IndexReader reader, boolean includeFields) throws IOException { extractTermsFromMultiTerm(query, terms, reader, includeFields); } private static void extractTermsFromPrefix(PrefixQuery query, Map<Object, Query> terms, IndexReader reader, boolean includeFields) throws IOException { extractTermsFromMultiTerm(query, terms, reader, includeFields); } private static void extractTermsFromPhrase(PhraseQuery query, Map<Object, Query> terms, boolean includeFields) { Term[] t = query.getTerms(); for (Term t1 : t) { if (includeFields) { terms.put(t1, query); } else { terms.put(t1.text(), query); } } } private static Query rewrite(MultiTermQuery query, IndexReader reader) throws IOException { query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); return query.rewrite(reader); } private static void extractTermsFromMultiTerm(MultiTermQuery query, Map<Object, Query> termsMap, IndexReader reader, boolean includeFields) throws IOException { TERM_EXTRACTOR.extractTerms(query, termsMap, reader, includeFields); } private static final MultiTermExtractor TERM_EXTRACTOR = new MultiTermExtractor(); /* * A class for extracting MultiTerms (all of them). * Subclassing MultiTermQuery.RewriteMethod * to gain access to its protected method getTermsEnum */ private static class MultiTermExtractor extends MultiTermQuery.RewriteMethod { public void extractTerms(MultiTermQuery query, Map<Object, Query> termsMap, IndexReader reader, boolean includeFields) throws IOException { IndexReaderContext topReaderContext = reader.getContext(); for (AtomicReaderContext context : topReaderContext.leaves()) { final Fields fields = context.reader().fields(); if (fields == null) { // reader has no fields continue; } final Terms terms = fields.terms(query.getField()); if (terms == null) { // field does not exist continue; } TermsEnum termsEnum = getTermsEnum(query, terms, new AttributeSource()); assert termsEnum != null; if (termsEnum == TermsEnum.EMPTY) { continue; } BytesRef bytes; while ((bytes = termsEnum.next()) != null) { Term term = new Term(query.getField(), BytesRef.deepCopyOf(bytes)); if (includeFields) { termsMap.put(term, query); } else { termsMap.put(term.text(), query); } } } } @Override public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { throw new UnsupportedOperationException(); } }; }