package com.yahoo.glimmer.query; /* * Copyright (c) 2012 Yahoo! Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software distributed under the License is * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and limitations under the License. * See accompanying LICENSE file. */ import it.unimi.di.big.mg4j.index.Index; import it.unimi.di.big.mg4j.index.IndexIterator; import it.unimi.di.big.mg4j.index.TermProcessor; import it.unimi.di.big.mg4j.query.nodes.AbstractQueryBuilderVisitor; import it.unimi.di.big.mg4j.query.nodes.Align; import it.unimi.di.big.mg4j.query.nodes.And; import it.unimi.di.big.mg4j.query.nodes.Annotation; import it.unimi.di.big.mg4j.query.nodes.Consecutive; import it.unimi.di.big.mg4j.query.nodes.Difference; import it.unimi.di.big.mg4j.query.nodes.False; import it.unimi.di.big.mg4j.query.nodes.LowPass; import it.unimi.di.big.mg4j.query.nodes.MultiTerm; import it.unimi.di.big.mg4j.query.nodes.Not; import it.unimi.di.big.mg4j.query.nodes.Or; import it.unimi.di.big.mg4j.query.nodes.OrderedAnd; import it.unimi.di.big.mg4j.query.nodes.Prefix; import it.unimi.di.big.mg4j.query.nodes.Query; import it.unimi.di.big.mg4j.query.nodes.QueryBuilderVisitor; import it.unimi.di.big.mg4j.query.nodes.QueryBuilderVisitorException; import it.unimi.di.big.mg4j.query.nodes.Range; import it.unimi.di.big.mg4j.query.nodes.Remap; import it.unimi.di.big.mg4j.query.nodes.Select; import it.unimi.di.big.mg4j.query.nodes.Term; import it.unimi.di.big.mg4j.query.nodes.True; import it.unimi.di.big.mg4j.query.nodes.Weight; import it.unimi.di.big.mg4j.query.parser.QueryParser; import it.unimi.di.big.mg4j.query.parser.QueryParserException; import it.unimi.di.big.mg4j.query.parser.SimpleParser; import it.unimi.di.big.mg4j.search.DocumentIterator; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.fastutil.objects.ObjectArrayList; import it.unimi.dsi.lang.MutableString; import java.io.IOException; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.log4j.Logger; public class RDFQueryParser implements QueryParser { private final static Logger LOGGER = Logger.getLogger(RDFQueryParser.class); private SimpleParser parser; private Index alignmentIndex; private List<String> properties; private Map<String,String> fieldNameSuffixToFieldNameMap; private Set<String> fieldNamesSet; private String defaultField; private Map<String, ? extends TermProcessor> termProcessors; private Object2LongFunction<CharSequence> resourcesMap; private final static Pattern RESOURCE_PATTERN = Pattern.compile("\\((http://.*)\\)"); private final static Pattern FIELD_NAME_PATTEN = Pattern.compile("(\\w+):"); public RDFQueryParser(Index alignmentIndex, List<String> properties, Map<String,String> fieldNameSuffixToFieldNameMap, String defaultField, final Map<String, ? extends TermProcessor> termProcessors, final Object2LongFunction<CharSequence> resourcesMap) { init(alignmentIndex, properties, new HashSet<String>(fieldNameSuffixToFieldNameMap.values()), fieldNameSuffixToFieldNameMap, defaultField, termProcessors, resourcesMap); } protected void init(Index alignmentIndex, List<String> properties, Set<String> fieldNamesSet, Map<String,String> fieldNameSuffixToFieldNameMap, String defaultField, final Map<String, ? extends TermProcessor> termProcessors, final Object2LongFunction<CharSequence> resourcesMap) { this.alignmentIndex = alignmentIndex; this.properties = properties; this.fieldNameSuffixToFieldNameMap = fieldNameSuffixToFieldNameMap; this.fieldNamesSet = fieldNamesSet; this.defaultField = defaultField; this.termProcessors = termProcessors; this.resourcesMap = resourcesMap; parser = new SimpleParser(fieldNamesSet, defaultField, termProcessors); } public final static String cleanQuery(String query) { // Replace a dotted word such as www.yahoo.com with a phrase query // without dots String[] parts = query.split("[ ]+"); String yahooQuery = ""; for (String part : parts) { if (part.contains(".")) { if (part.contains("\"")) { // The query is already a phrase query part = part.replaceAll("\\.", " "); } else { // Make it a phrase query part = part.replaceAll("\\.", " "); if (!part.trim().equals("")) part = "\"" + part + "\""; } } yahooQuery += part + " "; } StringBuffer result = new StringBuffer(); for (int i = 0; i < yahooQuery.length(); i++) { if (!Character.isLetterOrDigit(yahooQuery.charAt(i)) && !(yahooQuery.charAt(i) == '"') && !(yahooQuery.charAt(i) == ':')) { result.append(" "); } else { result.append(yahooQuery.charAt(i)); } } // Normalize whitespace return result.toString().trim().toLowerCase().replaceAll("[\\s]+", " "); } @Override public Query parse(String unparsed) throws QueryParserException { if (unparsed == null || unparsed.equals("")) { throw new QueryParserException("Empty query"); } Query query = null; try { LOGGER.info("Unparsed query:" + unparsed); // Replace the short field names there corresponding field name. // name:tad -> http_xmlns_com_foaf_0_1_name:tad Matcher m = FIELD_NAME_PATTEN.matcher(unparsed); if (m.find()) { StringBuffer sb = new StringBuffer(); do { String fieldName = m.group(1); String fullFieldName = fieldNameSuffixToFieldNameMap.get(fieldName); if (fullFieldName != null) { m.appendReplacement(sb, fullFieldName); } else { m.appendReplacement(sb, fieldName); } sb.append(':'); } while (m.find()); m.appendTail(sb); unparsed = sb.toString(); } LOGGER.info("Query after feild name expansion:" + unparsed); m = RESOURCE_PATTERN.matcher(unparsed); if (m.find()) { StringBuffer sb = new StringBuffer(); do { m.appendReplacement(sb, Long.toString(resourcesMap.get(m.group(1)))); } while (m.find()); m.appendTail(sb); unparsed = sb.toString(); } LOGGER.info("Query after resource encoding:" + unparsed); query = parser.parse(unparsed); LOGGER.info("Query as parsed by MG4J:" + query); query = query.accept(new MyVisitor()); LOGGER.info("Query after expansion:" + query); } catch (QueryBuilderVisitorException e) { throw new QueryParserException(e); } return query; } @Override public String escape(String token) { return parser.escape(token); } @Override public MutableString escape(MutableString token) { return parser.escape(token); } @Override public Query parse(MutableString query) throws QueryParserException { return parse(query.toString()); } @Override public QueryParser copy() { return new RDFQueryParser(alignmentIndex, properties, fieldNameSuffixToFieldNameMap, defaultField, termProcessors, resourcesMap); } public class MyVisitor extends AbstractQueryBuilderVisitor<Query> { private boolean insideConsecutive = false; private boolean insideSelect = false; public Query[] newArray(int len) { return new Query[len]; } public QueryBuilderVisitor<Query> prepare() { return this; } public boolean visitPre(Consecutive node) throws QueryBuilderVisitorException { insideConsecutive = true; return true; } public boolean visitPre(Select node) throws QueryBuilderVisitorException { insideSelect = true; return true; } @Override public Query visit(Term term) throws QueryBuilderVisitorException { // Don't rewrite terms inside Consecutive if (insideConsecutive || insideSelect) { return term; } // NOTE: this Term node might be already inside a Select final ObjectArrayList<Query> disjuncts = new ObjectArrayList<Query>(); IndexIterator ii; try { if (alignmentIndex != null) { ii = alignmentIndex.documents(term.term); if (ii.mayHaveNext()) { long f = ii.nextDocument(); while (f != DocumentIterator.END_OF_LIST) { if (fieldNamesSet.contains(properties.get((int)f))) { // System.err.println( "From vertical index: " + // properties.get( f ) ); disjuncts.add(new Select(properties.get((int)f), term)); } f = ii.nextDocument(); } } ii.dispose(); } else { // No alignment index: we look in all fields for (String field : fieldNamesSet) { disjuncts.add(new Select(field, term)); } } disjuncts.add(new Select(defaultField, term)); } catch (IOException e) { throw new QueryBuilderVisitorException(e); } if (disjuncts.size() > 1) { return new Or(disjuncts.toArray(new Query[disjuncts.size()])); } else { return disjuncts.get(0); } } @Override public Query visit(Prefix node) throws QueryBuilderVisitorException { return node; } @Override public Query visit(Range node) throws QueryBuilderVisitorException { return node; } @Override public Query visit(True node) throws QueryBuilderVisitorException { return node; } @Override public Query visit(False node) throws QueryBuilderVisitorException { return node; } public Query visitPost(And node, Query[] subNode) throws QueryBuilderVisitorException { return new And(subNode); } public Query visitPost(Consecutive node, Query[] subNode) throws QueryBuilderVisitorException { insideConsecutive = false; if (insideSelect) { return new Consecutive(subNode); } // Create a disjunct of selects on all fields final ObjectArrayList<Query> disjuncts = new ObjectArrayList<Query>(); for (String property : properties) { if (fieldNamesSet.contains(property)) { disjuncts.add(new Select(property, new Consecutive(subNode))); } } disjuncts.add(new Select(defaultField, new Consecutive(subNode))); return new Or(disjuncts.toArray(new Query[disjuncts.size()])); } public Query visitPost(OrderedAnd node, Query[] subNode) throws QueryBuilderVisitorException { return new OrderedAnd(subNode); } public Query visitPost(Difference node, Query[] subNode) throws QueryBuilderVisitorException { return new Difference(subNode[0], subNode[1]); } public Query visitPost(LowPass node, Query subNode) throws QueryBuilderVisitorException { return new LowPass(subNode, node.k); } public Query visitPost(Not node, Query subNode) throws QueryBuilderVisitorException { return new Not(subNode); } public Query visitPost(Or node, Query[] subNode) throws QueryBuilderVisitorException { return new Or(subNode); } public Query visitPost(Align node, Query[] subNode) throws QueryBuilderVisitorException { return new Align(subNode[0], subNode[1]); } public Query visitPost(MultiTerm node, Query[] subNode) throws QueryBuilderVisitorException { return new Or(subNode); } public Query visitPost(Select node, Query subNode) throws QueryBuilderVisitorException { insideSelect = false; return new Select(node.index, subNode); } public Query visitPost(Remap node, Query subNode) throws QueryBuilderVisitorException { return new Remap(subNode, node.indexRemapping); } public Query visitPost(Weight node, Query subNode) throws QueryBuilderVisitorException { return new Weight(node.weight, subNode); } @Override public MyVisitor copy() { return new MyVisitor(); } @Override public Query visitPost(Annotation node, Query subNode) throws QueryBuilderVisitorException { return new Annotation( subNode ); } } }