/* * Copyright 2011 Stefan Partusch * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.spartusch.nasfvi.server; import java.util.GregorianCalendar; import java.util.HashSet; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.core.QueryNodeException; import org.apache.lucene.queryParser.core.processors.QueryNodeProcessorPipeline; import org.apache.lucene.queryParser.standard.StandardQueryParser; import org.apache.lucene.queryParser.standard.config.StandardQueryConfigHandler.Operator; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; import de.spartusch.StringMethods; /** * A query for use with {@link NSearcher}. * @author Stefan Partusch * */ public class NQuery { /** The default search field in Lucene. */ private static final String DEFAULT_SEARCH_FIELD = "titel"; /** Fields in the original query to collapse. */ private static final String[] FIELDS_TO_COLLAPSE = new String[]{"raum", "tag"}; /** Name of the field to collapse fields to. */ private static final String COLLAPSE_TO = "termin"; /** Regular expression to extract day information * from {@link collapseTo}. */ private static final Pattern COLLAPSED_FIELD_TAG = Pattern.compile("(?:^| )?(mo|di|mi|do|fr|sa|so)\\b", Pattern.CASE_INSENSITIVE); /** Regular expression to extract location information * from {@link collapseTo}. */ private static final Pattern COLLAPSED_FIELD_RAUM = Pattern.compile("(Rechnerraum(?: \\w+)?)|Raum (.+)|(\\w+straße.*)", Pattern.CASE_INSENSITIVE); /** The primary query. */ private Query query; /** The similarity query. */ private Query similQuery; /** Names of fields with values for the answer in natural language. */ private Set<String> answerFields; /** Analyzer used for creating Lucene queries. */ private Analyzer analyzer; /** true if the user query contains a semester. */ private boolean semesterQueried; /** Tense of the natural language question that forms the basis of * this query. */ private Grammar.Tense tense; /** * Creates a new query. * @param tense Grammatical tense of the question that forms the basis * of the new query * @param queryString Lucene query as returned by * the {@link Grammar grammar} * @param similQueryString Lucene query for similarity as returned * by the grammar * @param answerFields Names of fields with values for the answer * to be generated * @param analyzer Analyzer to use for handling Lucene queries * @throws QueryNodeException If parsing the query strings fails * @see {@link Grammar#parse(String, Analyzer) Grammar.parse} */ public NQuery(final Grammar.Tense tense, final String queryString, final String similQueryString, final String[] answerFields, final Analyzer analyzer) throws QueryNodeException { StandardQueryParser qp = new StandardQueryParser(analyzer); qp.setDefaultOperator(Operator.AND); QueryNodeProcessorPipeline processors = (QueryNodeProcessorPipeline) qp.getQueryNodeProcessor(); processors.add(new FieldsCollapsingProcessor(FIELDS_TO_COLLAPSE, COLLAPSE_TO, 50)); String qs = queryString.replaceAll("'", ""); if (qs.isEmpty()) { query = new MatchAllDocsQuery(); } else { query = qp.parse(qs, DEFAULT_SEARCH_FIELD); } String sqs = similQueryString.replaceAll("'", ""); if (!sqs.isEmpty()) { similQuery = qp.parse(sqs, DEFAULT_SEARCH_FIELD); } if (queryString.contains("semester:\"") || similQueryString.contains("semester:\"")) { semesterQueried = true; } else { semesterQueried = false; query = interpretTense(tense, query); if (similQuery != null) { similQuery = interpretTense(tense, similQuery); } } this.analyzer = analyzer; this.tense = tense; this.answerFields = new HashSet<String>(); for (String answField : answerFields) { this.answerFields.add(mapFieldname(answField)); } } public final Query getQuery() { return query; } public final Query getSimilarityQuery() { return similQuery; } public final boolean hasSimilarityQuery() { return similQuery != null; } public final Set<String> getFieldsToAnswer() { return answerFields; } public final Analyzer getAnalyzer() { return analyzer; } /** * Maps "virtual" field names to actual field names. That is "zeit" to * "semester" or, if the grammatical tense of the question is present or * the query contains a semester, to "tag" and "ort" to "raum". * @param field Field name to map * @return Mapped field name */ private String mapFieldname(final String field) { if ("zeit".equals(field)) { if (semesterQueried || Grammar.Tense.praes.equals(tense) || answerFields.contains("semester")) { return "tag"; } return "semester"; } else if ("ort".equals(field)) { return "raum"; } return field; } /** * Returns a representation of this query in JSON. */ @Override public final String toString() { StringBuilder sb = new StringBuilder(); // { // "Query": query, // "SQuery": similQuery, // "Fields": [answerFields] // } sb.append("{\n\"Query\": "); sb.append(Grammar.toJsonString(query.toString(), true)); sb.append(",\n\"SQuery\": "); if (hasSimilarityQuery()) { sb.append(Grammar.toJsonString(similQuery.toString(), true)); } else { sb.append("\"\""); } sb.append(",\n\"Fields\": ["); boolean first = true; for (String f : answerFields) { if (first) { first = false; } else { sb.append(", "); } sb.append("\"").append(f).append("\""); } sb.append("]\n}"); return sb.toString(); } /** * Extracts values for collapsed fields. Some fields are collapsed, i.e. * are merged into a single field. This methods extracts values for these * fields from the single field. * @param field Name of a collapsed field * @param value Value of the single field to extract from * @return Extracted value * @throws AssertionError if <code>field</code> is not a collapsed field */ public static String extractValue(final String field, final String value) { Pattern pattern = null; if ("tag".equals(field)) { pattern = COLLAPSED_FIELD_TAG; } else if ("raum".equals(field)) { pattern = COLLAPSED_FIELD_RAUM; } else { throw new AssertionError(); } Matcher m = pattern.matcher(value); if (m.find()) { for (int i = 1; i <= m.groupCount(); i++) { String match = m.group(i); if (match != null) { return match; } } } return "(Unbekannt)"; } /** * Tests if a field is to be collapsed. Such fields are merged with * other fields into a single field. * @param field Name of the field to test * @return true if <code>field</code> names a field to collapse */ public static boolean isFieldToCollapse(final String field) { return StringMethods.equalsOneOf(field, FIELDS_TO_COLLAPSE); } /** * Returns the name of the single field other fields are collapsed * to, i.e. are merged into. * @return Name of the field other fields are collapsed to */ public static String getMergedField() { return COLLAPSE_TO; } /** * Interprets the grammatical tense and extends the <code>query</code> * accordingly. * @param tense Grammatical tense to interpret * @param query Query to extend * @return Extended query according to the grammatical tense */ private static Query interpretTense(final Grammar.Tense tense, final Query query) { Semester now = new Semester(); Query tenseQuery; switch(tense) { case pqperf: int year = new GregorianCalendar().get(GregorianCalendar.YEAR) - 1; tenseQuery = new TermRangeQuery("semester_end", "19700101", Integer.toString(year) + "0221", true, false); break; case perf: tenseQuery = new TermRangeQuery("semester_beg", "19700101", now.getBegin(), true, false); break; case praet: tenseQuery = new TermRangeQuery("semester_beg", "19700101", now.getBegin(), true, true); break; case praes: tenseQuery = new TermQuery(new Term("semester", now.getCanonical())); break; case fut1: tenseQuery = new TermRangeQuery("semester_end", now.getEnd(), "29991231", false, true); break; default: throw new AssertionError(); } BooleanQuery booleanQuery = new BooleanQuery(); booleanQuery.add(query, BooleanClause.Occur.MUST); booleanQuery.add(tenseQuery, BooleanClause.Occur.MUST); return booleanQuery; } }