// Grammar.java example

/*
 * Copyright 2011 Stefan Partusch
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.spartusch.nasfvi.server;

import java.io.File;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import jpl.Atom;
import jpl.Query;
import jpl.Term;
import jpl.Util;
import jpl.Variable;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryParser.core.QueryNodeException;

import de.spartusch.StringMethods;

/**
 * Interface to the natural language grammar. This class provides access to
 * the natural language grammar implemented in Prolog. This implementation
 * uses SWI-Prolog's JPL. The system property <code>java.library.path</code>
 * must include a path to SWI-Prolog for this implementation to work properly.
 * @author Stefan Partusch
 * @see <a href="http://www.swi-prolog.org/">www.swi-prolog.org</a>
 *
 */
public class Grammar {
	/** Grammatical tenses supported by the natural language grammar. */
	public enum Tense {
		pqperf, perf, praet, praes, fut1
	}

	private static final Logger LOGGER =
		Logger.getLogger(Grammar.class.getName());

	/**
	 * Fields whose values are handed to the grammar in lowercase and without
	 * quotation marks; values of all other fields are wrapped in quotation
	 * marks (see {@link #generate(String, Map)}).
	 */
	private static final String[] LOWERCASE_FIELDS = {"semester", "tag"};

	/** Binary operators printed infix with surrounding spaces. */
	private static final String[] SPACED_INFIX_OPS = {"und", "oder", "*"};

	/** Binary operators printed in functional notation. */
	private static final String[] BINDER_OPS = {"lam", "qu", "ex"};

	/**
	 * Loads the natural language grammar from a file. Consults a single
	 * file to an instance of SWI-Prolog. <code>file</code> must reference
	 * all required Prolog source files.
	 * @param file File to load the natural language grammar from
	 * @throws RuntimeException If consulting <code>file</code> fails
	 */
	public Grammar(final File file) {
		LOGGER.info("Using grammar " + file);
		Query consult = new Query("consult",
				new Term[] { new Atom(file.getAbsolutePath()) });

		if (!consult.hasSolution()) {
			String msg = "Consulting " + file + " failed";
			LOGGER.severe(msg);
			throw new RuntimeException(msg);
		}
	}

	/**
	 * Solves a Prolog goal and logs this. Returns the bindings of the goal's
	 * variables. Each binding maps a variable's name to its bound term.
	 * Only the first solution is requested. When logging at level INFO,
	 * bindings whose textual form has 100 or more characters are left out
	 * of the log message.
	 * @param goal Goal to solve
	 * @return Bindings of the goal's variables or <code>null</code> if the
	 * goal has no solution
	 */
	private Map<String, Term> solve(final Query goal) {
		// JPL returns an untyped map from variable names to terms
		@SuppressWarnings("unchecked")
		Map<String, Term> bindings = (Map<String, Term>) goal.oneSolution();

		if (LOGGER.isLoggable(Level.INFO)) {
			StringBuilder sb = new StringBuilder();

			sb.append(goal);

			if (bindings == null) {
				sb.append("\nNo solution");
			} else {
				for (Map.Entry<String, Term> entry : bindings.entrySet()) {
					String value = entry.getValue().toString();
					if (value.length() < 100) {
						sb.append("\n\t").append(entry.getKey());
						sb.append(" = ").append(value);
					}
				}
			}

			LOGGER.info(sb.toString());
		}

		return bindings;
	}

	/**
	 * Completes the input to sentences. This implementation calls
	 * <code>suggest/4</code> in the natural language grammar.
	 * @param input Input to suggest sentences for
	 * @return Suggestions for the input, sorted and without duplicates;
	 * empty if the grammar yields no solution
	 */
	public final Set<String> suggest(final String input) {
		Set<String> suggestions = new TreeSet<String>();
		Term[] args = new Term[] {
			new Atom(input),
			new jpl.Integer(8),
			new Variable("Markiertheit"),
			new Variable("Vorschlaege")
		};

		Map<String, Term> bindings = solve(new Query("suggest", args));

		if (bindings == null) {
			return suggestions;
		}

		for (Term t : Util.listToTermArray(bindings.get("Vorschlaege"))) {
			String[] tokens = Util.atomListToStringArray(t);
			suggestions.add(fromProlog(tokens).toString());
		}

		return suggestions;
	}

	/**
	 * Analyzes a sentence and creates a {@link NQuery} accordingly. This
	 * implementation calls <code>parse/5</code> in the natural language
	 * grammar.
	 * @param input The sentence to analyze
	 * @param analyzer Analyzer to use when creating the <code>NQuery</code>
	 * @return A <code>NQuery</code> according to the <code>input</code> or
	 * <code>null</code> if the grammar yields no solution
	 * @throws QueryNodeException If creating the <code>NQuery</code> fails
	 */
	public final NQuery parse(final String input, final Analyzer analyzer)
			throws QueryNodeException {
		Term[] args = new Term[] {
				new Atom(input),
				new Variable("Tempus"),
				new Variable("Query"),
				new Variable("SimilQuery"),
				new Variable("Gesucht")
		};

		Map<String, Term> bindings = solve(new Query("parse", args));

		if (bindings == null) {
			return null;
		}

		String tempus = bindings.get("Tempus").toString();
		String query = bindings.get("Query").toString();
		String similQuery = bindings.get("SimilQuery").toString();
		String[] fields = Util.atomListToStringArray(bindings.get("Gesucht"));

		return new NQuery(Tense.valueOf(tempus), query, similQuery, fields,
				analyzer);
	}

	/**
	 * Generates an answer to a question. The input is analyzed to obtain a
	 * skeleton to create the answer in natural language with by inserting
	 * the <code>answerValues</code>. The <code>answerValues</code> are a
	 * mapping from field names to the values of the fields. This
	 * implementation calls <code>beantworte/5</code> in the natural language
	 * grammar. The result is a JSON object with the members
	 * <code>AnalysisReq</code>, <code>AnalysisAns</code> and
	 * <code>Answer</code>.
	 * @param input The question in natural language to answer
	 * @param answerValues Values to insert into the answer
	 * @return Answer in natural language to <code>input</code>, wrapped in
	 * a JSON object together with the analyses of request and answer
	 * @throws AssertionError If the grammar yields no solution
	 */
	public final String generate(final String input,
			final Map<String, Set<String>> answerValues) {
		Term[] termValues = new Term[answerValues.size()];
		int i = 0;

		for (Map.Entry<String, Set<String>> e : answerValues.entrySet()) {
			String key = e.getKey();
			Set<String> values = e.getValue();
			// The decision depends on the field only, not on its values
			boolean lowercase =
				StringMethods.equalsOneOf(key, LOWERCASE_FIELDS);
			String[] arr = new String[values.size() + 1];
			int j = 0;

			// Each Prolog list starts with the field name
			arr[j++] = key;
			for (String value : values) {
				if (lowercase) {
					arr[j++] = value.toLowerCase(Locale.GERMAN);
				} else {
					// Quotation marks turn the value into a black box
					arr[j++] = "\"" + value + "\"";
				}
			}

			termValues[i++] = Util.stringArrayToList(arr);
		}

		Term[] args = new Term[] {
				new Atom(input),
				Util.termArrayToList(termValues),
				new Variable("AnalyseAnfrage"),
				new Variable("AnalyseAntwort"),
				new Variable("Antwort")
		};

		Map<String, Term> bindings = solve(new Query("beantworte", args));

		if (bindings == null) {
			throw new AssertionError("No bindings received");
		}

		String analysisReq = prettyPrint(bindings.get("AnalyseAnfrage"));
		String analysisAns = prettyPrint(bindings.get("AnalyseAntwort"));
		String[] ansTokens =
			Util.atomListToStringArray(bindings.get("Antwort"));
		String answer = toJsonString(fromProlog(ansTokens), true);

		StringBuilder sb = new StringBuilder("{\n\"AnalysisReq\": ");
		sb.append(analysisReq).append(",\n\"AnalysisAns\": ");
		sb.append(analysisAns).append(",\n\"Answer\": ");
		sb.append(answer).append("\n}\n");

		return sb.toString();
	}

	/**
	 * Pretty-prints the items of a Prolog list to a string. This
	 * implementation calls {@link #prettyPrint(Term)} on each item of
	 * <code>list</code> recursively and ignores the final empty list. Any
	 * term that is not a list cell yields an empty string.
	 * @param list List to print
	 * @return Pretty-printed items separated by <code>", "</code>, without
	 * enclosing brackets
	 */
	private String prettyPrintList(final Term list) {
		if (!list.isCompound() || !list.hasFunctor(".", 2)) {
			return "";
		}

		String head = prettyPrint(list.arg(1));
		String tail = prettyPrintList(list.arg(2));

		if (tail.isEmpty()) {
			return head;
		}

		return head + ", " + tail;
	}

	/**
	 * Pretty-prints a Prolog term to JSON. This implementation handles the
	 * operators defined in the natural language grammar ('?', '-', '>', '*',
	 * 'und', 'oder', 'lam', 'qu', 'ex') properly. Lists become JSON arrays,
	 * every other term becomes a single JSON string; quotation marks of
	 * nested results are removed from that string.
	 * @param term Term to pretty-print
	 * @return Pretty-printed term in JSON
	 */
	private String prettyPrint(final Term term) {
		String result = null;

		if (term.isCompound()) {
			if (term.hasFunctor(".", 2)) {
				// no 'JSONification' because prolog lists are valid JSON
				// arrays
				String head = prettyPrint(term.arg(1));
				String tail = prettyPrintList(term.arg(2));

				if (tail.isEmpty()) {
					return "[" + head + "]";
				}

				return "[" + head + ", " + tail + "]";
			} else if (term.hasFunctor("?", 1)) {
				result = "?" + prettyPrint(term.arg(1));
			} else if (term.hasFunctor("-", 2)) {
				result = prettyPrint(term.arg(1)) + "-"
					+ prettyPrint(term.arg(2));
			} else if (term.arity() == 2) {
				String op = term.name();
				Term t1 = term.arg(1);
				Term t2 = term.arg(2);

				if (StringMethods.equalsOneOf(op, SPACED_INFIX_OPS)) {
					result = prettyPrint(t1) + " " + op + " " + prettyPrint(t2);
				} else if (StringMethods.equalsOneOf(op, BINDER_OPS)) {
					// The first argument is printed verbatim
					result = op + "(" + t1.toString() + ", "
						+ prettyPrint(t2) + ")";
				} else if (op.equals(">")) {
					result = prettyPrint(t1) + op + prettyPrint(t2);
				}
			}
		}

		if (result == null) {
			// Atoms, numbers, variables and all other compounds
			result = term.toString();
		}

		return toJsonString(result, false);
	}

	/**
	 * Converts a string to a JSON string.
	 * @param string String to convert
	 * @param escape true to escape quotation marks, false to remove
	 * quotation marks
	 * @return A JSON string
	 */
	public static String toJsonString(final String string,
			final boolean escape) {
		return toJsonString(new StringBuilder(string), escape);
	}

	/**
	 * Creates a JSON string from a StringBuilder. The data of the
	 * StringBuilder is converted in-place to JSON: quotation marks are
	 * either escaped with a backslash or removed, then the content is
	 * enclosed in quotation marks. A quotation mark that is already
	 * preceded by a backslash is not escaped again. No other characters
	 * are escaped.
	 * @param builder StringBuilder to use
	 * @param escape true to escape quotation marks, false to remove
	 * quotation marks
	 * @return A JSON string
	 */
	public static String toJsonString(final StringBuilder builder,
			final boolean escape) {
		int pos = 0;

		while ((pos = builder.indexOf("\"", pos)) != -1) {
			if (!escape) {
				// The following character moves to pos, so pos must be
				// examined again to catch adjacent quotation marks
				builder.deleteCharAt(pos);
			} else if (pos == 0 || builder.charAt(pos - 1) != '\\') {
				builder.insert(pos, '\\');
				pos += 2; // skip the backslash and the quotation mark
			} else {
				pos++; // already escaped
			}
		}

		builder.insert(0, '"');
		builder.append('"');

		return builder.toString();
	}

	/**
	 * Processes a natural language sentence returned from Prolog. This method
	 * concatenates each token, restores German umlauts, quotes blackboxes and
	 * converts the first character to uppercase. Every token is followed by
	 * a single space, so a non-empty result ends with a space. A token
	 * starting with a quotation mark is a black box: its enclosing quotation
	 * marks and a leading <code>#</code> are dropped, its first character is
	 * converted to uppercase and it is quoted again only if it consists of
	 * multiple words. Empty tokens are skipped; <code>null</code> or no
	 * tokens yield an empty result.
	 * @param tokens Tokens of the sentence
	 * @return Processed sentence
	 * @see #toProlog(String)
	 */
	private static StringBuilder fromProlog(final String[] tokens) {
		StringBuilder sb = new StringBuilder();

		if (tokens == null) {
			return sb;
		}

		for (String tok : tokens) {
			if (tok.isEmpty()) {
				continue;
			}

			if (tok.charAt(0) != '"') {	// Token is no black box
				tok = tok.replace("Ae", "\u00C4");
				tok = tok.replace("Oe", "\u00D6");
				tok = tok.replace("Ue", "\u00DC");
				tok = tok.replace("ae", "\u00E4");
				tok = tok.replace("oe", "\u00F6");
				// "ue" stays as it is in front of a word-final "n"
				tok = tok.replaceAll("ue(?!n\\b)", "\u00FC");
				tok = tok.replace("ss", "\u00DF");
				sb.append(tok).append(" ");
			} else { // Token is black box
				// test if multiple words
				boolean quote = tok.indexOf(' ') != -1;
				int start = 1;

				if (tok.charAt(start) == '#') {
					start++; // skip #
				}

				if (quote) {
					sb.append('"');
				}
				sb.append(Character.toUpperCase(tok.charAt(start)));
				// Drops the closing quotation mark of the token
				sb.append(tok.substring(start + 1, tok.length() - 1));
				if (quote) {
					sb.append('"');
				}
				sb.append(" ");
			}
		}

		if (sb.length() > 0) {
			sb.setCharAt(0, Character.toUpperCase(sb.charAt(0)));
		}

		return sb;
	}

	/**
	 * Normalizes input for use with the natural language grammar in Prolog.
	 * This method converts the input to lowercase, encodes German umlauts
	 * with ASCII characters (ae, oe, ue, ...) and removes meta-characters not
	 * suitable for processing by the natural language grammar. Text between
	 * quotation marks (a black box) is copied unchanged. The input is
	 * processed code point by code point, so characters outside the Basic
	 * Multilingual Plane are handled as a single character.
	 * @param input Input to normalize
	 * @return Normalized <code>input</code> without leading and trailing
	 * whitespace
	 * @see #fromProlog(String[])
	 */
	public static String toProlog(final String input) {
		StringBuilder sb = new StringBuilder();
		boolean inBlackBox = false;
		int i = 0;

		while (i < input.length()) {
			int ch = input.codePointAt(i);
			// A supplementary code point occupies two chars
			i += Character.charCount(ch);

			if (ch == '"') {
				inBlackBox = !inBlackBox;
				sb.append('"');
			} else if (inBlackBox) {
				sb.appendCodePoint(ch);
			} else {
				appendNormalized(sb, Character.toLowerCase(ch));
			}
		}

		return sb.toString().trim();
	}

	/**
	 * Appends a lowercase code point from outside of a black box in its
	 * normalized form: umlauts and sharp s are transliterated, a backslash
	 * becomes a slash, letters, digits, '/', '.' and '-' are kept and
	 * everything else becomes a space.
	 * @param sb Builder to append to
	 * @param ch Lowercase code point to normalize
	 */
	private static void appendNormalized(final StringBuilder sb,
			final int ch) {
		switch (ch) {
			case '\u00E4':
				sb.append("ae");
				break;
			case '\u00F6':
				sb.append("oe");
				break;
			case '\u00FC':
				sb.append("ue");
				break;
			case '\u00DF':
				sb.append("ss");
				break;
			case '\\':
				sb.append("/");
				break;
			default:
				if (Character.isLetterOrDigit(ch)
						|| ch == '/' || ch == '.' || ch == '-') {
					sb.appendCodePoint(ch);
				} else {
					sb.append(" ");
				}
		}
	}
}