// This file is part of AceWiki.
// Copyright 2008-2013, AceWiki developers.
//
// AceWiki is free software: you can redistribute it and/or modify it under the terms of the GNU
// Lesser General Public License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// AceWiki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
// even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License along with AceWiki. If
// not, see http://www.gnu.org/licenses/.
package ch.uzh.ifi.attempto.acewiki.gf;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Functions;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;
import ch.uzh.ifi.attempto.acewiki.core.Ontology;
import ch.uzh.ifi.attempto.gfservice.GfModule;
import ch.uzh.ifi.attempto.gfservice.GfParseResult;
import ch.uzh.ifi.attempto.gfservice.GfService;
import ch.uzh.ifi.attempto.gfservice.GfServiceException;
import ch.uzh.ifi.attempto.gfservice.GfServiceResultBrowseAll;
import ch.uzh.ifi.attempto.gfservice.GfServiceResultComplete;
import ch.uzh.ifi.attempto.gfservice.GfServiceResultGrammar;
import ch.uzh.ifi.attempto.gfservice.GfServiceResultLinearize;
import ch.uzh.ifi.attempto.gfservice.GfServiceResultLinearizeAll;
import ch.uzh.ifi.attempto.gfservice.GfServiceResultParse;
import ch.uzh.ifi.attempto.gfservice.GfServiceResultRandom;
import ch.uzh.ifi.attempto.gfservice.GfStorage;
import ch.uzh.ifi.attempto.gfservice.GfStorageResult;
import ch.uzh.ifi.attempto.gfservice.GfStorageResultLs;
import ch.uzh.ifi.attempto.gfservice.gfwebservice.GfWebService;
import ch.uzh.ifi.attempto.gfservice.gfwebservice.GfWebStorage;
/**
* This class wraps GF features of a particular GF grammar.
*
* TODO: move ACE-specific stuff out of this class
*
* @author Kaarel Kaljurand
*/
public class GfGrammar {
// Limit that is passed to the GF service with every parse request (see parse()).
// TODO: let the user configure the size of the ambiguity
public final static int GF_PARSE_LIMIT = 10;
// Maximum number of producer-only functions for which the token cache is still built;
// read from the ontology parameter "linearize_all_query_limit" in the constructor.
private final int LINEARIZE_ALL_QUERY_LIMIT;
private final Logger mLogger = LoggerFactory.getLogger(GfGrammar.class);
// Some naming conventions
public final static String PREFIX_DISAMB = "Disamb";
public final static String SUFFIX_APE = "Ape";
public final static String EXTENSION_GF = ".gf";
public final static String EXTENSION_GFO = ".gfo";
// Flag that is passed to every GfStorage.update call.
// Note that true can remove (always removes?) lins
// which are not available in all the concretes,
// i.e. if you add a lin then you need to add it to all the concretes,
// otherwise you cannot use it in a sentence.
private final static boolean OPTIMIZE_PGF = true;
// 1-based position of the logical symbol among the fields of an Ape-linearization
private final static int GF_APE_FIELD_LOGICAL_SYMBOL = 3;
// Separators of tokens within a linearization, of trees within a serialized tree list,
// of fields within an Ape-linearization, and of the components of a serialized wiki entry.
private final static char GF_TOKEN_SEPARATOR = ' ';
private final static char GF_TREE_SEPARATOR = '|';
private final static char GF_APE_SEPARATOR = '|';
private final static String GF_SERIALIZATION_SEPARATOR = "||";
// Joiners and splitters built on the separators above.
// The serialization joiner writes null components as empty strings;
// the tree splitter drops empty strings.
public final static Joiner GF_TREE_JOINER = Joiner.on(GF_TREE_SEPARATOR);
public final static Joiner GF_SERIALIZATION_JOINER = Joiner.on(GF_SERIALIZATION_SEPARATOR).useForNull("");
public final static Joiner GF_TOKEN_JOINER = Joiner.on(GF_TOKEN_SEPARATOR);
public final static Splitter GF_TREE_SPLITTER = Splitter.on(GF_TREE_SEPARATOR).omitEmptyStrings();
public final static Splitter GF_APE_SPLITTER = Splitter.on(GF_APE_SEPARATOR);
public final static Splitter GF_SERIALIZATION_SPLITTER = Splitter.on(GF_SERIALIZATION_SEPARATOR);
public final static Splitter GF_TOKEN_SPLITTER = Splitter.on(GF_TOKEN_SEPARATOR);
// Clients for the GF service (parsing, linearization, ...) and the GF storage (grammar files)
private final GfService mGfService;
private final GfStorage mGfStorage;
// Start category (ontology parameter "start_cat"); can be null
private final String mCat;
// Directory derived from the ontology parameter "pgf_name" by getDir(); can be null
private final String mDir;
// Results of the latest successful grammar() and browseAll() service calls;
// both stay null until refreshGrammarInfo() has succeeded.
private GfServiceResultGrammar mGfServiceResultGrammar;
private GfServiceResultBrowseAll mGfServiceResultBrowseAll;
// Caches filled by refreshLangToTokenToCats():
// language -> token -> categories
private final Map<String, Multimap<String, String>> langToTokenToCats = Maps.newHashMap();
// language -> logical symbol -> first linearization
private final Map<String, Map<String, String>> langToIriToToken = Maps.newHashMap();
// category -> number of its producer functions that are not consumer functions
// TODO: could use a Multiset instead but there does not seem to be a
// short way to get out k-largest elements.
private final Map<String, Integer> mCatToSize = Maps.newHashMap();
/**
 * Creates a grammar wrapper that is configured by the parameters of the given ontology.
 *
 * <p>The following parameters are read: {@code service_uri}, {@code pgf_name},
 * {@code start_cat} (can be {@code null}, in this case the default start category is used)
 * and {@code linearize_all_query_limit}.</p>
 *
 * <p>Loading the grammar information and building the token caches is best-effort:
 * if the GF service reports an error then the error is logged and the object is still
 * created, possibly with missing grammar information and incomplete caches.</p>
 *
 * @param ontology ontology whose parameters describe the GF service and the grammar
 * @throws RuntimeException if the value of {@code service_uri} is not a well-formed URI
 */
public GfGrammar(Ontology ontology) {
    URI serviceUri;
    try {
        serviceUri = new URI(ontology.getParameter("service_uri"));
    } catch (URISyntaxException e) {
        throw new RuntimeException(e);
    }
    String pgfName = ontology.getParameter("pgf_name");
    mGfService = new GfWebService(serviceUri, pgfName);
    mGfStorage = new GfWebStorage(serviceUri);
    // Note: start_cat can be null, in this case the default start category is used
    mCat = ontology.getParameter("start_cat");
    mDir = getDir(pgfName);
    LINEARIZE_ALL_QUERY_LIMIT = ontology.getParameterAsInt("linearize_all_query_limit");
    try {
        refreshGrammarInfo();
        refreshLangToTokenToCats();
    } catch (GfServiceException e) {
        // Deliberately not rethrown: the wiki can start without the grammar information.
        // Report the failure via the logger instead of dumping the stack trace to stderr.
        mLogger.error("Failed to load the grammar information of " + pgfName, e);
    }
}
/**
 * @return the grammar description obtained by the latest successful refresh,
 * or {@code null} if no refresh has succeeded so far
 */
public GfServiceResultGrammar getGrammar() {
return mGfServiceResultGrammar;
}
/**
 * Lists the concrete languages of the grammar.
 *
 * @return names of the concrete languages defined in the grammar,
 * or the empty set if no grammar information has been loaded
 */
public Set<String> getLanguages() {
    return (mGfServiceResultGrammar != null)
            ? mGfServiceResultGrammar.getLanguages().keySet()
            : Collections.<String>emptySet();
}
/**
 * Looks up the locales of the given concrete language.
 *
 * @param lang name of a concrete language of the grammar
 * @return set of locales defined for the given language in the grammar; the empty set
 * (never {@code null}) if no grammar information has been loaded or if the grammar
 * does not list the given language
 */
public Set<String> getLocales(String lang) {
    if (mGfServiceResultGrammar == null) {
        return Collections.emptySet();
    }
    Set<String> locales = mGfServiceResultGrammar.getLanguages().get(lang);
    // An unknown language is treated like a missing grammar: no locales, but not null.
    if (locales == null) {
        return Collections.emptySet();
    }
    return locales;
}
/**
 * Checks whether the grammar has a concrete language whose name is the grammar name
 * followed by {@code SUFFIX_APE}.
 *
 * @return {@code true} iff the grammar contains such a concrete language;
 * {@code false} also if no grammar information has been loaded
 */
public boolean isAceCompatible() {
    // Without grammar information there is no grammar name to build the language name from.
    if (mGfServiceResultGrammar == null) {
        return false;
    }
    return getLanguages().contains(mGfServiceResultGrammar.getName() + SUFFIX_APE);
}
/**
 * Parses the given text in the given language, using the configured start category
 * and {@code GF_PARSE_LIMIT} as the limit.
 *
 * @param text text to be parsed
 * @param language name of the concrete language of the text
 * @return trees that the service returned for the given language
 * @throws GfServiceException if the service call fails
 */
public Set<String> parse(String text, String language) throws GfServiceException {
    return mGfService.parse(mCat, text, language, GF_PARSE_LIMIT).getTrees(language);
}
/**
 * Requests a single random tree from the service.
 *
 * @return the first of the returned trees
 * @throws GfServiceException if the service call fails
 * @throws java.util.NoSuchElementException if the service returned no tree
 */
public String random() throws GfServiceException {
    List<String> trees = random(1);
    return trees.iterator().next();
}
/**
 * Requests random trees of the configured start category from the service.
 *
 * @param limit limit that is passed on to the service
 * @return trees returned by the service
 * @throws GfServiceException if the service call fails
 */
public List<String> random(int limit) throws GfServiceException {
    return mGfService.random(mCat, limit).getTrees();
}
/**
 * Turns a GF wiki entry into a single string of the form
 *
 * <pre>
 * lang||text||tree1|tree2|...|treeN
 * </pre>
 *
 * where the components are:
 * - lang: the language (e.g. GeographyEng)
 * - text: the sentence as a string (e.g. "Germany is a country .")
 * - tree1 ... treeN: the corresponding trees
 *
 * A {@code null} language or text is written as the empty string.
 *
 * Storing the sentence next to the trees makes the format more robust,
 * e.g. if a tree cannot be linearized anymore because the grammar was
 * refactored then one could try to parse the sentence instead. Also, the
 * sentence could be shown if the tree has multiple variant lins.
 */
public static String serialize(GfWikiEntry entry) {
    String joinedTrees = GF_TREE_JOINER.join(entry.getTrees().getTrees());
    return GF_SERIALIZATION_JOINER.join(entry.getLanguage(), entry.getText(), joinedTrees);
}
/**
 * Rebuilds a GF wiki entry from its serialized form. Two forms are accepted:
 * the current form with three {@code ||}-separated components (language, text, trees)
 * and the deprecated form that consists of the {@code |}-separated trees only.
 *
 * @param serialized serialized wiki entry
 * @return the deserialized wiki entry
 * @throws RuntimeException if the input has neither one nor three components
 */
public static GfWikiEntry deserialize(String serialized) {
    List<String> components = ImmutableList.copyOf(GF_SERIALIZATION_SPLITTER.split(serialized));
    switch (components.size()) {
        case 1:
            // deprecated form, containing just the trees
            return new GfWikiEntry(new TreeList(GF_TREE_SPLITTER.split(serialized)));
        case 3:
            return new GfWikiEntry(
                    components.get(0),
                    components.get(1),
                    new TreeList(GF_TREE_SPLITTER.split(components.get(2))));
        default:
            throw new RuntimeException("Syntax error: " + serialized);
    }
}
/**
 * Linearizes the given tree in the given language.
 *
 * @param tree tree to be linearized
 * @param language name of the concrete language
 * @return texts that the service returned for the given language
 * @throws GfServiceException if the service call fails
 */
public Set<String> linearize(String tree, String language) throws GfServiceException {
    return mGfService.linearize(tree, language).getTexts(language);
}
/**
 * Linearizes the given tree without naming a language
 * ({@code null} is passed to the service as the language).
 *
 * @param tree tree to be linearized
 * @return texts returned by the service, as a map whose keys are strings
 * (presumably the language names -- confirm against GfServiceResultLinearize)
 * @throws GfServiceException if the service call fails
 */
public Map<String, Set<String>> linearize(String tree) throws GfServiceException {
    return mGfService.linearize(tree, null).getTexts();
}
/**
 * Completes the given tokens using the configured start category.
 *
 * @param tokens tokens of the input
 * @param language language of the input tokens
 * @return set of possible completions
 * @throws GfServiceException if the service call fails
 */
public Set<String> complete(List<String> tokens, String language) throws GfServiceException {
return complete(mCat, tokens, language);
}
/**
 * <p>Asks the service for the completions of the given tokens.
 * The tokens are joined with spaces and followed by a trailing space before they
 * are sent to the service; an empty token list is sent as the empty string.</p>
 *
 * <p>Note: earlier versions passed 15 as the last argument of the service call in order
 * to obtain multi-token completions for unambiguous continuations. This seemed to be
 * buggy in some cases, so {@code null} is passed now.</p>
 *
 * @param cat start category for the parser
 * @param tokens tokens of the input
 * @param language language of the input tokens
 * @return set of completions that the service returned for the given language
 * @throws GfServiceException if the service call fails
 */
public Set<String> complete(String cat, List<String> tokens, String language) throws GfServiceException {
    String input = getCompletionInput(tokens);
    // The last argument used to be 15, see the note in the Javadoc.
    return mGfService.complete(cat, input, language, null).getCompletions(language);
}
/**
 * Sends the given tree to the {@code abstrtree} call of the service.
 *
 * @param tree tree to be processed
 * @return data URI of the result
 * @throws GfServiceException if the service call fails
 */
public String abstrtree(String tree) throws GfServiceException {
return mGfService.abstrtree(tree).getDataUri();
}
/**
 * Sends the given tree to the {@code parsetree} call of the service.
 *
 * @param tree tree to be processed
 * @param from passed on to the service (presumably the name of a concrete language -- confirm)
 * @return data URI of the result
 * @throws GfServiceException if the service call fails
 */
public String parsetree(String tree, String from) throws GfServiceException {
return mGfService.parsetree(tree, from).getDataUri();
}
/**
 * Sends the given tree to the {@code alignment} call of the service.
 *
 * @param tree tree to be processed
 * @return data URI of the result
 * @throws GfServiceException if the service call fails
 */
public String alignment(String tree) throws GfServiceException {
return mGfService.alignment(tree).getDataUri();
}
/**
 * Looks up the producers of the given category in the cached browse-all result.
 * Throws a NullPointerException if no browse-all result has been loaded yet.
 *
 * @param cat name of a category
 * @return producers of the category (functions, judging by how the result is used in this class)
 */
public Set<String> getProducers(String cat) {
return mGfServiceResultBrowseAll.getProducers(cat);
}
/**
 * Looks up the consumers of the given category in the cached browse-all result.
 * Throws a NullPointerException if no browse-all result has been loaded yet.
 *
 * @param cat name of a category
 * @return consumers of the category (functions, judging by how the result is used in this class)
 */
public Set<String> getConsumers(String cat) {
return mGfServiceResultBrowseAll.getConsumers(cat);
}
/**
 * Looks up the name of the given category for the given language in the cached
 * browse-all result.
 * Throws a NullPointerException if no browse-all result has been loaded yet.
 *
 * @param cat name of a category
 * @param language name of a language
 * @return category name as reported by the browse-all result
 */
public String getCategoryName(String cat, String language) {
return mGfServiceResultBrowseAll.getCategoryName(cat, language);
}
/**
 * <p>Returns the {@code k} largest categories, largest first.
 * The size of a category is the number of its producer functions that are
 * not consumer functions, as counted when the token cache was last built.</p>
 *
 * @param k maximum number of categories to return
 * @return at most {@code k} category names
 */
public List<String> getLargestCategories(int k) {
    Ordering<String> bySize = Ordering.<Integer>natural().onResultOf(Functions.forMap(mCatToSize));
    return bySize.greatestOf(mCatToSize.keySet(), k);
}
/**
 * @param language name of a concrete language
 * @return the cached mapping from tokens to categories for the given language (the internal
 * object, not a copy), or {@code null} if nothing is cached for this language
 */
public Multimap<String, String> getTokenToCats(String language) {
return langToTokenToCats.get(language);
}
/**
 * @param language name of a concrete language
 * @return the cached mapping from logical symbols to tokens for the given language (the
 * internal object, not a copy), or {@code null} if nothing is cached for this language
 */
public Map<String, String> getIriToToken(String language) {
return langToIriToToken.get(language);
}
/**
 * Lets the GF storage parse the given GF module.
 *
 * @param gfModule module to be parsed
 * @return parse result of the storage
 * @throws GfServiceException if the storage call fails
 */
public GfParseResult parseGfModule(GfModule gfModule) throws GfServiceException {
return mGfStorage.parse(gfModule);
}
/**
 * Uploads the given GF module to the server, into the directory of this grammar.
 *
 * @param module module to be uploaded
 * @throws GfServiceException if the storage call fails
 */
public void upload(GfModule module) throws GfServiceException {
mGfStorage.upload(mDir, module);
}
/**
 * Lists the files in the directory of this grammar.
 *
 * @param extension extension that is passed on to the storage, e.g. {@code EXTENSION_GF}
 * @return filenames returned by the storage
 * @throws GfServiceException if the storage call fails
 */
public Set<String> ls(String extension) throws GfServiceException {
    return mGfStorage.ls(mDir, extension).getFilenames();
}
/**
 * Removes the given path from the directory of this grammar.
 *
 * @param path path to be removed
 * @throws GfServiceException if the storage call fails
 */
public void rm(String path) throws GfServiceException {
mGfStorage.rm(mDir, path);
}
/**
 * Removes all the files that are listed for the extension {@code EXTENSION_GFO}
 * from the directory of this grammar.
 *
 * @return number of removed files
 * @throws GfServiceException if listing or removing fails
 */
public int rmGfo() throws GfServiceException {
    Set<String> gfoFiles = ls(EXTENSION_GFO);
    for (String gfoFile : gfoFiles) {
        mGfStorage.rm(mDir, gfoFile);
    }
    return gfoFiles.size();
}
/**
 * Downloads a file from the directory of this grammar.
 *
 * @param filename name of the file
 * @return content of the file as a string
 * @throws GfServiceException if the storage call fails
 */
public String downloadAsString(String filename) throws GfServiceException {
return mGfStorage.downloadAsString(mDir, filename);
}
/**
 * Integrates the given GF module into the grammar. The module is either a new
 * component of the grammar or an existing one that has been modified.
 *
 * <p>A toplevel concrete syntax module is handed to the storage update together with
 * the currently known languages. Any other module is uploaded first, and then the
 * known languages are recompiled. After a successful update the grammar information
 * and the token caches are refreshed.</p>
 *
 * @param gfModule new or modified grammar module
 * @return result of the storage update
 * @throws GfServiceException if a storage or service call fails
 */
public GfStorageResult integrateGfModule(GfModule gfModule) throws GfServiceException {
    final Set<String> knownLanguages = getLanguages();
    final GfStorageResult outcome;
    if (!isToplevelModule(gfModule, knownLanguages)) {
        // Not a toplevel module: store it and recompile the existing concrete modules.
        mGfStorage.upload(mDir, gfModule);
        outcome = mGfStorage.update(mDir, mCat, OPTIMIZE_PGF, knownLanguages);
    } else {
        // Toplevel concrete module: update it in the context of the other concrete modules.
        outcome = mGfStorage.update(mDir, mCat, OPTIMIZE_PGF, knownLanguages, gfModule);
    }
    if (outcome == null || !outcome.isSuccess()) {
        return outcome;
    }
    refreshGrammarInfo();
    refreshLangToTokenToCats();
    return outcome;
}
/**
 * Recompiles the grammar for the currently known languages. After a successful
 * update the grammar information and the token caches are refreshed.
 *
 * @return result of the storage update
 * @throws GfServiceException if a storage or service call fails
 */
public GfStorageResult update() throws GfServiceException {
    GfStorageResult outcome = mGfStorage.update(mDir, mCat, OPTIMIZE_PGF, getLanguages());
    boolean recompiled = (outcome != null) && outcome.isSuccess();
    if (recompiled) {
        refreshGrammarInfo();
        refreshLangToTokenToCats();
    }
    return outcome;
}
/**
 * @return {@code true} iff a grammar directory could be derived from the name of the PGF,
 * i.e. iff the storage operations of this class have a directory to work on
 */
public boolean isGrammarEditable() {
    return mDir != null;
}
/**
 * Decides if the module is a toplevel module, i.e. a concrete syntax module
 * which no other module imports. This is the case if the module is one of the
 * given languages, or if its name has the form {@code GrammarLan} or
 * {@code DisambGrammarLan}: the grammar name (optionally prefixed by
 * {@code PREFIX_DISAMB}), followed by at least 3 characters the first of which
 * is an upper case letter. The check by name also covers modules which were added
 * after the wiki was started up, which {@code languages.contains(gfModule.getName())}
 * alone does not.
 */
private boolean isToplevelModule(GfModule gfModule, Set<String> languages) {
    String name = gfModule.getName();
    if (languages.contains(name)) {
        return true;
    }
    if (mGfServiceResultGrammar == null) {
        return false;
    }
    String grammarName = mGfServiceResultGrammar.getName();
    return hasLanguageSuffix(name, grammarName)
            || hasLanguageSuffix(name, PREFIX_DISAMB + grammarName);
}

/**
 * True if the name consists of the given stem followed by at least 3 characters,
 * the first of which is an upper case letter.
 */
private static boolean hasLanguageSuffix(String name, String stem) {
    int stemLength = stem.length();
    return name.startsWith(stem)
            && name.length() >= stemLength + 3
            && Character.isUpperCase(name.charAt(stemLength));
}
/**
 * Extracts the directory part of a name of the form {@code /tmp/<dir>/<file>}.
 * TODO: we assume that editable directories have a certain form
 *
 * @param str name to be matched, must not be {@code null}
 * @return everything before the last slash if the whole name matches
 * {@code (/tmp/.+)/.+}, otherwise {@code null}
 */
private static String getDir(String str) {
    Matcher dirMatcher = Pattern.compile("(/tmp/.+)/.+").matcher(str);
    return dirMatcher.matches() ? dirMatcher.group(1) : null;
}
/**
 * Builds the input string of a completion request: the tokens joined by the token
 * separator and followed by one more separator. No tokens give the empty string.
 */
private static String getCompletionInput(List<String> tokens) {
    return tokens.isEmpty() ? "" : GF_TOKEN_JOINER.join(tokens) + GF_TOKEN_SEPARATOR;
}
/**
 * Fetches the grammar description and the browse-all information from the service
 * and stores them in the respective fields.
 * If the second call fails then the first field has already been replaced.
 */
private void refreshGrammarInfo() throws GfServiceException {
mGfServiceResultGrammar = mGfService.grammar();
mGfServiceResultBrowseAll = mGfService.browseAll();
}
/**
 * <p>Rebuilds the structures from which one can look up the categories of tokens,
 * the tokens of logical symbols, and the sizes of the categories.</p>
 *
 * <pre>
 * language -> token -> categories
 * language -> logical symbol -> token
 * category -> number of producer functions that are not consumer functions
 * </pre>
 *
 * <p>Nothing is changed if the number of functions that are not consumer functions
 * exceeds {@code LINEARIZE_ALL_QUERY_LIMIT}, because every such function costs
 * one linearize-all query.</p>
 */
private void refreshLangToTokenToCats() throws GfServiceException {
    Set<String> categories = mGfServiceResultBrowseAll.getCategories();

    // Collect together all the consumer functions.
    // TODO We are not interested in their linearizations, at least for the time being.
    Set<String> consumerFuns = Sets.newHashSet();
    for (String category : categories) {
        consumerFuns.addAll(getConsumers(category));
    }

    int countAllFuns = mGfServiceResultGrammar.getFunctions().size();
    int countIgnoreFuns = consumerFuns.size();
    mLogger.info("All funs: {}, (ignored) consumer funs: {}", countAllFuns, countIgnoreFuns);
    if (countAllFuns - countIgnoreFuns > LINEARIZE_ALL_QUERY_LIMIT) {
        mLogger.warn("Refusing to build preditor cache, as there are too many producer-only funs. " +
                "Increase LINEARIZE_ALL_QUERY_LIMIT if its current value {} is too low.", LINEARIZE_ALL_QUERY_LIMIT);
        return;
    }

    langToTokenToCats.clear();
    mCatToSize.clear();
    langToIriToToken.clear();

    // The logical symbols are taken from the linearizations of this language.
    String apeLanguage = mGfServiceResultGrammar.getName() + SUFFIX_APE;

    for (String category : categories) {
        mCatToSize.put(category, 0);
        for (String fun : getProducers(category)) {
            // Producers that are also consumers are neither counted nor indexed.
            if (consumerFuns.contains(fun)) {
                continue;
            }
            mCatToSize.put(category, mCatToSize.get(category) + 1);

            // Get all the linearizations of the function in all the languages.
            // This includes all the wordforms and variants, because the linearization
            // is likely to be a complex record that holds many strings.
            Map<String, List<String>> langToLins = mGfService.linearizeAll(fun, null).getTexts();
            String logicalSymbol = extractLogicalSymbolFromApe(langToLins.get(apeLanguage));

            for (Entry<String, List<String>> langAndLins : langToLins.entrySet()) {
                addTokenCategories(langAndLins.getKey(), langAndLins.getValue(), category);
                if (logicalSymbol != null) {
                    addIriToken(langAndLins.getKey(), langAndLins.getValue(), logicalSymbol);
                }
            }
        }
    }
}

/**
 * Stores the given category under the index token of each of the given linearizations,
 * in the token table of the given language (which is created if needed).
 */
private void addTokenCategories(String lang, List<String> lins, String cat) {
    Multimap<String, String> tokenToCats = langToTokenToCats.get(lang);
    if (tokenToCats == null) {
        tokenToCats = HashMultimap.create();
        langToTokenToCats.put(lang, tokenToCats);
    }
    // The linearization is represented by its "most important" token.
    for (String lin : lins) {
        String indexToken = getIndexToken(lin);
        if (indexToken != null) {
            tokenToCats.put(indexToken, cat);
        }
    }
}

/**
 * Maps the given logical symbol to the first of the given linearizations (if there is one),
 * in the symbol table of the given language (which is created if needed).
 */
private void addIriToken(String lang, List<String> lins, String logicalSymbol) {
    Map<String, String> iriToToken = langToIriToToken.get(lang);
    if (iriToToken == null) {
        iriToToken = Maps.newHashMap();
        langToIriToToken.put(lang, iriToToken);
    }
    // TODO: We assume that the dictionary form is always the first.
    // Of course, this does not always hold.
    // Unfortunately, LinearizeAll cannot be used to obtain a GF record,
    // with all the category labels of the strings, but just a list of plain strings.
    if (!lins.isEmpty()) {
        iriToToken.put(logicalSymbol, lins.get(0));
    }
}
/**
 * Picks the token under which a linearization is indexed.
 * It does not make sense to index a whole linearization if it contains multiple
 * tokens, as it could not be matched during (single token) lookahead editing.
 * If there are multiple tokens in the given linearization, e.g.
 * the + Atlantic_Ocean, des + Atlantischen_Ozeans, Atlandi_Ookean + &+ + il;
 * then the longest token is picked (the last one in case there are several).
 * TODO: this is a hack while we're waiting for a cleaner solution.
 *
 * @param lin linearization, tokens separated by spaces
 * @return the longest token, or {@code null} if the linearization contains no tokens
 */
private static String getIndexToken(String lin) {
    String longest = null;
    for (String token : GF_TOKEN_SPLITTER.omitEmptyStrings().split(lin)) {
        if (longest == null || token.length() >= longest.length()) {
            longest = token;
        }
    }
    return longest;
}
/**
 * <p>Extracts the logical symbol (which is used by APE as the
 * OWL entity IRI) from the Ape-linearizations of a function, assuming
 * that the function is a lexical function.</p>
 *
 * <p>We assume that the Ape-linearizations have the form
 * {@code The_Hague|pn_sg|The_Hague_PN|neutr}, where the logical symbol
 * is always in the same field and is always the same in case there are
 * several linearizations. Therefore only the first linearization is looked at.</p>
 *
 * @param lins Ape-linearizations of a function, can be {@code null}
 * @return the field at position {@code GF_APE_FIELD_LOGICAL_SYMBOL} of the first
 * linearization, or {@code null} if there are no linearizations or too few fields
 */
private static String extractLogicalSymbolFromApe(List<String> lins) {
    if (lins == null || lins.isEmpty()) {
        return null;
    }
    List<String> fields = ImmutableList.copyOf(GF_APE_SPLITTER.split(lins.get(0)));
    // The field position is 1-based.
    return fields.size() >= GF_APE_FIELD_LOGICAL_SYMBOL
            ? fields.get(GF_APE_FIELD_LOGICAL_SYMBOL - 1)
            : null;
}
}