// This file is part of AceWiki.
// Copyright 2008-2013, AceWiki developers.
//
// AceWiki is free software: you can redistribute it and/or modify it under the terms of the GNU
// Lesser General Public License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// AceWiki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
// even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License along with AceWiki. If
// not, see http://www.gnu.org/licenses/.

package ch.uzh.ifi.attempto.acewiki.gf;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Functions;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;

import ch.uzh.ifi.attempto.acewiki.core.Ontology;
import ch.uzh.ifi.attempto.gfservice.GfModule;
import ch.uzh.ifi.attempto.gfservice.GfParseResult;
import ch.uzh.ifi.attempto.gfservice.GfService;
import ch.uzh.ifi.attempto.gfservice.GfServiceException;
import ch.uzh.ifi.attempto.gfservice.GfServiceResultBrowseAll;
import ch.uzh.ifi.attempto.gfservice.GfServiceResultComplete;
import ch.uzh.ifi.attempto.gfservice.GfServiceResultGrammar;
import ch.uzh.ifi.attempto.gfservice.GfServiceResultLinearize;
import ch.uzh.ifi.attempto.gfservice.GfServiceResultLinearizeAll;
import ch.uzh.ifi.attempto.gfservice.GfServiceResultParse;
import ch.uzh.ifi.attempto.gfservice.GfServiceResultRandom;
import ch.uzh.ifi.attempto.gfservice.GfStorage;
import ch.uzh.ifi.attempto.gfservice.GfStorageResult;
import ch.uzh.ifi.attempto.gfservice.GfStorageResultLs;
import ch.uzh.ifi.attempto.gfservice.gfwebservice.GfWebService;
import ch.uzh.ifi.attempto.gfservice.gfwebservice.GfWebStorage;

/**
 * This class wraps GF features of a particular GF grammar.
 *
 * TODO: move ACE-specific stuff out of this class
 *
 * @author Kaarel Kaljurand
 */
public class GfGrammar {

	// Maximal number of parse trees requested from the parser for a single input.
	// TODO: let the user configure the size of the ambiguity
	public final static int GF_PARSE_LIMIT = 10;

	// Upper bound on the number of producer-only functions for which the
	// lookahead (preditor) cache is built; read from the wiki configuration
	// in the constructor.
	private final int LINEARIZE_ALL_QUERY_LIMIT;

	private final Logger mLogger = LoggerFactory.getLogger(GfGrammar.class);

	// Some naming conventions
	public final static String PREFIX_DISAMB = "Disamb";
	public final static String SUFFIX_APE = "Ape";
	public final static String EXTENSION_GF = ".gf";
	public final static String EXTENSION_GFO = ".gfo";

	// Note that true can remove (always removes?) lins
	// which are not available in all the concretes,
	// i.e. if you add a lin then you need to add it to all the concretes
	// otherwise you cannot use it in a sentence.
private final static boolean OPTIMIZE_PGF = true; private final static int GF_APE_FIELD_LOGICAL_SYMBOL = 3; private final static char GF_TOKEN_SEPARATOR = ' '; private final static char GF_TREE_SEPARATOR = '|'; private final static char GF_APE_SEPARATOR = '|'; private final static String GF_SERIALIZATION_SEPARATOR = "||"; public final static Joiner GF_TREE_JOINER = Joiner.on(GF_TREE_SEPARATOR); public final static Joiner GF_SERIALIZATION_JOINER = Joiner.on(GF_SERIALIZATION_SEPARATOR).useForNull(""); public final static Joiner GF_TOKEN_JOINER = Joiner.on(GF_TOKEN_SEPARATOR); public final static Splitter GF_TREE_SPLITTER = Splitter.on(GF_TREE_SEPARATOR).omitEmptyStrings(); public final static Splitter GF_APE_SPLITTER = Splitter.on(GF_APE_SEPARATOR); public final static Splitter GF_SERIALIZATION_SPLITTER = Splitter.on(GF_SERIALIZATION_SEPARATOR); public final static Splitter GF_TOKEN_SPLITTER = Splitter.on(GF_TOKEN_SEPARATOR); private final GfService mGfService; private final GfStorage mGfStorage; private final String mCat; private final String mDir; private GfServiceResultGrammar mGfServiceResultGrammar; private GfServiceResultBrowseAll mGfServiceResultBrowseAll; private final Map<String, Multimap<String, String>> langToTokenToCats = Maps.newHashMap(); private final Map<String, Map<String, String>> langToIriToToken = Maps.newHashMap(); // TODO: could use a Multiset instead but there does not seem to be a // short way to get out k-largest elements. 
private final Map<String, Integer> mCatToSize = Maps.newHashMap(); public GfGrammar(Ontology ontology) { URI serviceUri; try { serviceUri = new URI(ontology.getParameter("service_uri")); } catch (URISyntaxException e) { throw new RuntimeException(e); } String pgfName = ontology.getParameter("pgf_name"); mGfService = new GfWebService(serviceUri, pgfName); mGfStorage = new GfWebStorage(serviceUri); // Note: start_cat can be null, in this case the default start category is used mCat = ontology.getParameter("start_cat"); mDir = getDir(pgfName); LINEARIZE_ALL_QUERY_LIMIT = ontology.getParameterAsInt("linearize_all_query_limit"); try { refreshGrammarInfo(); refreshLangToTokenToCats(); } catch (GfServiceException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public GfServiceResultGrammar getGrammar() { return mGfServiceResultGrammar; } /** * @return set of names of the concrete languages defined in the grammar */ public Set<String> getLanguages() { if (mGfServiceResultGrammar == null) { return Collections.emptySet(); } return mGfServiceResultGrammar.getLanguages().keySet(); } /** * @return set of locales defined for the given language in the grammar */ public Set<String> getLocales(String lang) { if (mGfServiceResultGrammar == null) { return Collections.emptySet(); } return mGfServiceResultGrammar.getLanguages().get(lang); } /** * @return {@code true} iff the given grammar contains a concrete language with suffix SUFFIX_APE */ public boolean isAceCompatible() { return getLanguages().contains(mGfServiceResultGrammar.getName() + SUFFIX_APE); } /** * Parses the given text in the given language. * * @param text The text. * @param language The language. * @return The parse result. 
* @throws GfServiceException */ public Set<String> parse(String text, String language) throws GfServiceException { GfServiceResultParse result = mGfService.parse(mCat, text, language, GF_PARSE_LIMIT); return result.getTrees(language); } public String random() throws GfServiceException { return random(1).iterator().next(); } public List<String> random(int limit) throws GfServiceException { GfServiceResultRandom result = mGfService.random(mCat, limit); return result.getTrees(); } /** * Serializes the GF wiki entry, given as 3 components: * - language (e.g. GeographyEng) * - sentence as string (e.g. "Germany is a country .") * - set of corresponding trees * * The format is: * * lang||text||tree1|tree2|...|treeN * * This is more robust, e.g. if the tree cannot be linearized anymore * because grammar was refactored then we could try to parse the * sentence. Also the sentence could be shown if the tree * has multiple variant lins. */ public static String serialize(GfWikiEntry entry) { return GF_SERIALIZATION_JOINER.join( entry.getLanguage(), entry.getText(), GF_TREE_JOINER.join(entry.getTrees().getTrees())); } /** * Deserializes a GF wiki entry. 
*/ public static GfWikiEntry deserialize(String serialized) { List<String> splitsAsList = ImmutableList.copyOf(GF_SERIALIZATION_SPLITTER.split(serialized)); if (splitsAsList.size() == 1) { // deprecated form, containing just the trees return new GfWikiEntry(new TreeList(GF_TREE_SPLITTER.split(serialized))); } else if (splitsAsList.size() == 3) { Iterable<String> trees = GF_TREE_SPLITTER.split(splitsAsList.get(2)); return new GfWikiEntry( splitsAsList.get(0), splitsAsList.get(1), new TreeList(trees)); } throw new RuntimeException("Syntax error: " + serialized); } public Set<String> linearize(String tree, String language) throws GfServiceException { GfServiceResultLinearize result = mGfService.linearize(tree, language); return result.getTexts(language); } public Map<String, Set<String>> linearize(String tree) throws GfServiceException { GfServiceResultLinearize result = mGfService.linearize(tree, null); return result.getTexts(); } public Set<String> complete(List<String> tokens, String language) throws GfServiceException { return complete(mCat, tokens, language); } /** * <p>This method tries to return a set that contains more than one element, i.e. * if there is only one (unambiguous) completion then "complete" is automatically * called again. In this case the result set contains multi-token completions. * There is a limit of 15 tokens to each completion.</p> * * @param cat start category for the parser * @param tokens list of tokens the last of which is to be completed * @param language language of the input tokens * @return list of possible completions * @throws GfServiceException */ public Set<String> complete(String cat, List<String> tokens, String language) throws GfServiceException { // Remove the last argument if this behavior turns out to be confusing // Removed it (was 15), it seemed to be buggy in some cases. 
GfServiceResultComplete result = mGfService.complete(cat, getCompletionInput(tokens), language, null); return result.getCompletions(language); } public String abstrtree(String tree) throws GfServiceException { return mGfService.abstrtree(tree).getDataUri(); } public String parsetree(String tree, String from) throws GfServiceException { return mGfService.parsetree(tree, from).getDataUri(); } public String alignment(String tree) throws GfServiceException { return mGfService.alignment(tree).getDataUri(); } public Set<String> getProducers(String cat) { return mGfServiceResultBrowseAll.getProducers(cat); } public Set<String> getConsumers(String cat) { return mGfServiceResultBrowseAll.getConsumers(cat); } public String getCategoryName(String cat, String language) { return mGfServiceResultBrowseAll.getCategoryName(cat, language); } /** * <p>Returns the {@code k} largest categories in the order of size. * The size is in terms of the number of producer functions that are * not consumer functions.</p> */ public List<String> getLargestCategories(int k) { return Ordering.natural().onResultOf(Functions.forMap(mCatToSize)).greatestOf(mCatToSize.keySet(), k); } public Multimap<String, String> getTokenToCats(String language) { return langToTokenToCats.get(language); } public Map<String, String> getIriToToken(String language) { return langToIriToToken.get(language); } public GfParseResult parseGfModule(GfModule gfModule) throws GfServiceException { return mGfStorage.parse(gfModule); } /** * Uploads the given GF module to the server. 
*/ public void upload(GfModule module) throws GfServiceException { mGfStorage.upload(mDir, module); } public Set<String> ls(String extension) throws GfServiceException { GfStorageResultLs result = mGfStorage.ls(mDir, extension); return result.getFilenames(); } public void rm(String path) throws GfServiceException { mGfStorage.rm(mDir, path); } public int rmGfo() throws GfServiceException { int count = 0; for (String path : ls(EXTENSION_GFO)) { mGfStorage.rm(mDir, path); count++; } return count; } public String downloadAsString(String filename) throws GfServiceException { return mGfStorage.downloadAsString(mDir, filename); } /** * Updates the grammar based on the given GF module, which is either * a new component of the grammar or which has undergone modifications * and needs to be reintegrated. * * @param gfModule new or modified grammar module * @return GfStorageResult * @throws GfServiceException */ public GfStorageResult integrateGfModule(GfModule gfModule) throws GfServiceException { Set<String> languages = getLanguages(); GfStorageResult result = null; if (isToplevelModule(gfModule, languages)) { // If the module is a (toplevel) concrete syntax module then // update it in the context of other concrete modules. result = mGfStorage.update(mDir, mCat, OPTIMIZE_PGF, languages, gfModule); } else { // Otherwise just upload it and recompile the existing concrete modules. mGfStorage.upload(mDir, gfModule); result = mGfStorage.update(mDir, mCat, OPTIMIZE_PGF, languages); } if (result != null && result.isSuccess()) { refreshGrammarInfo(); refreshLangToTokenToCats(); } return result; } /** * Recompiles the grammar. */ public GfStorageResult update() throws GfServiceException { Set<String> languages = getLanguages(); GfStorageResult result = mGfStorage.update(mDir, mCat, OPTIMIZE_PGF, languages); if (result != null && result.isSuccess()) { refreshGrammarInfo(); refreshLangToTokenToCats(); } return result; } public boolean isGrammarEditable() { return ! 
(mDir == null); } /** * True if the module is a concrete syntax module which no other * module imports. We check if the module name has the form * {@code GrammarLan}. This covers also modules * which were added after the wiki was started up. The previous * technique {@code languages.contains(gfModule.getName())} did not * cover the new modules. */ private boolean isToplevelModule(GfModule gfModule, Set<String> languages) { String moduleName = gfModule.getName(); if (languages.contains(moduleName)) { return true; } if (mGfServiceResultGrammar == null) { return false; } String grammarName = mGfServiceResultGrammar.getName(); return ( moduleName.startsWith(grammarName) && moduleName.length() >= grammarName.length() + 3 && Character.isUpperCase(moduleName.charAt(grammarName.length())) || moduleName.startsWith(PREFIX_DISAMB + grammarName) && moduleName.length() >= PREFIX_DISAMB.length() + grammarName.length() + 3 && Character.isUpperCase(moduleName.charAt(PREFIX_DISAMB.length() + grammarName.length())) ); } // TODO: we assume that editable directories have a certain form private static String getDir(String str) { Pattern p = Pattern.compile("(/tmp/.+)/.+"); Matcher m = p.matcher(str); if (m.matches()) { return m.group(1); } return null; } private static String getCompletionInput(List<String> tokens) { if (tokens.isEmpty()) { return ""; } return GF_TOKEN_JOINER.join(tokens) + GF_TOKEN_SEPARATOR; } private void refreshGrammarInfo() throws GfServiceException { mGfServiceResultGrammar = mGfService.grammar(); mGfServiceResultBrowseAll = mGfService.browseAll(); } /** * <p>Creates a structure from which you can look up the categories of tokens.</p> * * <pre> * language -> token -> categories * </pre> */ private void refreshLangToTokenToCats() throws GfServiceException { // Collect together all the consumer functions. // TODO We are not interested in their linearizations, at least for the time begin. 
Set<String> funsAllConsumers = Sets.newHashSet(); Set<String> cats = mGfServiceResultBrowseAll.getCategories(); for (String cat : cats) { funsAllConsumers.addAll(getConsumers(cat)); } int countAllFuns = mGfServiceResultGrammar.getFunctions().size(); int countIgnoreFuns = funsAllConsumers.size(); mLogger.info("All funs: {}, (ignored) consumer funs: {}", countAllFuns, countIgnoreFuns); if (countAllFuns - countIgnoreFuns > LINEARIZE_ALL_QUERY_LIMIT) { mLogger.warn("Refusing to build preditor cache, as there are too many producer-only funs. " + "Increase LINEARIZE_ALL_QUERY_LIMIT if its current value {} is too low.", LINEARIZE_ALL_QUERY_LIMIT); return; } langToTokenToCats.clear(); mCatToSize.clear(); langToIriToToken.clear(); // Iterate over all the categories that have producer functions for (String cat : cats) { mCatToSize.put(cat, 0); // For each category look at its producers for (String f : getProducers(cat)) { // If this function is also a consumer, then throw it out if (funsAllConsumers.contains(f)) { continue; } // Increment the counter of producers that are not consumers for this category mCatToSize.put(cat, mCatToSize.get(cat) + 1); // Otherwise get all of its linearizations in all the languages. // This includes all the wordforms and variants, because the linearization // is likely to be a complex record that holds many strings. GfServiceResultLinearizeAll result = mGfService.linearizeAll(f, null); Map<String, List<String>> langToTokens = result.getTexts(); // Extract the logical symbol that corresponds to this function. // The logical symbol is present in the Ape-linearization. 
String logicalSymbol = extractLogicalSymbolFromApe(langToTokens.get(mGfServiceResultGrammar.getName() + SUFFIX_APE)); for (Entry<String, List<String>> entry2 : langToTokens.entrySet()) { String lang = entry2.getKey(); Multimap<String, String> tokenToCats = langToTokenToCats.get(lang); // If we haven't seen this language before then create a new hash table entry for it if (tokenToCats == null) { tokenToCats = HashMultimap.create(); langToTokenToCats.put(lang, tokenToCats); } // Store each linearization together with its category. // The linearization is represented by its "most important" token. for (String lin : entry2.getValue()) { String indexToken = getIndexToken(lin); if (indexToken != null) { tokenToCats.put(indexToken, cat); } } if (logicalSymbol != null) { Map<String, String> iriToToken = langToIriToToken.get(lang); // If we haven't seen this language before then create a new hash table entry for it if (iriToToken == null) { iriToToken = Maps.newHashMap(); langToIriToToken.put(lang, iriToToken); } for (String lin : entry2.getValue()) { iriToToken.put(logicalSymbol, lin); // TODO: We assume that the dictionary form is always the first. // Of course, this does not always hold. // Unfortunately, LinearizeAll cannot be used to obtain a GF record, // with all the category labels of the strings, but just a list of plain strings. break; } } } } } } /** * It does not make sense to index linearizations which contain multiple tokens * or which are empty strings, as these cannot be matched during (single token) * lookahead editing. If there are multiple tokens in the given linearization, e.g. * the + Atlantic_Ocean, des + Atlantischen_Ozeans, Atlandi_Ookean + &+ + il; * then we return the longest token (picking the last one in case there are several). * TODO: this is a hack while we're waiting for a cleaner solution. 
*/ private static String getIndexToken(String lin) { int max = 0; String returnTok = null; for (String tok : GF_TOKEN_SPLITTER.omitEmptyStrings().split(lin)) { if (tok.length() >= max) { max = tok.length(); returnTok = tok; } } return returnTok; } /** * <p>Extracts the logical symbol (which is used by APE as the * OWL entity IRI) from the Ape-linearization of a function, assuming * that the function is a lexical function. * Returns {@code null} in case the extraction fails.</p> * * <p>We assume that the Ape linearizations have the form * {@code The_Hague|pn_sg|The_Hague_PN|neutr}, where the logical symbol * is always in the same field and is always the same in case there are * several linearizations.</p> */ private static String extractLogicalSymbolFromApe(List<String> lins) { if (lins == null || lins.isEmpty()) { return null; } int count = 0; for (String field : GF_APE_SPLITTER.split(lins.get(0))) { if (++count == GF_APE_FIELD_LOGICAL_SYMBOL) { return field; } } return null; } }