package ch.uzh.ifi.attempto.acewiki.gf; import java.io.IOException; import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedSet; import org.semanticweb.owlapi.model.OWLAxiom; import org.semanticweb.owlapi.model.OWLEntity; import com.google.common.base.Joiner; import com.google.common.collect.HashMultiset; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Maps; import com.google.common.collect.Multiset; import com.google.common.collect.Sets; import ch.uzh.ifi.attempto.acewiki.core.AceWikiEngine; import ch.uzh.ifi.attempto.acewiki.core.GeneralTopic; import ch.uzh.ifi.attempto.acewiki.core.OntologyExporter; import ch.uzh.ifi.attempto.acewiki.core.Sentence; import ch.uzh.ifi.attempto.ape.ACEParserResult; import ch.uzh.ifi.attempto.ape.ACEText; import ch.uzh.ifi.attempto.ape.OutputType; import ch.uzh.ifi.attempto.gfservice.GfTree; import ch.uzh.ifi.attempto.gfservice.GfTreeParseException; /** * <p>Generates a report that covers all the articles and their sentences in the wiki, * along with the OWL mapping of the sentences. Reports various ambiguities:</p> * * <ul> * <li>number of trees per sentence</li> * <li>number of ACE sentences per sentence</li> * <li>number of OWL axioms per sentence</li> * </ul> * * <p>The format is optimized to be both easy to read and easy to process with grep+sed, e.g. * to get a list of all ambiguity types filter the output through:</p> * * <pre> * grep "___a" | sed "s/.*___t/t/" | sort | uniq -c | sort -nr * * 78 t1___a1___o1 * 13 t1___a0___o0 * 8 t1___a1___o0 * 7 t2___a1___o1 * 3 t4___a2___o2 * ... * </pre> * * TODO: currently experimental and GF specific * * @author Kaarel Kaljurand */ public class GfReportExporter extends OntologyExporter { private static final String TAG_GF_SENTENCE = "gf_sentence"; private static final String MAX_INDENT = "\t\t\t\t\t"; private static final Joiner JOINER = Joiner.on("___").useForNull("NULL"); protected void writeContent(String language) throws IOException { StringBuilder sb = new StringBuilder(); Multiset<String> statistics = HashMultiset.create(); AceWikiEngine engine = getOntology().getEngine(); for (GeneralTopic el : getOntologyElements(GeneralTopic.class)) { List<Sentence> sentences = el.getArticle().getSentences(); statistics.add("ontology_element"); sb.append(el.getWord()); addWithIndent(sb, 0, JOINER.join(el, sentences.size())); for (Sentence s : sentences) { statistics.add("sentence"); if (s instanceof GfSentence && engine instanceof GfEngine) { GfGrammar gfGrammar = ((GfEngine) engine).getGfGrammar(); statistics.add(TAG_GF_SENTENCE); GfSentence gfSent = (GfSentence) s; List<String> trees = gfSent.getParseTrees(); statistics.add(TAG_GF_SENTENCE + "_tree_size_" + trees.size()); AceReport aceReport = new AceReport(gfGrammar, trees); statistics.add(TAG_GF_SENTENCE + "_ace_size_" + aceReport.getAceAmbiguity()); statistics.add(TAG_GF_SENTENCE + "_owl_size_" + aceReport.getOwlAmbiguity()); String lastEditLanguage = gfSent.getGfWikiEntry().getLanguage(); statistics.add(TAG_GF_SENTENCE + "_language_" + lastEditLanguage); addWithIndent(sb, 1, JOINER.join(s.isIntegrated(), lastEditLanguage, gfSent.getGfWikiEntry().getText(), "t" + s.getNumberOfRepresentations(), "a" + aceReport.getAceAmbiguity(), "o" + aceReport.getOwlAmbiguity()) ); int totalTreeSize = 0; int totalOwlSize = 0; for (String tree : trees) { statistics.add(TAG_GF_SENTENCE + "_tree"); addWithIndent(sb, 2, tree); int treeSize = aceReport.getTreeSize(tree); totalTreeSize += treeSize; int owlSize = aceReport.getOwlSize(tree); totalOwlSize += owlSize; addWithIndent(sb, 3, "tree_size_" + treeSize); addWithIndent(sb, 3, aceReport.getAce(tree)); addWithIndent(sb, 3, aceReport.getOwlFssPp(tree)); addWithIndent(sb, 3, "owl_size_" + owlSize); addWithIndent(sb, 3, aceReport.getMessages(tree)); } if (! trees.isEmpty()) { statistics.add(TAG_GF_SENTENCE + "_trees_treesize_" + (totalTreeSize / trees.size())); statistics.add(TAG_GF_SENTENCE + "_trees_owlsize_" + (totalOwlSize / trees.size())); } } else { addWithIndent(sb, 1, JOINER.join(s.isIntegrated(), s.toString(), s.getNumberOfRepresentations())); } } } // Show some overall statistics, e.g. how many sentences have 0, 1, 1+ corresponding OWL representations SortedSet<String> sortedSet = Sets.newTreeSet(statistics.elementSet()); for (String key : sortedSet) { addWithIndent(sb, 0, JOINER.join(key, statistics.count(key))); } write(sb.toString()); } public String getName() { return "GF Report"; } public boolean isApplicable() { return true; } public String getFileSuffix() { return ".gfreport.txt"; } public String getContentType() { return "text/plain"; } private void addWithIndent(StringBuilder sb, int level, String str) { String indent = MAX_INDENT.substring(0, level); sb.append(indent); if (str == null) { sb.append("NULL"); } else { // Add indent after every newline sb.append(str.replaceAll("\\n", "\n" + indent)); } sb.append('\n'); } private class AceReport { // Each tree corresponds to a set of 0 or more OWL axioms. // We store these sets into a single set. The number of elements in this set shows // the semantic ambiguity of the original sentence, which is smaller // or equal to the syntactic ambiguity (shown by the number of trees). // TODO: throw out axioms which have a semantically equivalent axiom in the set private Set<Set<OWLAxiom>> setOfSetofAxiom = Sets.newHashSet(); private Map<String, ACEParserResult> treeToAceParserResult = Maps.newHashMap(); // Linearization result private Map<String, String> treeToAce = Maps.newHashMap(); // Linearization result that was correct ACE private Map<String, String> treeToAceParsed = Maps.newHashMap(); // Pretty-printed OWL private Map<String, String> treeToOwl = Maps.newHashMap(); private Map<String, Integer> treeToOwlSize = Maps.newHashMap(); public AceReport(GfGrammar gfGrammar, List<String> trees) { for (String tree : trees) { if (tree == null) { continue; } try { ACEText acetext = GfWikiUtils.getACEText(gfGrammar, tree); if (acetext == null) continue; String acetextAsString = acetext.getText().trim(); if (acetextAsString.isEmpty()) continue; treeToAce.put(tree, acetextAsString); ACEParserResult parserResult = GfWikiUtils.parse(acetext, getOntology().getURI(), OutputType.DRS, OutputType.OWLFSSPP); treeToAceParserResult.put(tree, parserResult); String drsAsString = parserResult.get(OutputType.DRS); if ("drs([],[])".equals(drsAsString)) continue; treeToAceParsed.put(tree, acetextAsString); String owlAsString = parserResult.get(OutputType.OWLFSSPP); Set<OWLAxiom> axiomSet = GfOwlConverter.getOwlAxiomsFromString(owlAsString); if (axiomSet.isEmpty()) continue; setOfSetofAxiom.add(axiomSet); treeToOwl.put(tree, owlAsString); // Number of different entities in the axiom set Set<OWLEntity> entities = Sets.newHashSet(); for (OWLAxiom ax : axiomSet) { entities.addAll(ax.getSignature()); } treeToOwlSize.put(tree, entities.size()); } catch (Exception e) { continue; } } } public int getOwlAmbiguity() { return setOfSetofAxiom.size(); } public int getAceAmbiguity() { return ImmutableSet.copyOf(treeToAceParsed.values()).size(); } public String getAce(String tree) { return treeToAce.get(tree); } public int getTreeSize(String tree) { try { GfTree gfTree = new GfTree(tree); return gfTree.size(); } catch (GfTreeParseException e) { return 0; } } public String getOwlFssPp(String tree) { return treeToOwl.get(tree); } public int getOwlSize(String tree) { Integer size = treeToOwlSize.get(tree); if (size == null) { return 0; } return size; } public String getMessages(String tree) { ACEParserResult aceParserResult = treeToAceParserResult.get(tree); if (aceParserResult == null) { return null; } return Joiner.on('\n').join(aceParserResult.getMessageContainer().getMessages()); } } }