package edu.uncc.cs.watsonsim.nlp; import java.io.File; import java.io.IOException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; import org.apache.commons.lang3.CharSetUtils; import org.apache.log4j.Logger; import alice.tuprolog.InvalidTheoryException; import alice.tuprolog.MalformedGoalException; import alice.tuprolog.NoMoreSolutionException; import alice.tuprolog.NoSolutionException; import alice.tuprolog.Prolog; import alice.tuprolog.SolveInfo; import alice.tuprolog.Theory; import alice.tuprolog.UnknownVarException; import com.google.common.io.Files; import edu.stanford.nlp.ling.IndexedWord; import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.semgraph.SemanticGraphEdge; import edu.stanford.nlp.trees.GrammaticalRelation; import edu.stanford.nlp.util.Pair; import edu.uncc.cs.watsonsim.Phrase; public class SupportCandidateType { private static final Logger log = Logger.getLogger(SupportCandidateType.class); private static final Prolog static_engine = new Prolog(); static { try { static_engine.setTheory(new Theory( Files.toString(new File("src/main/parse.pl"), Charset.forName("UTF-8")))); } catch (InvalidTheoryException | IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } private static String clean(String text) { return CharSetUtils.keep(text.toLowerCase(), "abcdefghijklmnopqrstuvwxyz_"); } private static String wordID(IndexedWord word) { return "w" + clean(word.word()) + "_" + word.index(); } private static IndexedWord idWord(SemanticGraph graph, String id) { int idx = Integer.parseInt(id.substring(id.lastIndexOf('_')+1)); return graph.getNodeByIndex(idx); } /** * Find simple statements of type in regular text, such as "Diabetes is a * common disease" * * Subclasses are very similarly stated, such as "A hummingbird is a kind * of bird." But we don't distinguish between these yet. We should though. * * @return Pairs of nouns and their types. */ public static List<Pair<String, String>> extract(Phrase p) { List<Pair<String, String>> names_and_types = new ArrayList<>(); for (SemanticGraph graph: p.getGraphs()){ StringBuilder theory = new StringBuilder(); // Load data into a model // Add all the edges for (SemanticGraphEdge edge : graph.edgeIterable()) { // I like the specific prepositions better // so change them to match GrammaticalRelation rel = edge.getRelation(); String relation_name = rel.getShortName(); if ( (rel.getShortName().equals("prep") || rel.getShortName().equals("conj")) && rel.getSpecific() != null && !rel.getSpecific().isEmpty()) { relation_name = rel.getShortName() + "_" + CharSetUtils.keep(rel.getSpecific().toLowerCase(), "abcdefghijklmnopqrstuvwxyz"); } theory.append(relation_name); theory.append('('); theory.append(wordID(edge.getGovernor())); theory.append(','); theory.append(wordID(edge.getDependent())); theory.append(").\n"); } // Index the words for (IndexedWord word : graph.vertexSet()) { theory.append("tag("); theory.append(wordID(word)); theory.append(','); String tag = clean(word.tag()); theory.append(tag.isEmpty() ? "misc" : tag); theory.append(").\n"); } Prolog engine = new Prolog(); try { engine.setTheory(new Theory( Files.toString(new File("src/main/parse.pl"), Charset.forName("UTF-8")))); log.debug(theory); engine.addTheory(new Theory(theory.toString())); SolveInfo info = engine.solve("type_c(X, Y)."); // Get the resulting matches while (info.isSuccess()) { IndexedWord subj_idx = idWord(graph, info.getTerm("X").toString()); IndexedWord obj_idx = idWord(graph, info.getTerm("Y").toString()); if (subj_idx.tag().startsWith("NN") && obj_idx.tag().startsWith("NN")) { String noun = Trees.concatNoun(graph, subj_idx); String type = obj_idx.originalText(); //concatNoun(graph, obj_idx); log.info("Discovered " + noun + " is a(n) " + type); names_and_types.add(new Pair<>(noun,type)); } if (engine.hasOpenAlternatives()) { info = engine.solveNext(); } else { break; } } } catch (IOException | InvalidTheoryException | MalformedGoalException | NoSolutionException | NoMoreSolutionException | UnknownVarException e) { System.out.println(theory); e.printStackTrace(); } } return names_and_types; } }