package arkref.parsestuff;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.apache.commons.lang.StringUtils;

import arkref.analysis.ARKref;
import arkref.sent.SentenceBreaker;

import com.aliasi.util.Strings;

//import net.didion.jwnl.data.POS;
//import net.didion.jwnl.dictionary.Dictionary;

import edu.cmu.ark.DiscriminativeTagger;
import edu.cmu.ark.LabeledSentence;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.lexparser.*;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon;
import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern;
import edu.stanford.nlp.util.Pair;

//import net.didion.jwnl.*;

/** Various NL analysis utilities, including ones wrapping Stanford subsystems, and other misc stuff **/
public class AnalysisUtilities {

    public static boolean DEBUG = true;

    private AnalysisUtilities() {
        parser = null;
        sst = null;
        dp = new DocumentPreprocessor(false);

//        try{
//            JWNL.initialize(new FileInputStream(properties.getProperty("jwnlPropertiesFile", "config/file_properties.xml")));
//        }catch(Exception e){
//            e.printStackTrace();
//        }
//
//        conjugator = new VerbConjugator();
//        conjugator.load(properties.getProperty("verbConjugationsFile", "verbConjugations.txt"));

        headfinder = new CollinsHeadFinder();
        tree_factory = new LabeledScoredTreeFactory();
        tlp = new PennTreebankLanguagePack();
    }

    /** Normalizes raw sentence text before parsing: ensures a final period and expands negative contractions. */
    protected static String preprocess(String sentence) {
        sentence = sentence.trim();
        if (!sentence.matches(".*\\.['\"]*$")) { //charAt(sentence.length()-1) != '.'
            sentence += ".";
        }

        sentence = sentence.replaceAll("can't", "can not");
        sentence = sentence.replaceAll("won't", "will not");
        sentence = sentence.replaceAll("n't", " not"); //aren't shouldn't don't isn't

        return sentence;
    }

    /** Undoes tokenization artifacts in a serialized parse tree, e.g., "(MD ca)" from "can't" becomes "(MD can)". */
    protected static String preprocessTreeString(String sentence) {
        sentence = sentence.replaceAll(" n't", " not");
        sentence = sentence.replaceAll("\\(MD ca\\)", "(MD can)");
        sentence = sentence.replaceAll("\\(MD wo\\)", "(MD will)");
        sentence = sentence.replaceAll("\\(MD 'd\\)", "(MD would)");
        sentence = sentence.replaceAll("\\(VBD 'd\\)", "(VBD had)");
        sentence = sentence.replaceAll("\\(VBZ 's\\)", "(VBZ is)");
        sentence = sentence.replaceAll("\\(VBP 're\\)", "(VBP are)");
        return sentence;
    }

    public static int[] alignTokens(String rawText, List<arkref.data.Word> words) {
        String[] tokens = new String[words.size()];
        for (int i = 0; i < words.size(); i++) {
            tokens[i] = words.get(i).token;
        }
        return alignTokens(rawText, tokens);
    }

    /** Aligns each token to its character offset in the raw text, tolerating PTB-style token rewrites. */
    public static int[] alignTokens(String rawText, String[] tokens) {
        int MAX_ALIGNMENT_SKIP = 100;
        int[] alignments = new int[tokens.length];
        int curPos = 0;

        tok_loop:
        for (int i = 0; i < tokens.length; i++) {
            String tok = tokens[i];
            // U.pf("TOKEN [%s] : ", tok);
            for (int j = 0; j < MAX_ALIGNMENT_SKIP; j++) {
                boolean directMatch = rawText.regionMatches(curPos + j, tok, 0, tok.length());
                if (!directMatch)
                    directMatch = rawText.toLowerCase().regionMatches(curPos + j, tok.toLowerCase(), 0, tok.length());
                boolean alternateMatch = false;
                if (!directMatch) {
                    int roughLast = curPos + j + tok.length() * 2 + 10;
                    String substr = StringUtils.substring(rawText, curPos + j, roughLast);
                    Matcher m = tokenSurfaceMatches(tok).matcher(substr);
                    // U.pl("PATTERN " + tokenSurfaceMatches(tok));
                    alternateMatch = m.find() && m.start() == 0;
                }
                // U.pl("MATCHES " + directMatch + " " + alternateMatch);
                if (directMatch || alternateMatch) {
                    alignments[i] = curPos + j;
                    if (directMatch)
                        curPos = curPos + j + tok.length();
                    else
                        curPos = curPos + j + 1;
                    // U.pf("\n  Aligned to pos=%d : [%s]\n", alignments[i], U.backslashEscape(StringUtils.substring(rawText, alignments[i], alignments[i]+10)));
                    continue tok_loop;
                }
                // U.pf("%s", U.backslashEscape(StringUtils.substring(rawText, curPos+j, curPos+j+1)));
            }
            U.pf("FAILED MATCH for token [%s]\n", tok);
            U.pl("sentence: " + rawText);
            U.pl("tokens: " + StringUtils.join(tokens, " "));
            alignments[i] = -1;
        }
        // TODO backoff for gaps .. at least guess the 2nd gap position or something (2nd char after previous token ends...)
        return alignments;
    }
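    /** Usage sketch added for illustration; not part of the original arkref API.
     *  Shows how alignTokens recovers character offsets for PTB-style tokens,
     *  including bracket rewrites like -LRB- for "(". Input values are hypothetical. */
    private static void alignTokensExample() {
        String raw = "Dogs (big ones) bark.";
        String[] toks = { "Dogs", "-LRB-", "big", "ones", "-RRB-", "bark", "." };
        int[] offsets = alignTokens(raw, toks);
        // offsets[i] is the character offset of token i in raw, or -1 on failure;
        // here: [0, 5, 6, 10, 14, 16, 20]
        System.err.println(Arrays.toString(offsets));
    }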
    /** Undo penn-treebankification of tokens. Want to match the raw original form if possible. **/
    public static Pattern tokenSurfaceMatches(String tok) {
        if (tok.equals("-LRB-")) {
            return Pattern.compile("[(\\[]");
        } else if (tok.equals("-RRB-")) {
            return Pattern.compile("[)\\]]");
        } else if (tok.equals("``")) {
            return Pattern.compile("(\"|``)");
        } else if (tok.equals("''")) {
            return Pattern.compile("(\"|'')");
        } else if (tok.equals("`")) {
            return Pattern.compile("('|`)");
        }
        return Pattern.compile(Pattern.quote(tok));
    }

    public String[] stanfordTokenize(String str) {
        List<Word> wordToks = AnalysisUtilities.getInstance().dp.getWordsFromString(str);
        String[] tokens = new String[wordToks.size()];
        for (int i = 0; i < wordToks.size(); i++)
            tokens[i] = wordToks.get(i).value();
        return tokens;
    }

    public static List<SentenceBreaker.Sentence> cleanAndBreakSentences(String docText) {
        // ACE IS EVIL
        docText = docText.replaceAll("<\\S+>", "");
        AlignedSub cleaner = AnalysisUtilities.cleanupDocument(docText);
        List<SentenceBreaker.Sentence> sentences = SentenceBreaker.getSentences(cleaner);
        return sentences;
    }

    public static List<String> cleanAndBreakSentencesToText(String docText) {
        List<String> sentenceTexts = new ArrayList<String>();
        for (SentenceBreaker.Sentence s : cleanAndBreakSentences(docText))
            sentenceTexts.add(s.cleanText);
        return sentenceTexts;
    }
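    /** Usage sketch added for illustration; not part of the original arkref API.
     *  Shows the document-cleaning and sentence-breaking pipeline on a small
     *  ACE-style fragment (hypothetical input). */
    private static void sentenceBreakingExample() {
        String doc = "<DOC> Jaguar shares rose yesterday. The market cheered. </DOC>";
        // markup like <DOC> is stripped, then SentenceBreaker splits the cleaned text
        for (String s : cleanAndBreakSentencesToText(doc)) {
            System.err.println(s); // one cleaned sentence per line
        }
    }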
    /** Uses the Stanford library for document cleaning and sentence breaking. **/
    public List<String> getSentencesStanford(String document) {
        List<String> res = new ArrayList<String>();
        String sentence;
        StringReader reader = new StringReader(cleanupDocument(document).text);
        List<List<? extends HasWord>> sentences = new ArrayList<List<? extends HasWord>>();
        Iterator<List<? extends HasWord>> iter1;
        Iterator<? extends HasWord> iter2;

        try {
            sentences = dp.getSentencesFromText(reader);
        } catch (Exception e) {
            e.printStackTrace();
        }

        iter1 = sentences.iterator();
        while (iter1.hasNext()) {
            iter2 = iter1.next().iterator();
            sentence = "";
            while (iter2.hasNext()) {
                String tmp = iter2.next().word().toString();
                sentence += tmp;
                if (iter2.hasNext()) {
                    sentence += " ";
                }
            }
            res.add(sentence);
        }

        return res;
    }

    static Pattern leadingWhitespace = Pattern.compile("^\\s+");

    /** Some ACE docs have weird markup in them that serves as paragraph-ish markers. **/
    public static AlignedSub cleanupDocument(String document) {
        AlignedSub ret = new AlignedSub(document);
        ret = ret.replaceAll("<\\S+>", "");
        ret = ret.replaceAll(leadingWhitespace, ""); // sentence breaker char offset correctness is sensitive to this
        return ret;
    }

    public static AlignedSub moreCleanup(String str) {
        AlignedSub ret = new AlignedSub(str);
        ret = ret.replaceAll("&(amp|AMP);", "&");
        ret = ret.replaceAll("&(lt|LT);", "<");
        ret = ret.replaceAll("&(gt|GT);", ">");
        return ret;
    }

//    public VerbConjugator getConjugator(){
//        return conjugator;
//    }

    public CollinsHeadFinder getHeadFinder() {
        return headfinder;
    }

    public static AnalysisUtilities getInstance() {
        if (instance == null) {
            instance = new AnalysisUtilities();
        }
        return instance;
    }

    public double getLastParseScore() {
        return lastParseScore;
    }

    public Double getLastParseScoreNormalizedByLength() {
        double length = lastParse.yield().length();
        double res = lastParseScore;
        if (length <= 0) {
            res = 0.0;
        } else {
            res /= length;
        }
        return res;
    }

    /** Result of parsing one sentence: whether parsing succeeded, the parse tree, and its score. */
    public static class ParseResult {
        public boolean success;
        public Tree parse;
        public double score;

        public ParseResult(boolean s, Tree p, double sc) {
            success = s;
            parse = p;
            score = sc;
        }
    }
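    /** Usage sketch added for illustration; not part of the original arkref API.
     *  parseSentence (below) first tries a parser socket server on the configured
     *  port, then falls back to an in-process LexicalizedParser. */
    private static void parseSentenceExample() {
        ParseResult pr = AnalysisUtilities.getInstance().parseSentence("John saw Mary.");
        if (pr.success) {
            System.err.println(pr.parse); // Penn Treebank-style tree, e.g. (ROOT (S ...))
            System.err.println(pr.score); // PCFG score reported by the parser
        }
    }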
    public ParseResult parseSentence(String sentence) {
        String result = "";

        // see if a parser socket server is available
        int port = new Integer(ARKref.getProperties().getProperty("parserServerPort", "5556"));
        String host = "127.0.0.1";
        Socket client;
        PrintWriter pw;
        BufferedReader br;
        String line;
        try {
            client = new Socket(host, port);

            pw = new PrintWriter(client.getOutputStream());
            br = new BufferedReader(new InputStreamReader(client.getInputStream()));
            pw.println(sentence);
            pw.flush(); //flush to complete the transmission

            // all lines but the last are the parse; the last line is the score
            while ((line = br.readLine()) != null) {
                //if(!line.matches(".*\\S.*")){
                //    System.out.println();
                //}
                if (br.ready()) {
                    line = line.replaceAll("\n", "");
                    line = line.replaceAll("\\s+", " ");
                    result += line + " ";
                } else {
                    lastParseScore = new Double(line);
                }
            }

            br.close();
            pw.close();
            client.close();

            System.err.println("parser output: " + result);
            lastParse = readTreeFromString(result);
            boolean success = !Strings.normalizeWhitespace(result).equals("(ROOT (. .))");
            return new ParseResult(success, lastParse, lastParseScore);
        } catch (Exception ex) {
            //ex.printStackTrace();
        }

        // if the socket server is not available, then use a local parser object
        if (parser == null) {
            if (DEBUG) System.err.println("Could not connect to parser server. Loading parser...");
            try {
                Options op = new Options();
                String serializedInputFileOrUrl = ARKref.getProperties().getProperty("parserGrammarFile", "lib/englishPCFG.ser.gz");
                parser = new LexicalizedParser(serializedInputFileOrUrl, op);
                int maxLength = new Integer(ARKref.getProperties().getProperty("parserMaxLength", "40")).intValue();
                parser.setMaxLength(maxLength);
                parser.setOptionFlags("-outputFormat", "oneline");
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        try {
            if (parser.parse(sentence)) {
                lastParse = parser.getBestParse();
                lastParseScore = parser.getPCFGScore();
                TreePrint tp = new TreePrint("penn", "", new PennTreebankLanguagePack());
                StringWriter sb = new StringWriter();
                pw = new PrintWriter(sb);
                tp.printTree(lastParse, pw);
                pw.flush();
                lastParse = readTreeFromString(sb.getBuffer().toString());
                return new ParseResult(true, lastParse, lastParseScore);
            }
        } catch (Exception e) {
            // fall through to the failure case below
        }

        lastParse = readTreeFromString("(ROOT (. .))");
        lastParseScore = -99999.0;
        return new ParseResult(false, lastParse, lastParseScore);
    }

//    @SuppressWarnings("unchecked")
//    public String getLemma(Tree tensedverb){
//        if(tensedverb == null){
//            return "";
//        }
//
//        String res = "";
//        Pattern p = Pattern.compile("\\(\\S+ ([^\\)]*)\\)");
//        Matcher m = p.matcher(tensedverb.toString());
//        m.find();
//        res = m.group(1);
//
//        if(res.equals("is") || res.equals("are") || res.equals("were") || res.equals("was")){
//            res = "be";
//        }else{
//            try{
//                Iterator<String> iter = Dictionary.getInstance().getMorphologicalProcessor().lookupAllBaseForms(POS.VERB, res).iterator();
//
//                int maxCount = -1;
//                int tmpCount;
//                while(iter.hasNext()){
//                    String lemma = iter.next();
//                    tmpCount = conjugator.getBaseFormCount(lemma);
//                    //System.err.println("lemma: "+lemma + "\tcount: "+tmpCount);
//                    if(tmpCount > maxCount){
//                        res = lemma;
//                        maxCount = tmpCount;
//                    }
//                }
//            }catch(Exception e){
//                e.printStackTrace();
//            }
//        }
//
//        return res;
//    }
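    /** Usage sketch added for illustration; not part of the original arkref API.
     *  annotateSentenceWithSupersenses (below) returns one supersense tag per leaf
     *  of the parse tree (e.g., labels like "B-noun.person" in the SST scheme;
     *  "0" marks no label). Requires the SST server or a local model. */
    private static void supersenseExample() {
        AnalysisUtilities au = AnalysisUtilities.getInstance();
        Tree t = au.readTreeFromString(
                "(ROOT (S (NP (NNP John)) (VP (VBD saw) (NP (NNP Mary))) (. .)))");
        List<String> labels = au.annotateSentenceWithSupersenses(t);
        System.err.println(labels); // one label per token: John saw Mary .
    }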
    public List<String> annotateSentenceWithSupersenses(Tree sentence) {
        List<String> result = new ArrayList<String>();

        int numleaves = sentence.getLeaves().size();
        if (numleaves <= 1) {
            return result;
        }
        LabeledSentence labeled = generateSupersenseTaggingInput(sentence);

        // see if an SST socket server is available
        int port = new Integer(ARKref.getProperties().getProperty("supersenseServerPort", "5557"));
        String host = "127.0.0.1";
        Socket client;
        PrintWriter pw;
        BufferedReader br;
        String line;
        try {
            client = new Socket(host, port);

            pw = new PrintWriter(client.getOutputStream());
            br = new BufferedReader(new InputStreamReader(client.getInputStream()));

            String inputStr = "";
            for (int i = 0; i < labeled.length(); i++) {
                String token = labeled.getTokens().get(i);
                String stem = labeled.getStems().get(i);
                String pos = labeled.getPOS().get(i);
                inputStr += token + "\t" + stem + "\t" + pos + "\n";
            }
            pw.println(inputStr);
            pw.flush(); //flush to complete the transmission

            while ((line = br.readLine()) != null) {
                String[] parts = line.split("\\t");
                result.add(parts[2]);
            }

            br.close();
            pw.close();
            client.close();
        } catch (Exception ex) {
            if (ARKref.Opts.debug) System.err.println("Could not connect to SST server.");
            //ex.printStackTrace();
        }

        // if the socket server is not available, then use a local tagger object
        if (result.size() == 0) {
            try {
                if (sst == null) {
                    DiscriminativeTagger.loadProperties(ARKref.getPropertiesPath());
                    sst = DiscriminativeTagger.loadModel(ARKref.getProperties().getProperty("supersenseModelFile", "config/supersenseModel.ser.gz"));
                }
                sst.findBestLabelSequenceViterbi(labeled, sst.getWeights());
                for (String pred : labeled.getPredictions()) {
                    result.add(pred);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        // add a bunch of blanks if necessary
        while (result.size() < numleaves) result.add("0");

        if (ARKref.Opts.debug) System.err.println("annotateSentenceSST: " + result);
        return result;
    }

    private LabeledSentence generateSupersenseTaggingInput(Tree sentence) {
        LabeledSentence res = new LabeledSentence();
        List<Tree> leaves = sentence.getLeaves();

        for (int i = 0; i < leaves.size(); i++) {
            String word = leaves.get(i).label().toString();
            Tree preterm = leaves.get(i).parent(sentence);
            String pos = preterm.label().toString();
            String stem = AnalysisUtilities.getInstance().getLemma(word, pos);
            res.addToken(word, stem, pos, "0");
        }

        return res;
    }

    /**
     * Remove traces and non-terminal decorations (e.g., "-SUBJ" in "NP-SUBJ") from a Penn Treebank-style tree.
     *
     * @param inputTree
     */
    public void normalizeTree(Tree inputTree) {
        inputTree.label().setFromString("ROOT");

        List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();
        List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>();
        String tregexOpStr;
        TregexPattern matchPattern;
        TsurgeonPattern p;
        TregexMatcher matcher;

        // prune empty -NONE- trace nodes
        tregexOpStr = "/\\-NONE\\-/=emptynode";
        matchPattern = TregexPatternFactory.getPattern(tregexOpStr);
        ps.add(Tsurgeon.parseOperation("prune emptynode"));
        p = Tsurgeon.collectOperations(ps);
        ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p));
        Tsurgeon.processPatternsOnTree(ops, inputTree);

        // strip function tags and indices from non-terminal labels
        Label nonterminalLabel;
        tregexOpStr = "/.+\\-.+/=nonterminal < __";
        matchPattern = TregexPatternFactory.getPattern(tregexOpStr);
        matcher = matchPattern.matcher(inputTree);
        while (matcher.find()) {
            nonterminalLabel = matcher.getNode("nonterminal");
            if (nonterminalLabel == null) continue;
            nonterminalLabel.setFromString(tlp.basicCategory(nonterminalLabel.value()));
        }
    }

    public static String getCleanedUpYield(Tree inputTree) {
        Tree copyTree = inputTree.deeperCopy();

        if (DEBUG) System.err.println(copyTree.toString());
        String res = copyTree.yield().toString();
        if (res.length() > 1) {
            res = res.substring(0, 1).toUpperCase() + res.substring(1);
        }

        //(ROOT (S (NP (NNP Jaguar) (NNS shares)) (VP (VBD skyrocketed) (NP (NN yesterday)) (PP (IN after) (NP (NP (NNP Mr.) (NNP Ridley) (POS 's)) (NN announcement)))) (. .)))
        res = res.replaceAll("\\s([\\.,!\\?\\-;:])", "$1");
        res = res.replaceAll("(\\$)\\s", "$1");
        res = res.replaceAll("can not", "cannot");
        res = res.replaceAll("\\s*-LRB-\\s*", " (");
        res = res.replaceAll("\\s*-RRB-\\s*", ") ");
        res = res.replaceAll("\\s*([\\.,?!])\\s*", "$1 ");
        res = res.replaceAll("\\s+''", "''");
        //res = res.replaceAll("\"", "");
        res = res.replaceAll("``\\s+", "``");
        res = res.replaceAll("\\-[LR]CB\\-", ""); //brackets, e.g., [sic]

        // remove extra spaces
        res = res.replaceAll("\\s\\s+", " ");
        res = res.trim();

        return res;
    }

    public Tree readTreeFromString(String parseStr) {
        // read the input into a Tree data structure
        TreeReader treeReader = new PennTreeReader(new StringReader(parseStr), tree_factory);
        Tree inputTree = null;
        try {
            inputTree = treeReader.readTree();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return inputTree;
    }

    protected static boolean filterSentenceByPunctuation(String sentence) {
        //return (sentence.indexOf("\"") != -1
        //        || sentence.indexOf("''") != -1
        //        || sentence.indexOf("``") != -1
        //        || sentence.indexOf("*") != -1);
        return (sentence.indexOf("*") != -1);
    }
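    /** Usage sketch added for illustration; not part of the original arkref API.
     *  Shows normalizeTree stripping function tags and pruning traces, and
     *  getCleanedUpYield detokenizing the resulting tree's yield. Input is hypothetical. */
    private static void treeCleanupExample() {
        AnalysisUtilities au = AnalysisUtilities.getInstance();
        Tree t = au.readTreeFromString(
                "(ROOT (S (NP-SBJ (NNP John)) (VP (VBD slept) (NP (-NONE- *))) (. .)))");
        au.normalizeTree(t); // prunes (NP (-NONE- *)) and rewrites NP-SBJ to NP
        System.err.println(t);                    // (ROOT (S (NP (NNP John)) (VP (VBD slept)) (. .)))
        System.err.println(getCleanedUpYield(t)); // "John slept."
    }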
    /**
     * Sets the parse and score.
     * For use when the input tree is given (e.g., for gold standard trees from a treebank).
     *
     * @param parse
     * @param score
     */
    public void setLastParseAndScore(Tree parse, double score) {
        lastParse = parse;
        lastParseScore = score;
    }

    /**
     * Terse representation of a (sub-)tree:
     * NP[the white dog] -vs- (NP (DT the) (JJ white) (NN dog))
     **/
    public static String abbrevTree(Tree tree) {
        ArrayList<String> toks = new ArrayList<String>();
        for (Tree L : tree.getLeaves()) {
            toks.add(L.label().toString());
        }
        return tree.label().toString() + "[" + StringUtils.join(toks, " ") + "]";
    }

    private void loadWordnetMorphologyCache() {
        morphMap = new HashMap<String, Map<String, String>>();

        try {
            BufferedReader br;
            String buf;
            String[] parts;
            String morphFile = ARKref.getProperties().getProperty("morphFile", "config/MORPH_CACHE.gz");
            br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(morphFile))));
            while ((buf = br.readLine()) != null) {
                parts = buf.split("\\t");
                addMorph(parts[1], parts[0], parts[2]);
                addMorph(parts[1], "UNKNOWN", parts[2]);
            }
            br.close();

            addMorph("men", "NNS", "man");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private void addMorph(String word, String pos, String stem) {
        Map<String, String> posMap = morphMap.get(pos);
        if (posMap == null) {
            posMap = new HashMap<String, String>();
            morphMap.put(pos.intern(), posMap);
        }
        posMap.put(word.intern(), stem.intern());
    }

    /** Looks up the lemma for a word and POS tag in the morphology cache, falling back to the lowercased word. */
    public String getLemma(String word, String pos) {
        if (morphMap == null) {
            loadWordnetMorphologyCache();
        }

        String res = word;
        Map<String, String> posMap = morphMap.get(pos);
        if (posMap != null) {
            res = posMap.get(word.toLowerCase());
            if (res == null) {
                res = word.toLowerCase();
            }
        }
        return res;
    }

    private Map<String, Map<String, String>> morphMap; // pos, word -> stem
    private DiscriminativeTagger sst;
    private LexicalizedParser parser;
    private static AnalysisUtilities instance;
//    private VerbConjugator conjugator;
    private CollinsHeadFinder headfinder;
    private LabeledScoredTreeFactory tree_factory;
    private PennTreebankLanguagePack tlp;
    private double lastParseScore;
    private Tree lastParse;
    public DocumentPreprocessor dp;
}