package edu.stanford.nlp.ie.machinereading.domains.ace.reader;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import javax.xml.parsers.ParserConfigurationException;

import org.xml.sax.SAXException;

import edu.stanford.nlp.ie.machinereading.common.DomReader;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.RobustTokenizer.WordToken;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.util.Generics;

/**
 * Splits the raw text of an ACE .sgm file into tokenized sentences.
 * A sentence ends at sentence-final punctuation (pulling in a trailing
 * close quote when a quotation is still open) or at an SGML tag, which
 * becomes a one-token sentence of its own.
 */
public class AceSentenceSegmenter extends DomReader {

  // list of tokens which mark sentence boundaries
  private final static String[] sentenceFinalPunc = new String[] { ".", "!", "?" };

  private static final Set<String> sentenceFinalPuncSet = Generics.newHashSet();

  static {
    // set up sentenceFinalPuncSet
    for (String aSentenceFinalPunc : sentenceFinalPunc)
      sentenceFinalPuncSet.add(aSentenceFinalPunc);
  }

  /**
   * @param filenamePrefix
   *          path to an ACE .sgm file (but not including the .sgm extension)
   */
  public static List<List<AceToken>> tokenizeAndSegmentSentences(String filenamePrefix)
      throws IOException, SAXException, ParserConfigurationException {

    List<List<AceToken>> sentences = new ArrayList<>();
    File inputFile = new File(filenamePrefix + AceDocument.ORIG_EXT);
    String input = IOUtils.slurpFile(inputFile);

    // now we can split the text into tokens
    RobustTokenizer<Word> tokenizer = new RobustTokenizer<>(input);
    List<WordToken> tokenList = tokenizer.tokenizeToWordTokens();

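    // Illustration (example input is ours, assuming the tokenizer emits SGML
    // tags as single tokens): the token stream
    //   <TEXT> He said " Stop . " Then he left . </TEXT>
    // is grouped into four sentences:
    //   [<TEXT>]  [He said " Stop . "]  [Then he left .]  [</TEXT>]
    // The closing quote joins the first sentence because an odd quoteCount
    // means a quotation is still open when the "." is seen.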
    // and group the tokens into sentences
    ArrayList<AceToken> currentSentence = new ArrayList<>();
    int quoteCount = 0;
    for (int i = 0; i < tokenList.size(); i++) {
      WordToken token = tokenList.get(i);
      String tokenText = token.getWord();
      AceToken convertedToken = wordTokenToAceToken(token, sentences.size());

      // flush the pending sentence before an SGML tag
      // (we used to also break after 2+ skipped lines, e.g. after datelines:
      // if (token.getNewLineCount() > 1 || AceToken.isSgml(tokenText)) {)
      if (AceToken.isSgml(tokenText)) {
        if (currentSentence.size() > 0) sentences.add(currentSentence);
        currentSentence = new ArrayList<>();
        quoteCount = 0;
      }

      currentSentence.add(convertedToken);
      if (tokenText.equals("\"")) quoteCount++;

      // start a new sentence whenever we hit sentence-final punctuation
      if (sentenceFinalPuncSet.contains(tokenText)) {
        // include quotes after EOS
        if (i < tokenList.size() - 1 && quoteCount % 2 == 1
            && tokenList.get(i + 1).getWord().equals("\"")) {
          AceToken quoteToken = wordTokenToAceToken(tokenList.get(i + 1), sentences.size());
          currentSentence.add(quoteToken);
          quoteCount++;
          i++;
        }
        if (currentSentence.size() > 0) sentences.add(currentSentence);
        currentSentence = new ArrayList<>();
        quoteCount = 0;
      }
      // an SGML tag also ends the (one-token) sentence it was just added to
      else if (AceToken.isSgml(tokenText)) {
        if (currentSentence.size() > 0) sentences.add(currentSentence);
        currentSentence = new ArrayList<>();
        quoteCount = 0;
      }
    }

    // flush any trailing tokens (a no-op for well-formed files, which end
    // with an SGML tag, but avoids dropping a final unterminated sentence)
    if (currentSentence.size() > 0) sentences.add(currentSentence);

    return sentences;
  }

  public static AceToken wordTokenToAceToken(WordToken wordToken, int sentence) {
    return new AceToken(wordToken.getWord(), "", "", "", "",
        Integer.toString(wordToken.getStart()),
        Integer.toString(wordToken.getEnd()), sentence);
  }

  // simple testing code
  public static void main(String[] args)
      throws IOException, SAXException, ParserConfigurationException {
    String testFilename = "/home/mcclosky/data/ACE2005/English/wl/timex2norm/AGGRESSIVEVOICEDAILY_20041101.1144";
    // testFilename = "/home/mcclosky/data/ACE2005/English/bc/timex2norm/CNN_CF_20030303.1900.02";
    // testFilename = "/home/mcclosky/data/ACE2005/English/un/timex2norm/alt.atheism_20041104.2428";
    testFilename = "/home/mcclosky/data/ACE2005/English/nw/timex2norm/AFP_ENG_20030502.0614";

    List<List<AceToken>> sentences = tokenizeAndSegmentSentences(testFilename);
    for (List<AceToken> sentence : sentences)
      System.out.println("s: [" + sentence + "]");
  }
}