package org.wikipedia.miner.extract.util;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.Span;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import org.wikipedia.miner.extract.model.DumpPage;
import org.wikipedia.miner.util.MarkupStripper;
/**
 * Computes sentence-boundary offsets for wiki markup.
 *
 * <p>The markup is first masked (links and bracketed regions are overwritten so
 * that punctuation inside them cannot trigger a split), then cut into paragraphs,
 * and each paragraph is handed to an OpenNLP sentence detector. All offsets that
 * are returned are character positions within the string that was passed to the
 * detector pipeline, so they rely on the masking steps preserving string length
 * (NOTE(review): this is assumed of {@link MarkupStripper}; confirm there).
 */
public class PageSentenceExtractor {

    private static final Logger LOGGER = Logger.getLogger(PageSentenceExtractor.class);

    /**
     * Splits paragraphs either after multiple new lines, or on a line that starts
     * with an indent (:) or list (*, #) marker. Compiled once for the class, since a
     * compiled {@link Pattern} is immutable.
     */
    private static final Pattern PARAGRAPH_SPLIT_PATTERN =
            Pattern.compile("((\n\\s*){2,}|\n\\s*[*:#])");

    private final SentenceDetectorME sentenceDetector;
    private final MarkupStripper stripper = new MarkupStripper();

    /**
     * Creates an extractor from a sentence model identified by a Hadoop path.
     *
     * <p>The path's string form is opened with a plain {@link FileInputStream}, so
     * it has to denote a file on the local file system.
     *
     * @param sentenceModel location of the OpenNLP sentence model
     * @throws InvalidFormatException if the model cannot be parsed
     * @throws IOException if the model file cannot be opened or read
     */
    public PageSentenceExtractor(Path sentenceModel) throws InvalidFormatException, IOException {
        InputStream sentenceModelStream = new FileInputStream(sentenceModel.toString());
        sentenceDetector = loadDetector(sentenceModelStream);
    }

    /**
     * Creates an extractor from a sentence model stored in a local file.
     *
     * @param sentenceModel the OpenNLP sentence model file
     * @throws InvalidFormatException if the model cannot be parsed
     * @throws IOException if the model file cannot be opened or read
     */
    public PageSentenceExtractor(File sentenceModel) throws InvalidFormatException, IOException {
        InputStream sentenceModelStream = new FileInputStream(sentenceModel);
        sentenceDetector = loadDetector(sentenceModelStream);
    }

    /**
     * Reads a sentence model from the given stream and builds a detector from it.
     * The stream is always closed before this method returns; a failure to close
     * is logged rather than propagated so it cannot hide a model-loading error.
     */
    private static SentenceDetectorME loadDetector(InputStream sentenceModelStream)
            throws InvalidFormatException, IOException {
        SentenceModel model = null;
        try {
            model = new SentenceModel(sentenceModelStream);
        } finally {
            if (sentenceModelStream != null) {
                try {
                    sentenceModelStream.close();
                } catch (IOException e) {
                    LOGGER.error("Could not close sentence model reader", e);
                }
            }
        }
        return new SentenceDetectorME(model);
    }

    /**
     * Computes sentence splits for the markup of a dump page.
     *
     * <p>Everything except internal links and emphasis is stripped, non-article
     * internal links are stripped, and the remaining {@code [[...]]} regions are
     * masked before delegating to {@link #getSentenceSplits(String)}.
     *
     * @param page the page whose markup should be split
     * @return offsets at which the page markup is split into sentences
     */
    public List<Integer> getSentenceSplits(DumpPage page) {
        String maskedMarkup = stripper.stripAllButInternalLinksAndEmphasis(page.getMarkup(), ' ');
        maskedMarkup = stripper.stripNonArticleInternalLinks(maskedMarkup, 'a');

        // Mask links so that it is impossible to split on any punctuation within a link.
        maskedMarkup = stripper.stripRegions(
                maskedMarkup, stripper.gatherComplexRegions(maskedMarkup, "\\[\\[", "\\]\\]"), 'a');

        return getSentenceSplits(maskedMarkup);
    }

    /**
     * Computes sentence splits for markup that has already been stripped.
     *
     * <p>Text is cut into paragraphs with the paragraph split pattern. A candidate
     * paragraph that is blank is skipped without advancing the paragraph start, so
     * the blank region becomes the leading part of the next paragraph. Whatever
     * follows the last paragraph break is always processed, even when it is blank.
     *
     * @param strippedMarkup the markup to split
     * @return offsets into {@code strippedMarkup}, in the order they were found
     */
    public List<Integer> getSentenceSplits(String strippedMarkup) {
        List<Integer> sentenceSplits = new ArrayList<Integer>();

        // Also mask content in brackets, so it is impossible to split within these.
        String maskedMarkup = stripper.stripRegions(
                strippedMarkup, stripper.gatherComplexRegions(strippedMarkup, "\\(", "\\)"), 'a');

        Matcher paragraphMatcher = PARAGRAPH_SPLIT_PATTERN.matcher(maskedMarkup);
        int lastParagraphEnd = 0;

        while (paragraphMatcher.find()) {
            int paragraphEnd = paragraphMatcher.start();
            String paragraph = maskedMarkup.substring(lastParagraphEnd, paragraphEnd);

            if (paragraph.trim().length() == 0) {
                // Deliberately leaves lastParagraphEnd untouched (see method comment).
                continue;
            }

            sentenceSplits = handleParagraph(paragraph, lastParagraphEnd, sentenceSplits);
            // The next paragraph begins after the separator that was just matched.
            lastParagraphEnd = paragraphMatcher.end();
        }

        sentenceSplits = handleParagraph(
                maskedMarkup.substring(lastParagraphEnd), lastParagraphEnd, sentenceSplits);

        return sentenceSplits;
    }

    /**
     * Appends the splits found in a single paragraph to {@code sentenceSplits}.
     *
     * <p>If the paragraph does not start at offset 0, its start offset is recorded
     * as a split. The end of every detected sentence except the last is then
     * recorded, shifted by {@code paragraphStart}; the end of the last sentence is
     * left to the paragraph-level split of whichever paragraph follows.
     *
     * @param paragraph the paragraph text to run the sentence detector on
     * @param paragraphStart offset of {@code paragraph} within the full text
     * @param sentenceSplits list to append to; it is modified in place
     * @return the same {@code sentenceSplits} list that was passed in
     */
    public List<Integer> handleParagraph(String paragraph, int paragraphStart, List<Integer> sentenceSplits) {
        if (paragraphStart > 0) {
            sentenceSplits.add(paragraphStart);
        }

        Span[] spans = sentenceDetector.sentPosDetect(paragraph);

        // Add splits for all spans except the last one (that split gets handled at the paragraph level).
        for (int spanIndex = 0; spanIndex < spans.length - 1; spanIndex++) {
            Span span = spans[spanIndex];
            sentenceSplits.add(paragraphStart + span.getEnd());
        }

        return sentenceSplits;
    }
}