package info.ephyra.nlp;
import info.ephyra.util.HTMLConverter;
import java.util.regex.Pattern;
/**
* Extracts sentences and text fragments from an HTML document.
*
* @author Nico Schlaefer
* @version 2005-09-12
*/
public class SentenceExtractor {
    /** Marker tag that is inserted between sentences before splitting. */
    private static final String DELIM = "<delim>";

    /**
     * Names of the non-structuring tags that may carry attributes and that
     * have a matching closing tag.
     */
    private static final String INLINE_TAG_NAMES =
            "b|i|u|sup|sub|tt|font|small|big|a";

    /**
     * Regular expression that describes non-structuring tags, i.e. tags that
     * appear within a sentence and that are not sentence delimiters. All other
     * tags are assumed to be sentence delimiters.
     * <p>
     * Attributes are matched with <code>[^&gt;]*</code> after any whitespace
     * character, so a tag is also recognized if it spans several lines or if
     * its attributes are separated by a tab or line break.
     */
    private static final Pattern NON_STRUC_TAGS = Pattern.compile(
            "<(?:" + INLINE_TAG_NAMES + ")(?:\\s[^>]*)?>"
            + "|</(?:" + INLINE_TAG_NAMES + ")>"
            + "|<br>|<nobr>",
            Pattern.CASE_INSENSITIVE);

    /** Any remaining tag; may span several lines. */
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]*>");

    /** A sequence of whitespace characters. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Sentence-final punctuation followed by a blank. */
    private static final Pattern SENTENCE_END = Pattern.compile("([.!?]) ");

    /** A run of delimiter tags together with the blanks surrounding it. */
    private static final Pattern DELIM_RUN =
            Pattern.compile(" ?<delim>(?: |<delim>)*");

    /** Blanks and delimiter tags at the beginning or end of the document. */
    private static final Pattern BOUNDARY =
            Pattern.compile("\\A(?: |<delim>)+|(?: |<delim>)+\\z");

    /**
     * Extracts sentences from an HTML document.
     * <p>
     * Non-structuring tags are dropped, all other tags are treated as sentence
     * boundaries, and the remaining text is additionally split after
     * <code>.</code>, <code>!</code> and <code>?</code> when followed by
     * whitespace.
     *
     * @param html the HTML document, may be <code>null</code>
     * @return sentences extracted from the document; an empty array if the
     *         document is <code>null</code> or contains no text
     */
    public static String[] getSentencesFromHtml(String html) {
        if (html == null) {
            return new String[0];
        }

        // handle special characters
        // NOTE(review): if replaceSpecialCharacters decodes &lt; / &gt; into
        // literal angle brackets, the decoded text is treated as a tag by the
        // steps below - confirm against HTMLConverter.
        String text = HTMLConverter.replaceSpecialCharacters(html);

        // drop non-structuring tags
        text = NON_STRUC_TAGS.matcher(text).replaceAll("");
        // replace all structuring tags by the sentence delimiter tag <delim>
        text = ANY_TAG.matcher(text).replaceAll(DELIM);

        // replace all sequences of whitespace characters by single blanks;
        // this must happen before sentence boundaries are detected, otherwise
        // a sentence that ends at a line break or tab is not split
        text = WHITESPACE.matcher(text).replaceAll(" ");

        // insert the <delim> tag between all sentences
        text = SENTENCE_END.matcher(text).replaceAll("$1" + DELIM);

        // remove multiple <delim> tags
        text = DELIM_RUN.matcher(text).replaceAll(DELIM);
        // remove whitespaces and <delim> tags at beginning/end of the document
        text = BOUNDARY.matcher(text).replaceAll("");

        // "".split(...) would yield one empty sentence; report none instead
        if (text.isEmpty()) {
            return new String[0];
        }

        // split the document into sentences
        return text.split(DELIM);
    }
}