package edu.stanford.nlp.process;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import edu.stanford.nlp.ling.BasicDocument;
import edu.stanford.nlp.ling.Document;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.util.Generics;
/**
* A <code>Processor</code> whose <code>process</code> method deletes all
* SGML/XML/HTML tags (tokens starting with <code>&lt;</code> and ending
* with <code>&gt;</code>). Optionally, newlines can be inserted after the
* end of block-level tags to roughly simulate where continuous text was
* broken up (this helps finding sentence boundaries for example).
*
* @author Christopher Manning
* @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization)
*
* @param <L> The type of the labels
* @param <F> The type of the features
*/
public class StripTagsProcessor<L, F> extends AbstractListProcessor<Word, Word, L, F> {
private static final Set<String> BLOCKTAGS = Generics.newHashSet(Arrays.asList(
"blockquote", "br", "div", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "li", "ol", "p", "pre", "table", "tr", "ul"));
/**
* Block-level HTML tags that are rendered with surrounding line breaks.
*/
public static final Set<String> blockTags = BLOCKTAGS;
/**
* Whether to insert "\n" words after ending block tags.
*/
private boolean markLineBreaks;
/**
* Constructs a new StripTagsProcessor that doesn't mark line breaks.
*/
public StripTagsProcessor() {
this(false);
}
/**
* Constructs a new StripTagProcessor that marks line breaks as specified.
*/
public StripTagsProcessor(boolean markLineBreaks) {
setMarkLineBreaks(markLineBreaks);
}
/**
* Returns whether the output of the processor will contain newline words
* ("\n") at the end of block-level tags.
*
* @return Whether the output of the processor will contain newline words
* ("\n") at the end of block-level tags.
*/
public boolean getMarkLineBreaks() {
return (markLineBreaks);
}
/**
* Sets whether the output of the processor will contain newline words
* ("\n") at the end of block-level tags.
*/
public void setMarkLineBreaks(boolean markLineBreaks) {
this.markLineBreaks = markLineBreaks;
}
/**
* Returns a new Document with the same meta-data as <tt>in</tt>,
* and the same words except tags are stripped.
*/
public List<Word> process(List<? extends Word> in) {
List<Word> out = new ArrayList<>();
boolean justInsertedNewline = false; // to prevent contiguous newlines
for (Word w : in) {
String ws = w.word();
if (ws.startsWith("<") && ws.endsWith(">")) {
if (markLineBreaks && !justInsertedNewline) {
// finds start and end of tag name (ignores brackets and /)
// e.g. <p>, <br/>, or </table>
// se s e s e
int tagStartIndex = 1;
while (tagStartIndex < ws.length() && !Character.isLetter(ws.charAt(tagStartIndex))) {
tagStartIndex++;
}
if (tagStartIndex == ws.length()) {
continue; // no tag text
}
int tagEndIndex = ws.length() - 1;
while (tagEndIndex > tagStartIndex && !Character.isLetterOrDigit(ws.charAt(tagEndIndex))) {
tagEndIndex--;
}
// looks up tag name in list of known block-level tags
String tagName = ws.substring(tagStartIndex, tagEndIndex + 1).toLowerCase();
if (blockTags.contains(tagName)) {
out.add(new Word("\n")); // mark newline for block-level tags
justInsertedNewline = true;
}
}
} else {
out.add(w); // normal word
justInsertedNewline = false;
}
}
return out;
}
/**
* For internal debugging purposes only.
*/
public static void main(String[] args) {
new BasicDocument<String>();
Document<String, Word, Word> htmlDoc = BasicDocument.init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
System.out.println("Before:");
System.out.println(htmlDoc);
Document<String, Word, Word> txtDoc = new StripTagsProcessor<String, Word>(true).processDocument(htmlDoc);
System.out.println("After:");
System.out.println(txtDoc);
Document<String, Word, List<Word>> sentences = new WordToSentenceProcessor<Word>().processDocument(txtDoc);
System.out.println("Sentences:");
System.out.println(sentences);
}
}