/* Copyright (C) 2006 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package edu.nd.nina.graph.load;
import java.util.Locale;
import org.jsoup.Jsoup;
import edu.nd.nina.types.Instance;
/**
 * This pipe removes HTML from a CharSequence and lower-cases the remaining
 * text. The HTML is actually parsed here (with jsoup), so we should have less
 * HTML slipping through... but it is almost certainly much slower than a
 * regular expression, and could fail on broken HTML.
 *
 * @author Greg Druck <a
 *         href="mailto:gdruck@cs.umass.edu">gdruck@cs.umass.edu</a>
 */
public class CharSequenceRemoveHTML extends Pipe {

    /**
     * Replaces the carrier's data with its HTML-free, lower-cased text.
     *
     * @param carrier the instance to process; its data is cast to
     *                {@link CharSequence}
     * @return the same {@code carrier} object, with its data replaced by the
     *         extracted text
     * @throws ClassCastException   if the carrier's data is not a
     *                              {@link CharSequence}
     * @throws NullPointerException if the carrier's data is {@code null}
     */
    public Instance pipe(Instance carrier) {
        String html = ((CharSequence) carrier.getData()).toString();
        String text = Jsoup.parse(html).text();
        // Locale.ROOT keeps the output independent of the JVM's default
        // locale; the no-arg toLowerCase() maps "I" to a dotless "ı" under
        // Turkish/Azeri locales, which would make results machine-dependent.
        CharSequence cleaned = text.toLowerCase(Locale.ROOT);
        carrier.setData(cleaned);
        return carrier;
    }
}