package cc.mallet.pipe;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
/**
 * Given a string, remove lines that begin with M and are 61 characters long.
 * Note that there are some UUEncoded blocks that do not match this.
 * I have seen some that are 64 characters long, and have no regular prefix character,
 * but this filter gets most of them in 20 Newsgroups.
 *
 * <p>Only the content of a matching line is removed; its line terminator is
 * left in place, so each removed line becomes an empty line in the output.
 *
 * @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */
public class CharSequenceRemoveUUEncodedBlocks extends Pipe {

	/**
	 * Matches one complete line consisting of an 'M' followed by exactly 60
	 * further characters.
	 *
	 * <p>Compiled with {@link Pattern#MULTILINE} so that {@code ^} and {@code $}
	 * anchor at every line boundary.  Without that flag they anchor only at the
	 * start and end of the whole input, and because {@code .} does not match
	 * line terminators the pattern could never match inside a multi-line
	 * document.
	 */
	public static final Pattern UU_ENCODED_LINE = Pattern.compile ("^M.{60}$", Pattern.MULTILINE);

	/** Creates the pipe; it has no configuration. */
	public CharSequenceRemoveUUEncodedBlocks ()
	{
	}

	/**
	 * Replaces the data of <code>carrier</code> with a copy in which every line
	 * matching {@link #UU_ENCODED_LINE} has been emptied.
	 *
	 * @param carrier instance whose data is cast to {@link CharSequence}; a
	 *        {@link ClassCastException} results if the data is of another type
	 * @return the same <code>carrier</code>, with its data set to the filtered String
	 */
	public Instance pipe (Instance carrier)
	{
		String string = ((CharSequence)carrier.getData()).toString();
		Matcher m = UU_ENCODED_LINE.matcher(string);
		// Line terminators are not part of the match, so they survive the replacement.
		carrier.setData(m.replaceAll (""));
		return carrier;
	}

	//Serialization

	private static final long serialVersionUID = 1;
	private static final int CURRENT_SERIAL_VERSION = 0;

	/**
	 * Writes only the serial version number; this class declares no instance
	 * fields of its own.
	 */
	private void writeObject (ObjectOutputStream out) throws IOException {
		out.writeInt (CURRENT_SERIAL_VERSION);
	}

	/**
	 * Reads back the serial version number written by <code>writeObject</code>.
	 * The value is consumed to keep the stream aligned but is otherwise unused.
	 */
	private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
		@SuppressWarnings("unused")
		int version = in.readInt ();
	}
}