package com.maalaang.omtwitter.ml;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import com.maalaang.omtwitter.text.OMTweetToken;
import com.maalaang.omtwitter.text.OMTweetTokenizer;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
/**
 * A {@link Pipe} that decorates every {@link Token} in an instance's
 * {@link TokenSequence} with features derived from how
 * {@link OMTweetTokenizer#omtToken} classifies the token's text.
 *
 * <p>Features set per token type (each with value {@code 1.0}):
 * <ul>
 *   <li>hashtag: {@code "HASHTAG"} and the token's normalized text</li>
 *   <li>user: {@code "USER"}</li>
 *   <li>URL: {@code "LINK"}</li>
 *   <li>normal: the token's normalized text</li>
 * </ul>
 * Tokens of any other type receive no features from this pipe.
 *
 * @author Sangwon Park
 */
public class TweetFeatures extends Pipe {

    private static final long serialVersionUID = 2396028381230L;

    /** Marker written after the default serialized fields by {@link #writeObject}. */
    private static final int SERIAL_VERSION = 58284;

    /** Value assigned to every feature this pipe sets. */
    private static final double FEATURE_ON = 1.0;

    /**
     * Adds tweet-specific features to each token of the carrier's data.
     *
     * @param carrier instance whose data is cast to {@link TokenSequence};
     *                its tokens are modified in place
     * @return the same {@code carrier} that was passed in
     */
    @Override
    public Instance pipe(Instance carrier) {
        final TokenSequence tokens = (TokenSequence) carrier.getData();
        for (int pos = 0; pos < tokens.size(); pos++) {
            addFeatures(tokens.get(pos));
        }
        return carrier;
    }

    /**
     * Classifies one token's text with {@link OMTweetTokenizer} and sets the
     * features matching its type directly on the token.
     *
     * @param token the token to classify and decorate
     */
    private static void addFeatures(Token token) {
        final OMTweetToken parsed = OMTweetTokenizer.omtToken(token.getText());
        final String normalized = parsed.getNormalizedText();

        switch (parsed.getType()) {
            case OMTweetToken.TOKEN_TYPE_NORMAL:
                token.setFeatureValue(normalized, FEATURE_ON);
                break;
            case OMTweetToken.TOKEN_TYPE_HASHTAG:
                // Hashtags get the generic marker first, then their own text.
                token.setFeatureValue("HASHTAG", FEATURE_ON);
                token.setFeatureValue(normalized, FEATURE_ON);
                break;
            case OMTweetToken.TOKEN_TYPE_URL:
                token.setFeatureValue("LINK", FEATURE_ON);
                break;
            case OMTweetToken.TOKEN_TYPE_USER:
                token.setFeatureValue("USER", FEATURE_ON);
                break;
            default:
                // Any other token type is left without features.
                break;
        }
    }

    /**
     * Writes the default serialized form followed by {@link #SERIAL_VERSION}.
     *
     * @param out stream to write to
     * @throws IOException if writing to the stream fails
     */
    private void writeObject(ObjectOutputStream out) throws IOException {
        out.defaultWriteObject();
        out.writeInt(SERIAL_VERSION);
    }

    /**
     * Reads the default serialized form, then consumes the trailing version
     * int written by {@link #writeObject}. The version value is read only to
     * advance the stream; it is not checked.
     *
     * @param in stream to read from
     * @throws IOException if reading from the stream fails
     * @throws ClassNotFoundException if a serialized class cannot be resolved
     */
    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        in.defaultReadObject();
        in.readInt();
    }
}