package com.maalaang.omtwitter.ml;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.log4j.Logger;
import cc.mallet.types.Instance;
import com.maalaang.omtwitter.io.OMTwitterCorpusFileReader;
import com.maalaang.omtwitter.model.OMTweet;
/**
 * Iterates over a tweet corpus file and turns each tweet into a MALLET
 * {@link Instance}.
 *
 * <p>The text of every tweet is split on whitespace. A piece of the form
 * {@code word/posTag/label} (located through its last two {@code '/'}
 * characters) closes one token. Pieces that do not carry both separators are
 * treated as leading fragments of a multi-word token and are prepended, joined
 * by single spaces, to the word of the next tagged piece.
 *
 * <p>The data of each produced instance is a {@code String[3][]} holding the
 * words at index 0, the POS tags at index 1 and the labels at index 2; target,
 * name and source of the instance are {@code null}.
 */
public class TweetEntityCorpusLineIterator implements Iterator<Instance> {
    /** Number of tweets between two progress log messages. */
    private static final long LOG_INTERVAL = 100000;

    private final OMTwitterCorpusFileReader reader;
    private final Logger logger;
    /** Number of tweets returned by {@link #next()} so far. */
    private long cnt = 0;
    // Scratch buffers reused across next() calls; their contents are copied
    // into fresh arrays before an instance is returned.
    private final ArrayList<String> tokenList;
    private final ArrayList<String> posTagList;
    private final ArrayList<String> labelList;

    /**
     * Opens the corpus file for iteration.
     *
     * @param file       corpus file, passed through to the corpus reader
     * @param fieldDelim field delimiter, passed through to the corpus reader
     * @param fields     field specification, passed through to the corpus reader
     * @throws ClassNotFoundException declared for compatibility with existing callers
     * @throws IOException            if the corpus reader cannot be created
     */
    public TweetEntityCorpusLineIterator(String file, String fieldDelim, int[] fields) throws ClassNotFoundException, IOException {
        // Let an I/O failure propagate: swallowing it would leave the reader
        // null and only surface later as a NullPointerException.
        reader = new OMTwitterCorpusFileReader(file, fieldDelim, fields);
        logger = Logger.getLogger(this.getClass().getName());
        tokenList = new ArrayList<String>();
        labelList = new ArrayList<String>();
        posTagList = new ArrayList<String>();
    }

    /**
     * @return whether the underlying corpus reader has another tweet
     */
    @Override
    public boolean hasNext() {
        return reader.hasNext();
    }

    /**
     * Reads the next tweet and converts it into an instance.
     *
     * <p>Untagged fragments that are not followed by a tagged piece (i.e. at
     * the end of the tweet text) are dropped.
     *
     * @return an instance whose data is {@code {words, posTags, labels}}
     */
    @Override
    public Instance next() {
        OMTweet tweet = reader.next();
        String[] tokens = tweet.getText().split("\\s+");

        tokenList.clear();
        labelList.clear();
        posTagList.clear();

        // Untagged fragments seen since the last complete token.
        String prev = null;
        for (String t : tokens) {
            int labelSep = t.lastIndexOf('/');
            // lastIndexOf yields -1 for a negative start index, so this is
            // safe even when labelSep is -1 or 0.
            int posSep = t.lastIndexOf('/', labelSep - 1);

            if (posSep < 0) {
                // Fewer than two separators (e.g. "New" or "and/or"): this is
                // not a complete word/pos/label triple, so keep it as part of
                // the upcoming token instead of failing in substring().
                prev = (prev == null) ? t : prev + " " + t;
                continue;
            }

            String word = t.substring(0, posSep);
            String posTag = t.substring(posSep + 1, labelSep);
            String label = t.substring(labelSep + 1);
            if (prev != null) {
                word = prev + " " + word;
                prev = null;
            }

            tokenList.add(word);
            posTagList.add(posTag);
            labelList.add(label);
        }

        String[][] data = new String[3][];
        data[0] = tokenList.toArray(new String[0]);
        data[1] = posTagList.toArray(new String[0]);
        data[2] = labelList.toArray(new String[0]);
        Instance inst = new Instance(data, null, null, null);

        cnt++;
        if (cnt % LOG_INTERVAL == 0) {
            logger.info(cnt + " labeled tweets were read");
        }
        return inst;
    }

    /**
     * Does nothing: removal is not supported by this iterator.
     *
     * <p>NOTE(review): the {@link Iterator} contract suggests throwing
     * {@link UnsupportedOperationException}; the silent no-op is kept so that
     * existing callers are not broken.
     */
    @Override
    public void remove() {
        // Intentionally empty.
    }
}