package com.maalaang.omtwitter.io;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import org.apache.log4j.Logger;
import com.maalaang.omtwitter.model.OMTweet;
import com.maalaang.omtwitter.model.OMTweet_Impl;
/**
 * Reads tweets from a delimited text corpus file, one tweet per line.
 *
 * <p>Each line is split with the configured field delimiter and the resulting
 * tokens are mapped, position by position, onto the properties of an
 * {@link OMTweet} according to the configured array of field ids
 * (the {@code OMTwitterCorpusFile.FIELD_*} constants).
 *
 * <p>This class keeps mutable reader state and performs no synchronization.
 */
public class OMTwitterCorpusFileReader implements OMTwitterReader {

    /** Underlying reader; {@code null} once {@link #close()} has been called. */
    private BufferedReader br = null;

    /** Line read ahead by {@link #hasNext()} that {@link #next()} has not consumed yet. */
    private String pendingLine = null;

    /** Delimiter handed to {@link String#split(String)}, i.e. interpreted as a regular expression. */
    private final String fieldDelimiter;

    /** Field id for each token position of a line. */
    private final int[] fields;

    private final Logger logger;

    /**
     * Opens the corpus file with the default delimiter and the default field layout.
     *
     * @param file path of the corpus file
     * @throws UnsupportedEncodingException if the corpus charset is not supported
     * @throws FileNotFoundException if the file cannot be opened for reading
     */
    public OMTwitterCorpusFileReader(String file) throws UnsupportedEncodingException, FileNotFoundException {
        this(file, OMTwitterCorpusFile.DEFAULT_FIELD_DELIM, OMTwitterCorpusFile.DEFAULT_FIELDS);
    }

    /**
     * Opens the corpus file with the default delimiter and the given field layout.
     *
     * @param file path of the corpus file
     * @param fields field id for each token position
     * @throws UnsupportedEncodingException if the corpus charset is not supported
     * @throws FileNotFoundException if the file cannot be opened for reading
     */
    public OMTwitterCorpusFileReader(String file, int[] fields) throws UnsupportedEncodingException, FileNotFoundException {
        this(file, OMTwitterCorpusFile.DEFAULT_FIELD_DELIM, fields);
    }

    /**
     * Opens the corpus file with the given delimiter and the default field layout.
     *
     * @param file path of the corpus file
     * @param fieldDelimiter delimiter (regular expression) separating the fields of a line
     * @throws UnsupportedEncodingException if the corpus charset is not supported
     * @throws FileNotFoundException if the file cannot be opened for reading
     */
    public OMTwitterCorpusFileReader(String file, String fieldDelimiter) throws UnsupportedEncodingException, FileNotFoundException {
        this(file, fieldDelimiter, OMTwitterCorpusFile.DEFAULT_FIELDS);
    }

    /**
     * Opens the corpus file with the given delimiter and field layout.
     *
     * @param file path of the corpus file
     * @param fieldDelimiter delimiter (regular expression) separating the fields of a line
     * @param fields field id for each token position
     * @throws UnsupportedEncodingException if the corpus charset is not supported
     * @throws FileNotFoundException if the file cannot be opened for reading
     */
    public OMTwitterCorpusFileReader(String file, String fieldDelimiter, int[] fields) throws UnsupportedEncodingException, FileNotFoundException {
        this.fields = fields;
        this.fieldDelimiter = fieldDelimiter;
        this.logger = Logger.getLogger(this.getClass());

        FileInputStream in = new FileInputStream(file);
        boolean opened = false;
        try {
            this.br = new BufferedReader(new InputStreamReader(in, OMTwitterCorpusFile.FILE_CHARSET));
            opened = true;
        } finally {
            // Do not leak the file handle when wrapping the stream fails
            // (for example because the charset is not supported).
            if (!opened) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // the original failure is the one worth reporting
                }
            }
        }
        logger.info("read from twitter corpus file - " + file);
    }

    /**
     * Tells whether another line can be read from the corpus file.
     *
     * <p>The next line is read ahead and buffered, because
     * {@link BufferedReader#ready()} only reports whether a read would block
     * and is not a reliable end-of-input test.
     *
     * @return {@code true} if a line is available; {@code false} at end of file,
     *         after {@link #close()}, or when reading fails (the error is logged)
     */
    public boolean hasNext() {
        if (pendingLine != null) {
            return true;
        }
        if (br == null) {
            return false;
        }
        try {
            pendingLine = br.readLine();
        } catch (IOException e) {
            logger.error("failed to read from twitter corpus file", e);
            return false;
        }
        return pendingLine != null;
    }

    /**
     * Reads the next line of the corpus file and converts it into a tweet.
     *
     * @return the parsed tweet, or {@code null} at end of file, after
     *         {@link #close()}, or when reading fails (the error is logged)
     * @throws IllegalStateException if the field layout contains an unknown field id
     */
    public OMTweet next() {
        String line = pendingLine;
        pendingLine = null;

        if (line == null) {
            if (br == null) {
                logger.warn("twitter corpus file reader is already closed");
                return null;
            }
            try {
                line = br.readLine();
            } catch (IOException e) {
                logger.error("failed to read from twitter corpus file", e);
                return null;
            }
        }
        if (line == null) {
            logger.info("reached to the end of the corpus file");
            return null;
        }

        OMTweet tweet = parseTweet(line);
        logger.debug("read a tweet: " + tweet);
        return tweet;
    }

    /** Splits one corpus line and copies its tokens into a new tweet according to the field layout. */
    private OMTweet parseTweet(String line) {
        OMTweet_Impl tweet = new OMTweet_Impl();
        String[] tokens = line.split(fieldDelimiter);
        // Tokens beyond the field layout, and fields beyond the available tokens, are ignored.
        for (int i = 0; i < tokens.length && i < fields.length; i++) {
            // The empty-field marker is handed to the setters as null.
            String value = tokens[i].equalsIgnoreCase(OMTwitterCorpusFile.FIELD_EMPTY_STR) ? null : tokens[i];
            switch (fields[i]) {
            case OMTwitterCorpusFile.FIELD_IGNORE:
                break;
            case OMTwitterCorpusFile.FIELD_ID:
                tweet.setId(value);
                break;
            case OMTwitterCorpusFile.FIELD_AUTHOR:
                tweet.setAuthor(value);
                break;
            case OMTwitterCorpusFile.FIELD_TEXT:
                tweet.setText(value);
                break;
            case OMTwitterCorpusFile.FIELD_DATE:
                tweet.setDate(value);
                break;
            case OMTwitterCorpusFile.FIELD_POLARITY:
                tweet.setPolarity(value);
                break;
            case OMTwitterCorpusFile.FIELD_QUERY:
                tweet.setQuery(value);
                break;
            default:
                throw new IllegalStateException("unknown field id at position " + i + ": " + fields[i]);
            }
        }
        return tweet;
    }

    /**
     * Closes the corpus file. I/O errors are logged, not propagated.
     * Calling this method more than once has no further effect.
     */
    public void close() {
        pendingLine = null;
        if (br == null) {
            return;
        }
        try {
            logger.info("close twitter corpus file");
            br.close();
        } catch (IOException e) {
            logger.error("failed to close twitter corpus file", e);
        }
        br = null;
    }

    /**
     * Looks up the id of a field by its name.
     *
     * @param fieldName name to search for in {@code OMTwitterCorpusFile.FIELD_NAMES}
     * @return the index of the name within {@code OMTwitterCorpusFile.FIELD_NAMES}
     * @throws IllegalArgumentException if no field has the given name
     */
    public static int fieldNameToId(String fieldName) {
        for (int i = 0; i < OMTwitterCorpusFile.FIELD_NAMES.length; i++) {
            if (OMTwitterCorpusFile.FIELD_NAMES[i].equals(fieldName)) {
                return i;
            }
        }
        throw new IllegalArgumentException("unknown field name: " + fieldName);
    }
}