package jhazm.reader;
import com.infomancers.collections.yield.Yielder;
import jhazm.model.Doc;
import org.apache.commons.lang3.StringUtils;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.*;
/**
* Reader interface to the [Persica Corpus](https://sourceforge.net/projects/persica/).
*
* Created by Mojtaba on 30/10/2015.
*/
public class PersicaReader {
    //
    // Constants
    //

    /** Number of fields consumed from each record: ID, title, text, date, time, category, category2. */
    private static final int FIELDS_PER_DOC = 7;

    /**
     * Unicode byte-order mark. Java's UTF-8 decoder leaves it in the decoded text and
     * {@link String#trim()} does not remove it, so it must be stripped explicitly or
     * parsing the first document ID fails.
     */
    private static final char BYTE_ORDER_MARK = '\uFEFF';

    //
    // Fields
    //

    /** Path of the Persica CSV file to read. */
    private final String persicaFile;

    //
    // Constructors
    //

    /** Creates a reader for the default corpus location, {@code resources/corpora/persica.csv}. */
    public PersicaReader() {
        this("resources/corpora/persica.csv");
    }

    /**
     * Creates a reader for the given corpus file.
     *
     * @param persicaFile path of the Persica CSV file; it is opened each time
     *                    {@link #getDocs()} or {@link #getTexts()} is called
     */
    public PersicaReader(String persicaFile) {
        this.persicaFile = persicaFile;
    }

    //
    // API
    //

    /**
     * Returns the documents of the corpus. Every call opens the corpus file anew.
     * Failures (missing file, I/O error, malformed record) are reported on standard
     * error and end the sequence; they are not thrown to the caller.
     *
     * @return an iterable over the documents parsed from the corpus file
     */
    public Iterable<Doc> getDocs() { return new YieldDoc(); }

    /**
     * Returns the text of every document of the corpus, in file order.
     *
     * @return an iterable over the {@code Text} field of each document from {@link #getDocs()}
     */
    public Iterable<String> getTexts() { return new YieldtText(); }

    //
    // Helper
    //

    private String getPersicaFile() {
        return persicaFile;
    }

    /**
     * Builds a {@link Doc} from the collected field values of one record.
     *
     * @param fields field values in file order; entries beyond the seventh are ignored
     * @return the document described by the first seven fields
     * @throws IllegalStateException if fewer than seven fields were collected
     * @throws NumberFormatException if the first field is not a valid integer ID
     */
    private static Doc buildDoc(List<String> fields) {
        if (fields.size() < FIELDS_PER_DOC) {
            throw new IllegalStateException("Malformed Persica record: expected "
                    + FIELDS_PER_DOC + " fields but found " + fields.size());
        }
        return new Doc(
                Integer.parseInt(fields.get(0)), // ID
                fields.get(1),                   // Title
                fields.get(2),                   // Text
                fields.get(3),                   // Date
                fields.get(4),                   // Time
                fields.get(5),                   // Category
                fields.get(6));                  // Category2
    }

    /**
     * Yields one {@link Doc} per record of the corpus file.
     *
     * A record is a run of non-blank lines: every line ending with a comma contributes
     * one field (trailing commas stripped), and the first line that does not end with a
     * comma contributes the last field and completes the record.
     */
    class YieldDoc extends Yielder<Doc> {
        /** Open reader on the corpus file; {@code null} if opening failed or the reader was closed. */
        private BufferedReader br;

        /** True until the first physical line has been read, so the BOM check runs only once. */
        private boolean atStart = true;

        public YieldDoc() {
            try {
                br = new BufferedReader(new InputStreamReader(
                        new FileInputStream(getPersicaFile()), StandardCharsets.UTF_8));
            }
            catch (IOException | RuntimeException ex) {
                // Best effort: report the failure and leave br null so iteration yields nothing.
                ex.printStackTrace();
            }
        }

        /**
         * Reads lines until one complete record has been collected and yields it.
         * At end of file, or after any failure, the reader is closed and nothing is yielded.
         */
        @Override
        protected void yieldNextCore() {
            if (br == null) {
                // The file could not be opened, or it has already been read to the end / closed.
                return;
            }
            try {
                List<String> fields = new ArrayList<>();
                String line;
                while ((line = br.readLine()) != null) {
                    if (atStart) {
                        atStart = false;
                        if (!line.isEmpty() && line.charAt(0) == BYTE_ORDER_MARK) {
                            line = line.substring(1);
                        }
                    }
                    line = line.trim();
                    if (line.isEmpty()) {
                        continue;
                    }
                    if (line.endsWith(",")) {
                        // Intermediate field: the trailing comma is only a separator.
                        fields.add(StringUtils.stripEnd(line, ","));
                        continue;
                    }
                    // A line without a trailing comma is the last field of the record.
                    fields.add(line);
                    yieldReturn(buildDoc(fields));
                    return;
                }
                // End of file; an incomplete trailing record, if any, is discarded.
                closeReader();
            } catch (IOException | RuntimeException ex) {
                ex.printStackTrace();
                // Do not leak the file handle when reading or parsing fails.
                closeReader();
            }
        }

        /** Closes the reader, if still open, and forgets it so later calls do not touch a closed stream. */
        private void closeReader() {
            if (br == null) {
                return;
            }
            try {
                br.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            } finally {
                br = null;
            }
        }
    }

    /** Yields the {@code Text} of each document produced by {@link #getDocs()}. */
    class YieldtText extends Yielder<String> {
        /** Source of documents; {@code null} if it could not be created. */
        private Iterator<Doc> iter;

        public YieldtText() {
            try {
                iter = getDocs().iterator();
            }
            catch (RuntimeException ex) {
                ex.printStackTrace();
            }
        }

        /** Yields the text of the next document, or nothing when the documents are exhausted. */
        @Override
        protected void yieldNextCore() {
            try {
                if (iter != null && iter.hasNext()) {
                    Doc doc = iter.next();
                    yieldReturn(doc.Text);
                }
            } catch (RuntimeException ex) {
                ex.printStackTrace();
            }
        }
    }
}