package jhazm.reader;
import com.infomancers.collections.yield.Yielder;
import jhazm.model.Document;
import jhazm.utility.RegexPattern;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Reader for the Hamshahri Corpus (http://ece.ut.ac.ir/dbrg/hamshahri/files/HAM2/Corpus.zip),
* which must be downloaded and extracted before use.
*
* @author Mojtaba Khallash
*/
public class HamshahriReader {
    /** File names that are skipped when walking the corpus folder (compared against {@link File#getName()}). */
    private final String[] invalidFiles = new String[] {
        "hamshahri.dtd", "HAM2-960622.xml", "HAM2-960630.xml", "HAM2-960701.xml", "HAM2-960709.xml",
        "HAM2-960710.xml", "HAM2-960711.xml", "HAM2-960817.xml", "HAM2-960818.xml", "HAM2-960819.xml",
        "HAM2-960820.xml", "HAM2-961019.xml", "HAM2-961112.xml", "HAM2-961113.xml", "HAM2-961114.xml",
        "HAM2-970414.xml", "HAM2-970415.xml", "HAM2-970612.xml", "HAM2-970614.xml", "HAM2-970710.xml",
        "HAM2-970712.xml", "HAM2-970713.xml", "HAM2-970717.xml", "HAM2-970719.xml", "HAM2-980317.xml",
        "HAM2-040820.xml", "HAM2-040824.xml", "HAM2-040825.xml", "HAM2-040901.xml", "HAM2-040917.xml",
        "HAM2-040918.xml", "HAM2-040920.xml", "HAM2-041025.xml", "HAM2-041026.xml", "HAM2-041027.xml",
        "HAM2-041230.xml", "HAM2-041231.xml", "HAM2-050101.xml", "HAM2-050102.xml", "HAM2-050223.xml",
        "HAM2-050224.xml", "HAM2-050406.xml", "HAM2-050407.xml", "HAM2-050416.xml"
    };

    /** Pattern applied to every document body before it is stored (see {@code yieldNextCore}). */
    private final RegexPattern paragraphPattern;

    /** Folder that is scanned recursively for corpus files. */
    private final String rootFolder;

    /** Creates a reader rooted at the default location {@code resources/corpora/hamshahri}. */
    public HamshahriReader() {
        this("resources/corpora/hamshahri");
    }

    /**
     * Creates a reader rooted at the given folder.
     *
     * @param root path of the folder that contains the extracted corpus files
     */
    public HamshahriReader(String root) {
        this.rootFolder = root;
        // NOTE(review): presumably RegexPattern replaces each match with the second argument,
        // i.e. adds an extra newline after short (<= 50 chars) lines - confirm against RegexPattern.
        this.paragraphPattern = new RegexPattern("(\n.{0,50})(?=\n)", "$1\n");
    }

    /** @return the pattern used to refine document bodies */
    public RegexPattern getParagraphPattern() {
        return paragraphPattern;
    }

    /** @return the folder this reader scans for corpus files */
    public String getRootFolder() {
        return rootFolder;
    }

    /**
     * @return the names of the files that are skipped; the list is a fixed-size view
     *         backed by the internal array
     */
    public List<String> getInvalidFiles() {
        return Arrays.asList(invalidFiles);
    }

    /**
     * @return an iterable over the documents found in the files below the root folder;
     *         the file list is collected when this method is called
     */
    public Iterable<Document> getDocuments() {
        return new YieldDocument();
    }

    /**
     * Produces one {@link Document} per {@code DOC} element, file by file.
     * Each call to {@link #yieldNextCore()} handles at most one {@code DOC} element.
     */
    class YieldDocument extends Yielder<Document> {
        /** Absolute paths of all regular files found below the root folder. */
        private final List<String> allFiles;
        /** Index into {@link #allFiles} of the file currently being read; -1 before the first file. */
        private int index;
        /** Number of DOC elements in the currently parsed file. */
        private int docLength;
        /** Index of the DOC element handled last in the currently parsed file. */
        private int docIndex;
        /** DOC elements of the currently parsed file. */
        private NodeList docNodes;

        public YieldDocument() {
            allFiles = fileList(getRootFolder());
            index = -1;
            docLength = 0;
            docIndex = -1;
            docNodes = null;
        }

        /** Collects the files below {@code path}; see {@link #fileList(File)}. */
        private List<String> fileList(String path) {
            return fileList(new File(path));
        }

        /**
         * Recursively collects the absolute paths of all regular files below {@code dir}.
         *
         * @param dir directory to walk
         * @return paths of the files in {@code dir} and all of its sub-directories; empty when
         *         {@code dir} cannot be listed
         */
        private List<String> fileList(File dir) {
            List<String> paths = new ArrayList<>();
            File[] entries = dir.listFiles();
            if (entries == null) {
                // listFiles() returns null when dir is not a directory or an I/O error occurs.
                System.err.println("HamshahriReader: cannot list files in " + dir);
                return paths;
            }
            for (File entry : entries) {
                if (entry.isDirectory()) {
                    // Merge the sub-directory's files instead of returning them, so files
                    // already collected and the remaining entries are not lost.
                    paths.addAll(fileList(entry));
                } else {
                    paths.add(entry.getAbsolutePath());
                }
            }
            return paths;
        }

        /**
         * Handles the next DOC element: continues with the current file while it has
         * unread DOC elements, otherwise advances to the next file whose name is not in
         * the invalid list and parses it. Calls {@code yieldBreak()} when no files are left.
         * Any exception raised while parsing or converting is printed and no document is
         * yielded for this call.
         */
        @Override
        protected void yieldNextCore() {
            // The current file is still "open" while it has DOC elements that were not handled yet.
            boolean isOpen = docLength > 0 && docIndex + 1 < docLength;
            File file = null;
            if (!isOpen) {
                // Advance to the next file that is not listed as invalid.
                do {
                    index++;
                    if (index >= allFiles.size()) {
                        yieldBreak();
                        return;
                    }
                    file = new File(allFiles.get(index));
                } while (getInvalidFiles().contains(file.getName()));
            }

            try {
                if (!isOpen) {
                    DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
                    DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
                    org.w3c.dom.Document xml = dBuilder.parse(file);
                    xml.getDocumentElement().normalize();
                    docNodes = xml.getElementsByTagName("DOC");
                    docIndex = -1;
                    docLength = docNodes.getLength();
                }

                docIndex++;
                Element docElement = (Element) docNodes.item(docIndex);
                yieldReturn(toDocument(docElement));
            } catch (Exception ex) {
                // Parse failures and malformed/missing elements are reported and skipped.
                ex.printStackTrace();
            }
        }

        /**
         * Converts one DOC element into a {@link Document}.
         *
         * @param docElement the DOC element to convert
         * @return the populated document
         * @throws NullPointerException if {@code docElement} is null or a required child
         *         element (DOCID, DOCNO, ORIGINALFILE, ISSUE, TITLE, TEXT) is missing
         */
        private Document toDocument(Element docElement) {
            Element docId = firstElement(docElement, "DOCID");
            Element docNo = firstElement(docElement, "DOCNO");
            Element originalFile = firstElement(docElement, "ORIGINALFILE");
            Element issue = firstElement(docElement, "ISSUE");
            Element title = firstElement(docElement, "TITLE");

            // DATE elements are told apart by their "calender" attribute (spelling as in the data).
            NodeList dates = docElement.getElementsByTagName("DATE");
            String westernDate = "";
            String persianDate = "";
            for (int i = 0; i < dates.getLength(); i++) {
                Element date = (Element) dates.item(i);
                switch (date.getAttribute("calender")) {
                    case "Western":
                        westernDate = date.getTextContent();
                        break;
                    case "Persian":
                        persianDate = date.getTextContent();
                        break;
                }
            }

            // CAT elements are told apart by their "xml:lang" attribute.
            NodeList categories = docElement.getElementsByTagName("CAT");
            String englishCategory = "";
            String persianCategory = "";
            for (int i = 0; i < categories.getLength(); i++) {
                Element category = (Element) categories.item(i);
                switch (category.getAttribute("xml:lang")) {
                    case "en":
                        englishCategory = category.getTextContent();
                        break;
                    case "fa":
                        persianCategory = category.getTextContent();
                        break;
                }
            }

            // Refine text: apply the paragraph pattern, then drop a leading "o " from lines.
            Element text = firstElement(docElement, "TEXT");
            String body = text.getTextContent();
            body = getParagraphPattern().apply(body).replace("\no ", "\n");

            Document document = new Document();
            document.setID(docId.getTextContent());
            document.setNumber(docNo.getTextContent());
            document.setOriginalFile(originalFile.getTextContent());
            document.setIssue(issue.getTextContent());
            document.setWesternDate(westernDate);
            document.setPersianDate(persianDate);
            document.setEnglishCategory(englishCategory);
            document.setPersianCategory(persianCategory);
            document.setTitle(title.getTextContent());
            document.setBody(body);
            return document;
        }

        /** Returns the first descendant of {@code parent} with the given tag name, or null if none. */
        private Element firstElement(Element parent, String tag) {
            return (Element) parent.getElementsByTagName(tag).item(0);
        }
    }
}