/**
*
*/
package preprocessing;
import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import outputter.search.TermSearcher;
/**
 * Reads character statements from a database table and writes them to one
 * XML file shaped as
 * {@code <treatment><character/><description/>...</treatment>}.
 *
 * <p>The XML file will be used as Type 3 source file for CharaParser; its
 * transformer is CharacterStatementsTransformer4NativeXML.
 *
 * <p>The source table must have the columns: source (pdf_charnumber), pdf,
 * charnumber, characterr, sentence.
 *
 * <p>The database connection is opened by the constructor and is not closed
 * by this class.
 *
 * @author Hong Updates
 */
public class NativeXMLSourceFileCreator {
    private static final Logger LOGGER = Logger.getLogger(NativeXMLSourceFileCreator.class);
    private static final String USERNAME = "root";
    private static final String PASSWORD = "root";
    /**
     * Sources whose value starts with one of these names are skipped
     * (the list was originally named "nonEnglish"). Compiled once instead of
     * once per row.
     */
    private static final Pattern NON_ENGLISH_SOURCE =
            Pattern.compile("(Bockmann 1998|Di Dario 1999|Shibatta 1998).*");

    /** Output directory; the XML file is written inside it. */
    private final File output;
    /** Connection to the MySQL database, or null when connecting failed. */
    private final Connection conn;
    /** Fully qualified table name: tableprefix + "_" + tablename. */
    private final String sourcetable;

    /**
     * Prepares the output directory (creating it if missing, otherwise
     * deleting the entries directly inside it) and connects to the MySQL
     * database on localhost.
     *
     * <p>Connection failures are logged, not thrown; a later call to
     * {@link #outputXMLFile()} then logs an error and does nothing.
     *
     * @param tablename   table name without prefix
     * @param outputdir   directory that will receive the XML file
     * @param database    name of the MySQL database on localhost
     * @param tableprefix prefix joined to {@code tablename} with "_"
     */
    public NativeXMLSourceFileCreator(String tablename, String outputdir, String database, String tableprefix) {
        this.output = new File(outputdir);
        prepareOutputDirectory();
        this.sourcetable = tableprefix + "_" + tablename;

        Connection connection = null;
        try {
            Class.forName("com.mysql.jdbc.Driver");
            // Credentials are passed as arguments rather than embedded in the URL.
            connection = DriverManager.getConnection("jdbc:mysql://localhost/" + database, USERNAME, PASSWORD);
        } catch (Exception e) {
            LOGGER.error("Could not connect to database '" + database + "'", e);
        }
        this.conn = connection;
    }

    /**
     * Reads every distinct source from the source table and writes one XML
     * file named {@code <output directory name>.xml} into the output
     * directory. For each source one {@code character} element is emitted,
     * followed by one {@code description} element per distinct sentence.
     *
     * <p>Element ids follow the pattern sourcefilename + ".txtp" + additionalID.
     * Any failure is logged; if reading from the database fails no file is
     * written.
     */
    public void outputXMLFile() {
        if (conn == null) {
            LOGGER.error("No database connection; cannot create the XML source file");
            return;
        }
        String srcf = output.getName();
        // save all in one file to reduce I/O overhead for subsequent process
        Element root = new Element("treatment");

        Statement sourceStmt = null;
        ResultSet sources = null;
        PreparedStatement charStmt = null;
        try {
            sourceStmt = conn.createStatement();
            sources = sourceStmt.executeQuery("select distinct source from " + this.sourcetable);
            // Bound parameter: a source containing a quote no longer breaks the query.
            charStmt = conn.prepareStatement(
                    "select distinct characterr, sentence from " + this.sourcetable + " where source=?");
            while (sources.next()) {
                String src = sources.getString("source");
                if (src == null || NON_ENGLISH_SOURCE.matcher(src).matches()) {
                    continue;
                }
                appendSource(root, charStmt, srcf, src);
                System.out.println("Write " + src);
            }
        } catch (Exception e) {
            // Boundary catch: this method has never propagated exceptions to callers.
            LOGGER.error("Failed to read character statements from " + this.sourcetable, e);
            return;
        } finally {
            closeQuietly(sources);
            closeQuietly(charStmt);
            closeQuietly(sourceStmt);
        }

        writeDocument(root);
    }

    /**
     * Creates the output directory when it does not exist; otherwise deletes
     * the entries directly inside it (non-recursive, as before).
     */
    private void prepareOutputDirectory() {
        if (!output.exists()) {
            if (!output.mkdirs()) {
                LOGGER.warn("Could not create output directory " + output);
            }
            return;
        }
        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] existing = output.listFiles();
        if (existing == null) {
            LOGGER.warn("Output path is not a readable directory: " + output);
            return;
        }
        for (int i = 0; i < existing.length; i++) {
            if (!existing[i].delete()) {
                LOGGER.warn("Could not delete " + existing[i]);
            }
        }
    }

    /**
     * Appends the character element and the description elements of one
     * source to {@code root}.
     */
    private void appendSource(Element root, PreparedStatement charStmt, String srcf, String src)
            throws SQLException {
        // this is how ids in each tag should be set: sourcefilename+".txtp"+additionalID
        String pid = srcf + ".txtp" + src.trim().replaceAll("\\s+", "_");
        ResultSet rows = null;
        try {
            charStmt.setString(1, src);
            rows = charStmt.executeQuery();
            boolean characterWritten = false;
            int count = 1;
            while (rows.next()) { // one character + n sentences
                if (!characterWritten) {
                    Element chara = new Element("character");
                    chara.setAttribute("pid", pid);
                    chara.setText(trimOrEmpty(rows.getString("characterr")));
                    root.addContent(chara);
                    characterWritten = true;
                }
                Element descr = new Element("description");
                descr.setAttribute("pid", pid + "_s" + count);
                count++;
                descr.setText(trimOrEmpty(rows.getString("sentence")));
                root.addContent(descr);
            }
        } finally {
            closeQuietly(rows);
        }
    }

    /** Pretty-prints {@code root} as an XML document into the output directory. */
    private void writeDocument(Element root) {
        File target = new File(this.output, output.getName() + ".xml");
        OutputStream stream = null;
        try {
            stream = new BufferedOutputStream(new FileOutputStream(target));
            XMLOutputter out = new XMLOutputter(Format.getPrettyFormat());
            out.output(new Document(root), stream);
        } catch (Exception e) {
            LOGGER.error("Failed to write " + target, e);
        } finally {
            closeQuietly(stream);
        }
    }

    /** Returns the trimmed value, or an empty string for a SQL NULL. */
    private static String trimOrEmpty(String value) {
        return value == null ? "" : value.trim();
    }

    /** Closes a result set, logging instead of throwing on failure. */
    private static void closeQuietly(ResultSet rs) {
        if (rs == null) {
            return;
        }
        try {
            rs.close();
        } catch (SQLException e) {
            LOGGER.warn("Could not close result set", e);
        }
    }

    /** Closes a statement, logging instead of throwing on failure. */
    private static void closeQuietly(Statement stmt) {
        if (stmt == null) {
            return;
        }
        try {
            stmt.close();
        } catch (SQLException e) {
            LOGGER.warn("Could not close statement", e);
        }
    }

    /** Closes a stream, logging instead of throwing on failure. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            LOGGER.warn("Could not close stream", e);
        }
    }

    /**
     * Runs the export. Without arguments the built-in defaults below are
     * used; each argument that is present overrides the matching default.
     *
     * @param args optional: tablename, outputdir, database, tableprefix
     *             (same order as the constructor)
     */
    public static void main(String[] args) {
        //the sourcetable must have:source(pdf_charnumber), pdf, charnumber, characterr, sentence columns
        // Earlier configurations, kept for reference:
        //   output = "Z:\\DATA\\phenoscape\\subcontract\\core-fish\\fish\\source", tableprefix = "fish"
        //   output = "Z:\\DATA\\phenoscape\\subcontract\\archosaur\\source", tableprefix = "archosaur"
        //   output = "Z:\\DATA\\phenotype\\source", tableprefix = "phenotype"
        String source = args.length > 0 ? args[0] : "original";
        String output = args.length > 1 ? args[1] : "C:\\Users\\Zilong Chang\\Documents\\WORK\\amphibia\\source";
        String database = args.length > 2 ? args[2] : "phenoscape";
        String tableprefix = args.length > 3 ? args[3] : "pheno_amphibia";
        NativeXMLSourceFileCreator sfc = new NativeXMLSourceFileCreator(source, output, database, tableprefix);
        sfc.outputXMLFile();
    }
}