/**
*
*/
package preprocessing;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.Statement;
import org.apache.log4j.Logger;
import outputter.search.TermSearcher;
/**
 * Reads character descriptions from the text files of a source directory,
 * stripping off the state numbering, and saves the state sentences in the
 * {@code <tableprefix>_sentence} table and the characters in the
 * {@code <tableprefix>_character} table. The cleaned sentences are also
 * concatenated and written to a single output text file.
 *
 * @author Hong Updates
 */
public class CharacterDescriptionReader {
    private File source;
    private File output;
    private Connection conn;
    private String tableprefix;
    // NOTE(review): credentials are hard-coded in source; consider moving them to configuration.
    private static String username="phenoscape";
    private static String password="pheno!scape";
    private static String statementtable = "sentence";
    private static String charactertable = "character";
    /** Accumulates the cleaned sentences ("; "-separated) that read() writes to the output file. */
    private static StringBuffer text = new StringBuffer();
    private static final Logger LOGGER = Logger.getLogger(CharacterDescriptionReader.class);

    /**
     * Next ids to assign. They are kept on the instance (not per file) so that
     * ids stay unique when the source directory holds more than one file; both
     * tables declare their id column as primary key.
     */
    private int nextCharid = 1;
    private int nextSentid = 1;

    /**
     * Opens the MySQL connection and (re)creates the sentence and character
     * tables, dropping any existing tables with the same prefix.
     * Failures are logged; the connection is then left unset and later inserts
     * are skipped.
     *
     * @param source      directory holding the description text files
     * @param output      text file that receives the concatenated cleaned sentences
     * @param database    name of the MySQL database on localhost
     * @param tableprefix prefix of the two tables; it is concatenated into the
     *                    SQL as an identifier, so it must come from a trusted source
     */
    public CharacterDescriptionReader(String source, String output, String database, String tableprefix) {
        this.source = new File(source);
        this.output = new File(output);
        this.tableprefix = tableprefix;
        Statement stmt = null;
        try{
            Class.forName("com.mysql.jdbc.Driver");
            String URL = "jdbc:mysql://localhost/"+database+"?user="+username+"&password="+password;
            conn = DriverManager.getConnection(URL);
            stmt = conn.createStatement();
            stmt.execute("drop table if exists "+tableprefix+"_"+statementtable);
            stmt.execute("create table if not exists "+tableprefix+"_"+statementtable+" (sentid int(11) not null unique, source varchar(500), sentence text, originalsent text, lead varchar(2000), status varchar(20), tag varchar(500),modifier varchar(150), charid int(11), primary key (sentid)) engine=innodb");
            stmt.execute("drop table if exists "+tableprefix+"_"+charactertable);
            stmt.execute("create table if not exists "+tableprefix+"_"+charactertable+" (charid int(11) not null unique, source varchar(500), characterr varchar(500), primary key (charid)) engine=innodb");
        }catch(Exception e){
            LOGGER.error("", e);
        }finally{
            closeQuietly(stmt);
        }
    }

    /**
     * Reads every regular file of the source directory and then writes the
     * accumulated cleaned sentences to the output file.
     *
     * Input format: blank-line separated descriptions, each containing 1 character
     * followed by N numbered character-state statements.
     * Ex:
     * 93. Development of gas bladder (ORDERED). (CI =
     * 0.333, RI = 0.879)
     * 0: Gas bladder not reduced, with large anterior and poste-
     * rior chambers.
     * 1: Gas bladder somewhat reduced, with large anterior
     * chamber and small posterior chambers. (Chiloglanis sp.
     * 'kalungwishi'; Chiloglanis sp. 'burundi'; Atopochilus;
     * Euchilichthys)
     * 2: Gas bladder greatly reduced, with small anterior chamber
     * only. (Amphiliidae; all Chiloglanis except C. macropterus
     * [0], Chiloglanis sp. 'kalungwishi' [1] and Chiloglanis sp.
     * 'burundi' [1]; Atopodontus)
     */
    public void read(){
        File[] files = source.listFiles();
        if(files == null){
            // listFiles() returns null when the path is not a directory or cannot be read
            LOGGER.error("Cannot list files in source directory: "+source);
            return;
        }
        for(int i = 0; i<files.length; i++){
            if(files[i].isFile()){
                readfile(files[i]);
            }
        }
        FileWriter wrt = null;
        try{
            wrt = new FileWriter(output);
            wrt.append(CharacterDescriptionReader.text.toString());
            wrt.flush();
        }catch(Exception e){
            LOGGER.error("", e);
        }finally{
            if(wrt != null){
                try{
                    wrt.close();
                }catch(Exception e){
                    LOGGER.error("", e);
                }
            }
        }
    }

    /**
     * Releases the database connection. Safe to call more than once.
     */
    public void close(){
        if(conn != null){
            try{
                conn.close();
            }catch(Exception e){
                LOGGER.error("", e);
            }
            conn = null;
        }
    }

    /**
     * Parses one description file.
     *
     * The parser has two modes. In character mode (at the start of the file and
     * after every blank line) lines are collected as the character text. The
     * first line of the form "digits:" ends character mode: the character is
     * inserted once, and that line starts the first state sentence. Every later
     * "digits:" line, a blank line, or the end of the file completes the
     * sentence collected so far, which is stored with the id of its own character.
     *
     * @param f the file to parse; it is read with the platform default charset
     */
    private void readfile (File f){
        BufferedReader buff = null;
        try{
            buff = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
            String source = f.getName();
            StringBuffer ch = new StringBuffer();
            StringBuffer sent = new StringBuffer();
            boolean readingCharacter = true;
            int charid = -1; // id of the character the current sentences belong to
            String s;
            while((s = buff.readLine())!=null){
                String trimmed = s.trim();
                if(trimmed.length()==0){
                    // end of a description: store its last sentence before resetting
                    flushSentence(sent, source, charid);
                    ch.setLength(0);
                    readingCharacter = true;
                }else if(trimmed.matches("\\d+:.*")){
                    if(readingCharacter){
                        // first state of this description: the character text is complete
                        charid = nextCharid++;
                        insertCharacter(ch.toString().trim(), charid, source);
                        readingCharacter = false;
                    }else{
                        flushSentence(sent, source, charid);
                    }
                    sent.append(s).append(" ");
                }else if(readingCharacter){
                    ch.append(s).append(" ");
                }else{
                    // continuation line of the current state sentence
                    sent.append(s).append(" ");
                }
            }
            // the file may end without a trailing blank line
            flushSentence(sent, source, charid);
        }catch(Exception e){
            LOGGER.error("", e);
        }finally{
            if(buff != null){
                try{
                    buff.close();
                }catch(Exception e){
                    LOGGER.error("", e);
                }
            }
        }
    }

    /**
     * Stores the sentence collected in the buffer, if any, and empties the buffer.
     */
    private void flushSentence(StringBuffer sent, String source, int charid){
        String sentence = sent.toString().trim();
        if(sentence.length()>0){
            insertSentence(sentence, nextSentid++, source, charid);
        }
        sent.setLength(0);
    }

    /**
     * Appends the cleaned sentence to the output text and inserts one row into
     * the sentence table. Cleaning removes the leading "digits:" state number
     * and every parenthesized span.
     *
     * @param sentence the original statement, including its state number
     * @param sentid   primary key of the new row
     * @param source   name of the file the statement came from
     * @param charid   id of the character the statement belongs to
     */
    private void insertSentence(String sentence, int sentid, String source, int charid) {
        String clean = sentence.replaceFirst("^\\d+:", "").replaceAll("\\([^)]*\\)", "").trim();
        CharacterDescriptionReader.text.append(clean.replaceFirst("\\W\\s*$", "")+"; ");
        if(conn == null){
            LOGGER.error("No database connection; sentence "+sentid+" not saved");
            return;
        }
        PreparedStatement stmt = null;
        try{
            // values are bound as parameters so quotes in the text cannot break the statement
            stmt = conn.prepareStatement("insert into "+this.tableprefix+"_"+statementtable+" (sentid, source, sentence, originalsent, charid) values (?,?,?,?,?)");
            stmt.setInt(1, sentid);
            stmt.setString(2, source);
            stmt.setString(3, clean);
            stmt.setString(4, sentence);
            stmt.setInt(5, charid);
            stmt.executeUpdate();
        }catch(Exception e){
            LOGGER.error("", e);
        }finally{
            closeQuietly(stmt);
        }
    }

    /**
     * Inserts one row into the character table.
     *
     * @param character the character text
     * @param charid    primary key of the new row
     * @param source    name of the file the character came from
     */
    private void insertCharacter(String character, int charid, String source) {
        if(conn == null){
            LOGGER.error("No database connection; character "+charid+" not saved");
            return;
        }
        PreparedStatement stmt = null;
        try{
            stmt = conn.prepareStatement("insert into "+this.tableprefix+"_"+charactertable+" (charid, source, characterr) values (?,?,?)");
            stmt.setInt(1, charid);
            stmt.setString(2, source);
            stmt.setString(3, character);
            stmt.executeUpdate();
        }catch(Exception e){
            LOGGER.error("", e);
        }finally{
            closeQuietly(stmt);
        }
    }

    /**
     * Closes a statement, logging instead of propagating any failure.
     */
    private static void closeQuietly(Statement stmt){
        if(stmt != null){
            try{
                stmt.close();
            }catch(Exception e){
                LOGGER.error("", e);
            }
        }
    }

    /**
     * Runs the reader on a fixed source directory, output file, database and table prefix.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        String source = "Z:\\DATA\\phenoscape\\text";
        String output = "Z:\\DATA\\phenoscape\\descriptions\\vigliotta_2008.txt";
        String database = "phenoscape";
        String tableprefix = "test";
        CharacterDescriptionReader cdr = new CharacterDescriptionReader(source, output, database, tableprefix);
        try{
            cdr.read();
        }finally{
            cdr.close();
        }
    }
}