package fna.parsing;
//
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.sql.ResultSet;
import java.util.Hashtable;
import org.apache.log4j.Logger;
import fna.parsing.character.Glossary;
/**
* run through a column in a database, remove hyphens that separate one word
* @author hongcui
*
*/
public class DeHyphenizer {
private String tablename;
private String columnname;
private String countcolumn;
private String hyphen;
static private Connection conn = null;
//static private String username = ApplicationUtilities.getProperty("database.username");
//static private String password = ApplicationUtilities.getProperty("database.password");
private static final Logger LOGGER = Logger.getLogger(DeHyphenizer.class);
private String glossTable = null;
private Glossary glossary;
public DeHyphenizer(String database, String table, String column, String countcolumn, String hyphen, String glosstable, Glossary glossary) {
this.glossTable = glosstable;
this.glossary = glossary;
try{
if(conn == null){
Class.forName(ApplicationUtilities.getProperty("database.driverPath"));
String URL = ApplicationUtilities.getProperty("database.url");
conn = DriverManager.getConnection(URL);
}
}catch(Exception e){
e.printStackTrace();
}
this.tablename = table;
this.columnname = column;
this.countcolumn = countcolumn;
this.hyphen = hyphen;
}
public void deHyphen(){
try{
Statement stmt = conn.createStatement();
ResultSet rs = stmt.executeQuery("select "+columnname+" from "+tablename+" where "+columnname+" like '%\\"+hyphen+"%'");
while(rs.next()){
String hyphenedterm = rs.getString(1);
String[] parts = hyphenedterm.split("\\s+");
String term = "";
for(int i = 0; i<parts.length; i++){
term += normalFormat(parts[i])+" ";
}
term = term.trim();
if(term.compareTo(hyphenedterm) != 0){
//System.out.println("\nold term: "+hyphenedterm);
//System.out.println("new term: "+term);
updateTable(hyphenedterm, term);
}
}
}catch(Exception e){
e.printStackTrace();
LOGGER.error("Encountered a problem in DeHyphenizer:deHyphen", e);
}
}
public String normalFormat(String hyphened){
/*if(hyphened.startsWith("NUM-")){
return hyphened;
}*/
hyphened = hyphened.replaceFirst("^_", "");
String[] segs = hyphened.split(this.hyphen);
String[] terms = new String[segs.length];
int[][] matrix = new int[segs.length][segs.length];
//fill matrix
fillMatrix(segs, matrix);
collectTerms(segs, terms, matrix);
//out put term
String term = "";
for(int i =0; i<terms.length; i++){
if(terms[i] != null){
term += terms[i]+"-";
}
}
return term.replaceFirst("-$", "");
}
protected void collectTerms(String[] segs, String[] terms, int[][] matrix) {
//rank rows
int max = 0;
Hashtable<String, String> rank = new Hashtable<String, String>();
for(int i = 0; i < segs.length; i++){
int ones = countOnes(matrix[i]);
if(ones > max){
max = ones;
}
String list = (String)rank.get(ones+"");
if(list == null){
rank.put(ones+"", i+"");
}else{
rank.put(ones+"", list+" "+i+"");
}
}
//collect terms
String checked="-";
for(int i = 0; i <= max; i++){
String list = (String)rank.get(i+"");
if(list!= null && i == 0){//term not see in learned or glossary, and not connectable to other terms
String[] indexes = list.split(" ");
for(int j = 0; j < indexes.length; j++){
int ind = Integer.parseInt(indexes[j]);
terms[ind] = segs[ind];
}
}else if(list!=null){
String[] indexes = list.split(" ");
for(int j = 0; j < indexes.length; j++){
int ind = Integer.parseInt(indexes[j]);
if(checked.indexOf("-"+ind+"-")<0){
int lastk = otherEndIndex(matrix[ind], ind); //last index of 1 in the row
terms[ind] = ind > lastk? formTerm(segs, lastk, ind) : formTerm(segs, ind, lastk);
checked += ind > lastk? formString(lastk, ind, "-") : formString(ind, lastk, "-");
}
}
}
}
}
private void fillMatrix(String[] segs, int[][] matrix) {
for(int i = 0; i < segs.length; i++){
for(int j = i; j<segs.length; j++){
matrix[i][j] = isTerm(segs, i, j);
matrix[j][i] = matrix[i][j];
}
}
}
protected String formString(int start, int end, String connector){
String str = "";
for(int i = start; i<=end; i++){
str +=i+connector;
}
return str;
}
protected String formTerm(String[] segs, int start, int end){
String str="";
for(int i = start; i<=end; i++){
str +=segs[i];
}
return str;
}
private int otherEndIndex(int [] array, int theotherindex){
//boolean self = false;
int index = -1;
for(int i = 0; i< array.length; i++){
if(array[i]==1 && i == theotherindex){
//self = true;
}else if(array[i]==1){//take the greatest index, may cause problem here.
index = i;
}
}
return index==-1? theotherindex : index;
}
private int countOnes(int[] array){
int count = 0;
for(int i = 0; i< array.length; i++){
if(array[i]==1){
count++;
}
}
return count;
}
private int isTerm(String[] segs, int start, int end){
String str=formTerm(segs, start, end);
if(glossary!=null && stringMatchInGloss(str)){
return 1;
}
if(stringMatch(str)){
return 1;
}
return 0;
}
private void updateTable(String oldt, String newt){
try{
Statement stmt = conn.createStatement();
if(countcolumn == null ||countcolumn.trim().compareTo("") == 0){//without count
ResultSet rs = stmt.executeQuery("select "+columnname+" from "+tablename+" where "+columnname+" = '"+newt+"'");
if(!rs.next()){//newt not exist
stmt.execute("update "+tablename+" set "+columnname+" = '"+newt+"' where "+columnname+" = '"+oldt+"'");
//System.out.println("no count, update old with new ");
}else{
stmt.execute("delete from "+tablename+" where "+columnname+" = '"+oldt+"'");
//System.out.println("no count, new exists, remove old");
}
}else{//with count
int total = 0;
boolean newexist = false;
ResultSet rs = stmt.executeQuery("select "+countcolumn+" from "+tablename+" where "+columnname+" = '"+oldt+"'");
if(rs.next()){
total += rs.getInt(1);
}
rs = stmt.executeQuery("select "+countcolumn+" from "+tablename+" where "+columnname+" = '"+newt+"'");
if(rs.next()){
total += rs.getInt(1);
newexist = true;
}
if(newexist){
stmt.execute("delete from "+tablename+" where "+columnname+" = '"+oldt+"'");
stmt.execute("update "+tablename+" set "+countcolumn+" = "+ total+" where "+columnname+" = '"+newt+"'");
//System.out.println("count, new exist, remove old, update count for new: "+total);
}else{
String query = "update "+tablename+" set "+countcolumn+" = "+ total+", "+columnname+" = '"+newt+"' where "+columnname+" = '"+oldt+"'";
stmt.execute(query);
//System.out.println("count, update old with new with count: "+total);
}
}
}catch(Exception e){
e.printStackTrace();
LOGGER.error("Encountered a problem in DeHyphenizer:updateTable", e);
}
}
public boolean stringMatchInGloss(String term){
boolean find = false;
try{
Statement stmt = conn.createStatement();
String query = "select term from "+this.glossTable+" where term like '% "+term+" %' or term like '"+term+" %' or term like '% "+term+"' or term like '"+term+"'";
ResultSet rs = stmt.executeQuery(query);
while(rs.next()){
find = true;
}
}catch(Exception e){
e.printStackTrace();
LOGGER.error("Encountered a problem in DeHyphenizer:stringMatchInGloss", e);
}
return find;
}
public boolean stringMatch(String term){
boolean find = false;
try{
Statement stmt = conn.createStatement();
String query = "select "+this.columnname+" from "+tablename+" where "+this.columnname+" like '% "+term+" %' or "+this.columnname+" like '"+term+" %' or "+this.columnname+" like '% "+term+"' or "+this.columnname+" like '"+term+"'";
ResultSet rs = stmt.executeQuery(query);
while(rs.next()){
find = true;
}
}catch(Exception e){
e.printStackTrace();
LOGGER.error("Encountered a problem in DeHyphenizer:stringMatch", e);
}
return find;
}
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
DeHyphenizer dh = new DeHyphenizer("fnav5_corpus", "learnedstates_copy", "state", "count", "_", "fna", null);
dh.deHyphen();
}
}