/**
 *
 */
package fna.parsing;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

import fna.parsing.character.Glossary;

/**
 * Dehyphenizes every file of a folder and writes the normalized copies to an
 * output folder.
 *
 * <p>The dehyphen() logic was moved here from VolumeDehyphenizer so that this
 * class can be used as a utility by other projects.
 *
 * <p>Work flow of {@link #dehyphen()}: check the input files for problems,
 * collect all words of the input folder in a database table, compute a
 * dehyphenized form for each hyphenated word, then rewrite every file using
 * that mapping.
 *
 * @author hongcui
 */
public class DeHyphenAFolder {
    private static final Logger LOGGER = Logger.getLogger(DeHyphenAFolder.class);

    /** Progress sink; may be null (see {@link #reportProgress(int)}). */
    private ProcessListener listener;
    private String database;
    /** Message sink; may be null (see {@link #showMessage(String)}). */
    private VolumeDehyphenizer vd;
    /** Name of the all-words table: dataPrefix + "_" + the ALLWORDS property. */
    private String tablename;
    private Glossary glossary;
    /** Folder whose files are read. */
    private File folder;
    /** Folder the rewritten files are written to. */
    private File outfolder;
    private Connection conn;
    /** Hyphenated word -> dehyphenized word, filled by {@link #dehyphen()}. */
    private Hashtable<String, String> mapping = new Hashtable<String, String>();
    private String glossarytable;

    /**
     * Sets up the input/output folders, opens the database connection and
     * (re)creates the all-words table.
     *
     * <p>A failure to connect is logged, not thrown; {@code conn} stays null in
     * that case.
     *
     * @param listener       receives progress updates; may be null
     * @param workdir        working directory; a trailing "/" is added if missing
     * @param todofoldername name of the input folder inside workdir
     * @param database       database name handed to the dehyphenizer
     * @param vd             receives user-visible messages; may be null
     * @param dataPrefix     prefix of the all-words table name
     * @param glossarytable  glossary table name handed to the dehyphenizer
     * @param glossary       glossary handed to the dehyphenizer
     */
    public DeHyphenAFolder(ProcessListener listener, String workdir,
            String todofoldername, String database, VolumeDehyphenizer vd,
            String dataPrefix, String glossarytable, Glossary glossary) {
        this.listener = listener;
        this.glossarytable = glossarytable;
        this.database = database;
        this.vd = vd;
        this.tablename = dataPrefix + "_" + ApplicationUtilities.getProperty("ALLWORDS");
        this.glossary = glossary;

        workdir = workdir.endsWith("/") ? workdir : workdir + "/";
        this.folder = new File(workdir + todofoldername);
        this.outfolder = new File(workdir + ApplicationUtilities.getProperty("DEHYPHENED"));
        if (!outfolder.exists()) {
            outfolder.mkdir();
        }

        try {
            if (conn == null) {
                Class.forName(ApplicationUtilities.getProperty("database.driverPath"));
                String URL = ApplicationUtilities.getProperty("database.url");
                conn = DriverManager.getConnection(URL);
                createAllWordsTable();
            }
        } catch (Exception e) {
            logProblem("Database is down! (VolumeDehyphenizer)", e);
        }
    }

    /**
     * Runs the whole dehyphenation of the input folder.
     *
     * @return false when the input files have problems that must be corrected
     *         first (nothing is written in that case), true otherwise
     */
    public boolean dehyphen() {
        reportProgress(1);
        showMessage("Checking files...\n");
        if (hasProblems()) {
            showMessage("");
            showMessage("Files with problems are listed above. \n");
            showMessage("Run this step again after the above identified problems are corrected.\n");
            reportProgress(0);
            return false;
        }

        showMessage("File checking completed. \n");
        reportProgress(5);
        showMessage("Pre-processing files... \n");
        fillInWords();
        reportProgress(50);

        DeHyphenizer dh = new DeHyphenizerCorrected(this.database, this.tablename,
                "word", "count", "-", this.glossarytable, glossary);
        Statement stmt = null;
        PreparedStatement update = null;
        ResultSet rs = null;
        try {
            stmt = conn.createStatement();
            // One reusable statement instead of a new, never-closed one per word.
            update = conn.prepareStatement(
                    "update " + tablename + " set dhword = ? where word = ?");
            rs = stmt.executeQuery("select word from " + tablename + " where word like '%-%'");
            while (rs.next()) {
                String word = rs.getString("word");
                // "_" instead of "-" so dhwords in the _allwords table are
                // comparable to words in _wordpos and other tables.
                String dhword = dh.normalFormat(word).replaceAll("-", "_");
                update.setString(1, dhword);
                update.setString(2, word);
                update.execute();
                mapping.put(word, dhword);
            }
            stmt.execute("update " + tablename + " set dhword=word where dhword is null");
        } catch (Exception e) {
            logProblem("Problem in VolumeDehyphenizer:dehyphen", e);
        } finally {
            closeQuietly(rs);
            closeQuietly(update);
            closeQuietly(stmt);
        }

        normalizeDocument();
        reportProgress(100);
        return true;
    }

    /**
     * Checks the files of the input folder and of its sibling folder
     * "characters" for unmatched brackets and for the expression "(?)".
     * Every problem found is reported through {@link #showMessage(String)}.
     *
     * @return true if at least one problem was found
     */
    private boolean hasProblems() {
        int problemcount = 0;
        try {
            File[][] batches = new File[][] {
                listFilesOrEmpty(folder), // description folder
                listFilesOrEmpty(new File(folder.getParentFile(), "characters"))
            };
            for (int b = 0; b < batches.length; b++) {
                for (int i = 0; i < batches[b].length; i++) {
                    problemcount += reportProblems(batches[b][i], problemcount);
                }
            }
        } catch (Exception e) {
            logProblem("Problem in VolumeDehyphenizer:check4UnmatchedBrackets", e);
        }
        return problemcount > 0;
    }

    /**
     * Checks one file and reports each problem with a running number.
     *
     * @param file            file to check
     * @param alreadyReported number of problems reported before this file
     * @return number of problems found in this file
     * @throws IOException if the file cannot be read
     */
    private int reportProblems(File file, int alreadyReported) throws IOException {
        String text = readFile(file);
        int found = 0;

        if (hasUnmatchedBrackets(text)) {
            found++;
            showMessage((alreadyReported + found) + ": " + file.getAbsolutePath()
                    + " contains unmatched brackets in \"" + text + "\"\n");
        }

        // A check for missing spaces between text and numbers used to live
        // here; it is disabled because ant descriptions contain "Mf4".

        if (text.matches(".*?\\(\\s*\\?\\s*\\).*")) {
            found++;
            showMessage((alreadyReported + found) + ": " + file.getAbsolutePath()
                    + " contains expression (?) in \"" + text + "\"\n");
            showMessage("Change (?) to an text expression such as (not certain)");
        }
        return found;
    }

    /**
     * Drops and recreates the all-words table
     * (word, count, dhword, inbrackets).
     */
    private void createAllWordsTable() {
        Statement stmt = null;
        try {
            stmt = conn.createStatement();
            stmt.execute("drop table if exists " + tablename);
            String query = "create table if not exists " + tablename
                    + " (word varchar(150) unique not null primary key, count int, dhword varchar(150), inbrackets int default 0)";
            stmt.execute(query);
        } catch (Exception e) {
            logProblem("Problem in VolumeDehyphenizer:createWordTable", e);
        } finally {
            closeQuietly(stmt);
        }
    }

    /**
     * Fills the all-words table with every word of the input folder.
     *
     * <p>For each word the occurrence count is accumulated. The inbrackets
     * flag stays 1 only while every occurrence seen so far was inside round,
     * square or curly brackets.
     */
    private void fillInWords() {
        PreparedStatement select = null;
        PreparedStatement delete = null;
        PreparedStatement insert = null;
        try {
            select = conn.prepareStatement(
                    "select word, count, inbrackets from " + tablename + " where word = ?");
            delete = conn.prepareStatement("delete from " + tablename + " where word = ?");
            insert = conn.prepareStatement(
                    "insert into " + tablename + " (word, count, inbrackets) values(?, ?, ?)");

            File[] flist = listFilesOrEmpty(folder);
            int total = flist.length;
            for (int i = 0; i < flist.length; i++) {
                String text = readFile(flist[i]);
                text = text.toLowerCase();
                text = text.replaceAll("<[^<]+?>", " "); // strip xml/html tags
                text = text.replaceAll("\\d", " ");
                text = isolateBrackets(text);

                String[] words = text.split("\\s+");
                int lround = 0;
                int lsquare = 0;
                int lcurly = 0;
                for (int j = 0; j < words.length; j++) {
                    String w = words[j].trim();
                    if (w.equals("(")) {
                        lround++;
                    } else if (w.equals(")")) {
                        lround--;
                    } else if (w.equals("[")) {
                        lsquare++;
                    } else if (w.equals("]")) {
                        lsquare--;
                    } else if (w.equals("{")) {
                        lcurly++;
                    } else if (w.equals("}")) {
                        lcurly--;
                    } else {
                        w = w.replaceAll("[^-a-z]", " ").trim();
                        if (w.matches(".*?\\w.*")) {
                            int inbracket = (lround + lsquare + lcurly > 0) ? 1 : 0;
                            int count = 1;
                            select.setString(1, w);
                            ResultSet rs = select.executeQuery();
                            try {
                                if (rs.next()) { // word already recorded
                                    count += rs.getInt("count");
                                    inbracket *= rs.getInt("inbrackets");
                                }
                            } finally {
                                rs.close();
                            }
                            delete.setString(1, w);
                            delete.execute();
                            insert.setString(1, w);
                            insert.setInt(2, count);
                            insert.setInt(3, inbracket);
                            insert.execute();
                        }
                    }
                }
                reportProgress(5 + i * 45 / total);
            }
        } catch (Exception e) {
            logProblem("Problem in VolumeDehyphenizer:fillInWords", e);
        } finally {
            closeQuietly(select);
            closeQuietly(delete);
            closeQuietly(insert);
        }
    }

    /**
     * Tells whether the numbers of opening and closing brackets differ for
     * any of the pairs [], () and {}. Only the counts are compared, not the
     * nesting order.
     *
     * @param text text to check
     * @return true if some bracket pair has different open/close counts
     */
    private boolean hasUnmatchedBrackets(String text) {
        char[] lbrackets = new char[] {'[', '(', '{'};
        char[] rbrackets = new char[] {']', ')', '}'};
        for (int i = 0; i < lbrackets.length; i++) {
            if (countChar(text, lbrackets[i]) != countChar(text, rbrackets[i])) {
                return true;
            }
        }
        return false;
    }

    /** Counts the occurrences of {@code c} in {@code text}. */
    private static int countChar(String text, char c) {
        int count = 0;
        for (int i = 0; i < text.length(); i++) {
            if (text.charAt(i) == c) {
                count++;
            }
        }
        return count;
    }

    /**
     * Completes hyphenated word parts that share a final element, as in
     * "cup-[,] disc-[,] or dish-shaped". Currently not called.
     *
     * @param broken text possibly containing broken hyphenated words
     * @return the text with the shared element appended to each dangling hyphen
     */
    @SuppressWarnings("unused")
    private String fixBrokenHyphens(String broken) {
        StringBuffer fixed = new StringBuffer();
        Pattern p = Pattern.compile("(.*?\\b)([a-z]+)-\\W[^\\.]*?[a-z]+-([a-z]+)(.*)");
        Matcher m = p.matcher(broken);
        while (m.matches()) {
            String begin = m.group(1);
            String part = broken.substring(m.start(2), m.start(3));
            broken = m.group(4);
            String fix = m.group(3);
            part = part.replaceAll("-(?!\\w)", "-" + fix);
            fixed.append(begin + part);
            m = p.matcher(broken);
        }
        fixed.append(broken);
        return fixed.toString();
    }

    /**
     * Rewrites every file of the input folder into the output folder:
     * applies the hyphen mapping and replaces sentence punctuation that
     * occurs inside brackets by bracketed markers. Case is not normalized.
     */
    private void normalizeDocument() {
        try {
            File[] flist = listFilesOrEmpty(folder);
            for (int i = 0; i < flist.length; i++) {
                String text = readFile(flist[i]);
                text = performMapping(text);
                text = isolateBrackets(text);

                // Mask punctuation inside brackets for the unsupervised
                // learning step that runs later.
                int lround = 0;
                int lsquare = 0;
                int lcurly = 0;
                StringBuffer sb = new StringBuffer();
                String[] words = text.split("\\s+");
                for (int j = 0; j < words.length; j++) {
                    String w = words[j].trim();
                    if (w.equals("(")) {
                        lround++;
                        sb.append("(");
                    } else if (w.equals(")")) {
                        lround--;
                        sb.append(")");
                    } else if (w.equals("[")) {
                        lsquare++;
                        sb.append("[");
                    } else if (w.equals("]")) {
                        lsquare--;
                        sb.append("]");
                    } else if (w.equals("{")) {
                        lcurly++;
                        sb.append("{");
                    } else if (w.equals("}")) {
                        lcurly--;
                        sb.append("}");
                    } else {
                        if (w.matches(".*?[.?;:!].*?") && (lround + lsquare + lcurly) > 0) {
                            w = w.replaceAll("\\.", "[DOT]");
                            w = w.replaceAll("\\?", "[QST]");
                            w = w.replaceAll(";", "[SQL]");
                            w = w.replaceAll(":", "[QLN]");
                            w = w.replaceAll("!", "[EXM]");
                        }
                        sb.append(w + " ");
                    }
                }
                text = sb.toString().replaceAll("\\s*\\(\\s*", "(").replaceAll("\\s*\\)\\s*", ")")
                        .replaceAll("(?<=[^0-9+�-])\\(", " (").replaceAll("\\)(?=[a-z])", ") ").trim();

                // write back, to the output folder rather than over the input file
                File outf = new File(outfolder, flist[i].getName());
                BufferedWriter out = new BufferedWriter(new FileWriter(outf));
                try {
                    out.write(text);
                } finally {
                    out.close();
                }
                showMessage(flist[i].getName() + " dehyphenized\n");
            }
        } catch (Exception e) {
            logProblem("Problem in VolumeDehyphenizer:normalizeDocument", e);
        }
    }

    /**
     * Replaces every mapped hyphenated word in the text by its dehyphenized
     * form, once as stored (lower case) and once with the first letter in
     * upper case. Words that start or end with "-" are left alone.
     *
     * <p>Words are replaced literally; they are not interpreted as regular
     * expressions.
     *
     * @param original text to rewrite
     * @return the rewritten text
     */
    private String performMapping(String original) {
        Enumeration<String> en = mapping.keys();
        while (en.hasMoreElements()) {
            String hword = en.nextElement();
            String dhword = mapping.get(hword);
            if (!hword.equals(dhword) && !hword.startsWith("-") && !hword.endsWith("-")) {
                // lower case
                original = original.replace(hword, dhword);
                // hyphen those phrases that are hyphened once
                String dhw = dhword.replace("-", " ");   // cup-shaped => cup shaped
                original = original.replace(dhw, dhword); // cup shaped => cup-shaped
                // first letter in upper case
                hword = hword.toUpperCase().substring(0, 1) + hword.substring(1);
                dhword = dhword.toUpperCase().substring(0, 1) + dhword.substring(1);
                original = original.replace(hword, dhword);
                dhw = dhword.replace("-", " ");           // Cup-shaped => Cup shaped
                original = original.replace(dhw, dhword); // Cup shaped => Cup-shaped
            }
        }
        return original;
    }

    /**
     * Surrounds every round, square and curly bracket with spaces, collapses
     * whitespace runs to one space and trims the result, so that brackets
     * become separate tokens when splitting on whitespace.
     */
    private static String isolateBrackets(String text) {
        text = text.replaceAll("\\(", " ( ");
        text = text.replaceAll("\\)", " ) ");
        text = text.replaceAll("\\[", " [ ");
        text = text.replaceAll("\\]", " ] ");
        text = text.replaceAll("\\{", " { ");
        text = text.replaceAll("\\}", " } ");
        return text.replaceAll("\\s+", " ").trim();
    }

    /**
     * Reads a file into one string. readLine() drops the line terminators,
     * so consecutive lines are joined without any separator.
     *
     * @param file file to read
     * @return the concatenated lines of the file
     * @throws IOException if the file cannot be opened or read
     */
    private static String readFile(File file) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            StringBuffer sb = new StringBuffer();
            String line = null;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
            return sb.toString();
        } finally {
            reader.close();
        }
    }

    /**
     * Lists the files of a directory. File.listFiles() returns null when the
     * path is not a directory or cannot be read; that case is logged and
     * treated as an empty directory.
     */
    private static File[] listFilesOrEmpty(File dir) {
        File[] files = dir.listFiles();
        if (files == null) {
            LOGGER.warn("Cannot list files in " + dir.getAbsolutePath());
            return new File[0];
        }
        return files;
    }

    /** Forwards a progress value to the listener, if there is one. */
    private void reportProgress(int value) {
        if (listener != null) {
            listener.progress(value);
        }
    }

    /**
     * Shows a message through the VolumeDehyphenizer; without one (as when
     * started from {@link #main(String[])}) the message goes to the log.
     */
    private void showMessage(String message) {
        if (vd != null) {
            vd.showPerlMessage(message);
        } else {
            LOGGER.info(message);
        }
    }

    /** Logs a failure and prints its stack trace. */
    private static void logProblem(String message, Exception e) {
        LOGGER.error(message, e);
        e.printStackTrace();
    }

    /** Closes a statement, ignoring null and any failure to close. */
    private static void closeQuietly(Statement statement) {
        if (statement != null) {
            try {
                statement.close();
            } catch (Exception ignored) {
                // best effort: nothing useful can be done if close fails
            }
        }
    }

    /** Closes a result set, ignoring null and any failure to close. */
    private static void closeQuietly(ResultSet resultSet) {
        if (resultSet != null) {
            try {
                resultSet.close();
            } catch (Exception ignored) {
                // best effort: nothing useful can be done if close fails
            }
        }
    }

    /**
     * @param args not used
     */
    public static void main(String[] args) {
        DeHyphenAFolder dhaf = new DeHyphenAFolder(null, "C:\\RA\\PARSER-DEMO\\Treatise\\target\\", "descriptions", "markedupdatasets", null, "treatise", "treatisehglossaryfixed", null);
        dhaf.dehyphen();
    }
}