package fna.parsing; import java.util.ArrayList; import org.apache.log4j.Logger; import org.eclipse.swt.SWT; import org.eclipse.swt.widgets.Display; import org.eclipse.swt.widgets.Table; import org.eclipse.swt.widgets.TableItem; import org.eclipse.swt.widgets.Text; import fna.db.VolumeMarkupDbAccessor; import fna.parsing.character.Glossary; //import fna.parsing.finalizer.Output; /** * normalize hyphens in the document folder. may be plain text or html/xml docs. for the latter, tags are ignored in dehyphenization process. * run this before VolumeMarkup. * @author hongcui * */ @SuppressWarnings({ "unused","static-access" }) public class VolumeDehyphenizer extends Thread { //protected File folder = null; //protected File outfolder = null; //protected Connection conn = null; //protected final String username = ApplicationUtilities.getProperty("database.username"); //protected final String password = ApplicationUtilities.getProperty("database.password"); //protected String tablename = null; //private final String tablename1= "numtextmix"; //private final int mixlength = 30; //static public String num = "\\d[^a-z]+"; protected String database = ""; //protected Hashtable<String,String> mapping = new Hashtable<String, String>(); protected ProcessListener listener; private static final Logger LOGGER = Logger.getLogger(VolumeDehyphenizer.class); private Glossary glossary = null; // TODO private Display display; private Text perlLog; private String dataPrefix; private DeHyphenAFolder dhf; private Table descriptorTable; private MainForm mainForm; private VolumeMarkupDbAccessor vmdb; private String glossaryTableName; public VolumeDehyphenizer(ProcessListener listener, String workdir, String todofoldername, String database, Display display, Text perlLog, String dataPrefix, MainForm mainForm) { this.listener = listener; /** Synchronizing UI and background process **/ this.display = display; this.perlLog = perlLog; this.dataPrefix = dataPrefix; this.mainForm = mainForm; this.glossaryTableName = mainForm.glossaryPrefixCombo.getText(); this.vmdb = new VolumeMarkupDbAccessor(dataPrefix, this.glossaryTableName); this.glossary = new Glossary(this.glossaryTableName); //dehypen step is not needed for NeXML files //this.dhf = new DeHyphenAFolder(listener,workdir,todofoldername, database, this, dataPrefix, this.glossaryTableName, glossary); } public void run () { listener.setProgressBarVisible(true); //boolean done = dhf.dehyphen();//dhf waits for all unmatched brackets are fixed. //if(done){ VolumeMarkup vm = new VolumeMarkup(listener, display, perlLog, dataPrefix, this.glossaryTableName); resetPerlMessage(); //clean up perlLog box vm.markup(); listener.setProgressBarVisible(false); //} } /*private void loadOthersTab() { display.syncExec(new Runnable() { public void run() { mainForm.showOtherTerms(); } }); }*/ /* private void loadDescriptorTab() { display.syncExec(new Runnable() { public void run() { ArrayList <String> words = null; try { words = vmdb.descriptorTerms4Curation(); } catch (Exception exe){ LOGGER.error("unable to load descriptor tab in Markup : MainForm", exe); exe.printStackTrace(); } int count = 1; descriptorTable.removeAll(); //clean up before a load if (words != null) { for (String word : words){ TableItem item = new TableItem(descriptorTable, SWT.NONE); item.setText(new String [] {count+"", word}); count++; } } }}); }*/ public void resetPerlMessage() { display.syncExec(new Runnable() { public void run() { perlLog.setText(""); } }); } public void showPerlMessage(final String message) { display.syncExec(new Runnable() { public void run() { perlLog.append(message); } }); } //moved the following to DeHyphenAFolder.java /*public void incrementProgressBar(int progress) { listener.progress(progress); }*/ /* public void dehyphen(){ System.out.println("Preparing files..."); showPerlMessage("Preparing files..."); incrementProgressBar(1); fillInWords(); incrementProgressBar(50); DeHyphenizer dh = new DeHyphenizerCorrected(this.database, this.tablename, "word", "count", "-", dataPrefix); try{ Statement stmt = conn.createStatement(); ResultSet rs = stmt.executeQuery("select word from "+tablename+" where word like '%-%'"); while(rs.next()){ String word = rs.getString("word"); String dhword = dh.normalFormat(word); //System.out.println(word+"===>"+dhword); //MainForm.markUpPerlLog.append(word+"===>"+dhword+"\n"); mapping.put(word, dhword); } }catch(Exception e){ LOGGER.error("Problem in VolumeDehyphenizer:dehyphen", e); e.printStackTrace(); } normalizeDocument(); if(listener!= null) incrementProgressBar(100); } private void createWordTable(){ try{ Statement stmt = conn.createStatement(); String query = "create table if not exists "+tablename+" (word varchar(50) unique not null primary key, count int)"; stmt.execute(query); stmt.execute("delete from "+tablename); }catch(Exception e){ LOGGER.error("Problem in VolumeDehyphenizer:createWordTable", e); e.printStackTrace(); } } private void fillInWords(){ try { Statement stmt = conn.createStatement(); File[] flist = folder.listFiles(); for(int i= 0; i < flist.length; i++){ //System.out.println("read "+flist[i].getName()); //MainForm.markUpPerlLog.append("read "+flist[i].getName()+"\n"); BufferedReader reader = new BufferedReader(new FileReader(flist[i])); String line = null; while ((line = reader.readLine()) != null) { line = line.toLowerCase(); String linec = line; //if(line.matches(".*?\\d+-(?=[a-z]).*")){ // line = fixNumTextMix(line, flist[i]); //} line = line.replaceAll("<[^<]+?>", " "); //for xml or html docs line = line.replaceAll(num, " "); line = line.replaceAll("[^-a-z]", " "); line = normalize(line); //System.err.println("line has changed from \n"+linec+" to \n"+line); String[] words = line.split("\\s+"); for(int j = 0; j < words.length; j++){ String w = words[j].trim(); if(w.matches(".*?\\w.*")){ int count = 1; ResultSet rs = stmt.executeQuery("select word, count from "+tablename+" where word='"+w+"'"); if(rs.next()){ count = rs.getInt("count")+1; } stmt.execute("delete from "+tablename+" where word ='"+w+"'"); stmt.execute("insert into "+tablename+" values('"+w+"', "+count+")"); } } } reader.close(); } } catch (Exception e) { LOGGER.error("Problem in VolumeDehyphenizer:fillInWords", e); e.printStackTrace(); } } private String fixBrokenHyphens(String broken){ //cup-[,] disc-[,] or dish-shaped StringBuffer fixed = new StringBuffer(); Pattern p = Pattern.compile("(.*?\\b)([a-z]+)-\\W[^\\.]*?[a-z]+-([a-z]+)(.*)"); Matcher m = p.matcher(broken); while(m.matches()){ String begin = m.group(1); String part = broken.substring(m.start(2), m.start(3)); broken = m.group(4); String fix = m.group(3); part = part.replaceAll("-(?!\\w)", "-"+fix); fixed.append(begin+part); m = p.matcher(broken); } fixed.append(broken); return fixed.toString(); } private void normalizeDocument(){ try { File[] flist = folder.listFiles(); for(int i= 0; i < flist.length; i++){ BufferedReader reader = new BufferedReader(new FileReader(flist[i])); String line = null; //DO NOT normalize case StringBuffer sb = new StringBuffer(); while ((line = reader.readLine()) != null) { line = line.replaceAll(System.getProperty("line.separator"), " "); sb.append(line); } reader.close(); String text = sb.toString(); text = normalize(text); text = performMapping(text); //write back //System.out.println(text); File outf = new File(outfolder, flist[i].getName()); //BufferedWriter out = new BufferedWriter(new FileWriter(flist[i])); BufferedWriter out = new BufferedWriter(new FileWriter(outf)); out.write(text); out.close(); //System.out.println(flist[i].getName()+" dehyphenized"); //MainForm.markUpPerlLog.append(flist[i].getName()+" dehyphenized\n"); } } catch (Exception e) { LOGGER.error("Problem in VolumeDehyphenizer:normalizeDocument", e); e.printStackTrace(); } } private String normalize(String text){ text = text.replaceAll("-+", "-"); Pattern p = Pattern.compile("(.*?\\W)-(.*)"); //remove proceeding - Matcher m = p.matcher(text); while(m.matches()){ text = m.group(1)+" "+m.group(2); m = p.matcher(text); } p = Pattern.compile("(.*?)-(\\W.*)"); //remove trailing m = p.matcher(text); while(m.matches()){ text = m.group(1)+" "+m.group(2); m = p.matcher(text); } //text = text.replaceAll("\\W-", " "); //text = text.replaceAll("-\\W", " "); //HOng, 08/04/09 for FoC doc. "-" added in place of <dox-tags>. //if(line.matches(".*?[a-z]- .*")){//cup- disc- or dish-shaped // line = fixBrokenHyphens(line); //Too loose. //} //if(text.matches(".*?[a-z]-[^a-z0-9].*")){//cup- disc- or dish-shaped //text = fixBrokenHyphens(text); //} return text; } private String performMapping(String original){ Enumeration en = mapping.keys(); while(en.hasMoreElements()){ String hword = (String)en.nextElement(); String dhword = (String)mapping.get(hword); //System.out.println("hword: "+hword +" dhword: "+dhword); if(!hword.equals(dhword) && !hword.startsWith("-") && !hword.endsWith("-")){ //replace those in lower cases original = original.replaceAll(hword, dhword); //hyphen those phrases that are hyphened once String dhw = dhword.replaceAll("-", " "); //cup-shaped => cup shaped original = original.replaceAll(dhw, dhword); //cup shaped =>cup-shaped //upper cases hword = hword.toUpperCase().substring(0,1)+hword.substring(1); dhword = dhword.toUpperCase().substring(0,1)+dhword.substring(1); original = original.replaceAll(hword, dhword); dhw = dhword.replaceAll("-", " "); //Cup-shaped => Cup shaped original = original.replaceAll(dhw, dhword); //Cup shaped =>Cup-shaped } } return original; } */ /** * @param args */ public static void main(String[] args) { // TODO Auto-generated method stub String workdir = "C:/FOC-v11/target"; String todofoldername = "descriptions"; //this.database = "focv11_corpus"; //VolumeDehyphenizer vd = new VolumeDehyphenizer(null, workdir, todofoldername, database); // vd.dehyphen(); }// }