package fna.parsing.character; import java.sql.Connection; import java.sql.DriverManager; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.Iterator; import org.apache.log4j.Logger; import fna.parsing.ApplicationUtilities; public class Bootstrap implements Comparator<StateGroup>{ private ArrayList<StateGroup> source; private Glossary glossary; //private String tablename; //save the discoveries made from bootstrapping iterations static private Connection conn = null; @SuppressWarnings("unused") static private String username = ApplicationUtilities.getProperty("database.username"); @SuppressWarnings("unused") static private String password = ApplicationUtilities.getProperty("database.password"); @SuppressWarnings("unused") private static final Logger LOGGER = Logger.getLogger(Bootstrap.class); public Bootstrap(ArrayList<StateGroup> source, Glossary glossary, String database) { //public Bootstrap(ArrayList source, String database) { try{ if(conn == null){ Class.forName(ApplicationUtilities.getProperty("database.driverPath")); String URL = ApplicationUtilities.getProperty("database.url"); conn = DriverManager.getConnection(URL); } }catch(Exception e){ e.printStackTrace(); } this.source = source; this.glossary = glossary; //this.tablename = learnttable; } /*public Bootstrap(ArrayList source, Glossary glossary, String database, String learnttable) { try{ if(conn == null){ Class.forName("com.mysql.jdbc.Driver"); String URL = "jdbc:mysql://localhost/"+database+"?user="+username+"&password="+password; conn = DriverManager.getConnection(URL); } }catch(Exception e){ e.printStackTrace(); } this.source = source; this.glossary = glossary; this.tablename = learnttable; }*/ public void go(){ int total = 0; int count = 0; do{ count = 0; Collections.sort(source, this);//resort the source Iterator<StateGroup> it = source.iterator(); while(it.hasNext()){ StateGroup sg = (StateGroup)it.next(); count += inferCharacters(sg);//add discoveries (term, category) to glossary } total += count; }while(count > 0); System.out.println("learned categories for "+total+" states"); //TODO list orphaned/free states } /** * all states in a group may or may not share one category * @param sg * @return the number of terms inserted into glossary */ @SuppressWarnings("static-access") public int inferCharacters(StateGroup sg){ int count = 0; //single state case, skip if(sg.size() <=1){ return count; } //known unknown case //A/B ? :return 0 //A ?: ?=A if(sg.size() == 2 && sg.numberOfAssociated() == 1 && sg.getCount() > 3){ ArrayList<String> cats = sg.seenCategories(); if(cats.size() >1){ return count; } ArrayList<?> unknowns = sg.nonCategoryStates((String)cats.get(0)); glossary.addInducedPair(((State)unknowns.get(0)).toString(), cats); count++; System.out.println(((State)unknowns.get(0)).toString()+" is labeled as ["+(String)cats.get(0)+"], in group "+sg.toString()+"--a new discovery========"); return count; } // known unknown known // unknown= shared if exist // unknown = majority of the states to the right of unknown if(sg.size() > 2 && sg.numberOfAssociated() >=2 && sg.numberOfAssociated() <sg.size() && sg.getCount() >=2){ //make sure the unknown is not the first state in the group String mostfreq = sg.mostFreqCategory(); String[] mf = mostfreq.split("#");// "position#111"; String cat = mf[0]; int freq = mf[1].length(); if(freq ==sg.numberOfAssociated()){ //found shared by all ArrayList<?> unknowns = sg.nonCategoryStates(cat); Iterator<?> it = unknowns.iterator(); while(it.hasNext()){ State s = (State)it.next(); int pos = sg.getIndex(s); if(pos != 0){ ArrayList<String> cats = new ArrayList<String>(); cats.add(cat); glossary.addInducedPair(s.toString(), cats); count++; System.out.println(s.toString()+" is labeled as ["+cat+"], in group "+sg.toString()+"--a new discovery========"); } } return count; //states have a shared character } StateGroup sgnew = new StateGroup(); for(int i = 1; i < sg.size(); i++){ sgnew.addState(sg.getState(i)); } mostfreq = sgnew.mostFreqCategory(); mf = mostfreq.split("#");// "position#111"; cat = mf[0]; freq = mf[1].length(); ArrayList<?> unknowns = sgnew.nonCategoryStates(cat); Iterator<?> it = unknowns.iterator(); while(it.hasNext()){ State s = (State)it.next(); ArrayList<String> cats = new ArrayList<String>(); cats.add(cat); glossary.addInducedPair(s.toString(), cats); count++; System.out.println(s.toString()+" is labeled as ["+cat+"], in group "+sg.toString()+"--a new discovery========"); } return count; //states have a shared character } if(sg.freeStates().size() > 0){ System.out.println("not processed group: "+sg.toString()+"count: "+sg.getCount()); System.out.println("\t free states: "+sg.freeStates().toString()); System.out.println("\tfirst states: "+sg.getState(0).toString()); System.out.println("\t most freq cat: "+sg.mostFreqCategory()); } return count; } /*String mostfreq = sg.mostFreqCategory(); if(mostfreq.compareTo("#")==0){ System.out.println(sg.toString()+"group is unknown at this time"); return count; } String[] mf = mostfreq.split("#");// "position#111"; String cat = mf[0]; int freq = mf[1].length(); if(freq ==sg.size()){ System.out.println(sg.toString()+" is labeled as ["+cat+"]"); return count; //states have a shared character } //entire [margin], lobed [solid shape][plane shape] ArrayList states = sg.nonCategoryStates(cat); int number = sg.numberOfAssociated(); //make the most frequent category the category for free states //If the categories are uniformed distributed, withhold the decision for this iteration float v = (float)freq/number; if(v >= 0.66){ Iterator it = states.iterator(); while(it.hasNext()){ ArrayList list = new ArrayList(); list.add(cat); String temp = ((State)it.next()).toString(); glossary.addInducedPair(temp,list); count++; System.out.println(temp+" is labeled as ["+cat+"], in group "+sg.toString()+"--a new discovery========"); } }else{ Iterator it = states.iterator(); while(it.hasNext()){ String temp = ((State)it.next()).toString(); System.out.println(temp+" is unknown at this time, in group "+sg.toString()); } } return count; }*/ /** * order by numberOfAssociated, size */ public int compare(StateGroup g1, StateGroup g2){ //if(g1.toString().compareTo(g2.toString()) == 0){ // return 0; //} each g is different, not possible to return 0 //int known1 = g1.numberOfAssociated(); //int known2 = g2.numberOfAssociated(); int count1 = g1.getCount(); int count2 = g2.getCount(); //int v =known1 - known2; //if(v == 0){ return count2 - count1; //any order is fine //}else{ //return v; //} } }