package edu.cmu.minorthird.text; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.Properties; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import org.apache.log4j.Logger; import edu.cmu.minorthird.text.gui.TextBaseViewer; import edu.cmu.minorthird.text.learn.SampleClassificationProblem; import edu.cmu.minorthird.text.learn.SampleExtractionProblem; import edu.cmu.minorthird.text.mixup.Mixup; import edu.cmu.minorthird.text.mixup.MixupInterpreter; import edu.cmu.minorthird.text.mixup.MixupProgram; /** * Configurable method of loading data objects. * * @author William Cohen */ public class FancyLoader { private static Logger log = Logger.getLogger(FancyLoader.class); /** Property defining root of repository */ public static final String REPOSITORY_PROP = "edu.cmu.minorthird.repository"; /** Property defining location of raw data */ public static final String DATADIR_PROP = "edu.cmu.minorthird.dataDir"; /** Property defining location of labels added to data */ public static final String LABELDIR_PROP = "edu.cmu.minorthird.labelDir"; /** Property defining location of scripts for loading data */ public static final String SCRIPTDIR_PROP = "edu.cmu.minorthird.scriptDir"; /** When to expect sgml markup */ public static final String SGML_MARKUP_PATTERN_PROP = "edu.cmu.minorthird.sgmlPattern"; // // initialization of properties // private static Properties props = new Properties(); private static boolean dataPropertiesFound = false; static { try { InputStream in = FancyLoader.class.getClassLoader().getResourceAsStream("data.properties"); if (in != null) { props.load(in); log.debug("Loaded properties from stream "+in); dataPropertiesFound = true; } else { log.info("No data.properties found on classpath"); dataPropertiesFound = false; } } catch (IOException e) { throw new IllegalStateException("error getting data.properties: "+e); } // override data.properties with command line, if a flag is present String[] ps = new String[] { REPOSITORY_PROP, DATADIR_PROP, LABELDIR_PROP, SCRIPTDIR_PROP, SGML_MARKUP_PATTERN_PROP }; for (int i=0; i<ps.length; i++) { if (System.getProperty(ps[i])!=null) props.setProperty(ps[i], System.getProperty(ps[i])); } // fill in default values for DATADIR_PROP,LABELDIR_PROP,SCRIPTDIR_PROP relative to REPOSITORY_PROP String defaultRepositoryValue = System.getProperty(REPOSITORY_PROP,"."); String defaultSGMLPattern = System.getProperty(SGML_MARKUP_PATTERN_PROP,".*"); if (dataPropertiesFound) { if (props.getProperty(REPOSITORY_PROP)==null) { props.setProperty(REPOSITORY_PROP,defaultRepositoryValue); } if (props.getProperty(DATADIR_PROP)==null) { props.setProperty(DATADIR_PROP,props.getProperty(REPOSITORY_PROP)+"/data"); } if (props.getProperty(LABELDIR_PROP)==null) { props.setProperty(LABELDIR_PROP,props.getProperty(REPOSITORY_PROP)+"/labels"); } if (props.getProperty(SCRIPTDIR_PROP)==null) { props.setProperty(SCRIPTDIR_PROP,props.getProperty(REPOSITORY_PROP)+"/loaders"); } if (props.getProperty(SGML_MARKUP_PATTERN_PROP)==null) { props.setProperty(SGML_MARKUP_PATTERN_PROP,defaultSGMLPattern); } } else { props.setProperty(SGML_MARKUP_PATTERN_PROP,defaultSGMLPattern); props.setProperty(DATADIR_PROP,defaultRepositoryValue); props.setProperty(LABELDIR_PROP,defaultRepositoryValue); props.setProperty(SCRIPTDIR_PROP,defaultRepositoryValue); } log.info("dataDir: "+props.getProperty(DATADIR_PROP)); log.info("labelDir: "+props.getProperty(LABELDIR_PROP)); log.info("scriptDir: "+props.getProperty(SCRIPTDIR_PROP)); log.info("expect SGML in files matching '"+props.getProperty(SGML_MARKUP_PATTERN_PROP)+"'"); }; /** Return an array of a possible arguments to FancyLoader.loadTextLabels() */ public static Object[] getPossibleTextLabelKeys() { List<String> result = new ArrayList<String>(); for (int i=1; i<=3; i++) { result.add( "sample"+i+".train"); result.add( "sample"+i+".test" ); } result.add("sample3.unlabeled"); File dir = new File(props.getProperty(SCRIPTDIR_PROP)); if (dir!=null) { String[] files = dir.list(); for (int i=0; files!=null && i<files.length; i++) { result.add( files[i]); } } return result.toArray(); } /** * Try to load a TextLabels object 'foo' in one of these ways. * * <ol> * <li>If 'foo' is "sampleK.train" or "sampleK.test" for K=1,2,3 * then a hard-coded small sample TextLabels object will be returned. * * <li>If 'foo' is the name of a file, treat it as a bean shell * script, and return the result of executing it. * * <li>If script is a file stem "foo" and a file "foo.base" exists, * load a textBase from "foo.base" (one document per line, line name used * as document id). * * <li>If script is a file stem "foo" and a directory "foo" exists, * load a textBase from "foo" (one document per file). * * <li>If a file named "data.properties" is on the classpath, and * 'foo' is the name of a file in the value of the parameter * edu.cmu.minorthird.scriptDir, as defined in data.properties, * treat that file as a bean shell script, and return the result * of executing it. When the script is executed, the variables * "dataDir" and "labelDir" will be bound to Files defined by * edu.cmu.minorthird.dataDir and edu.cmu.minorthird.labelDir. * </ol> * * SGML markup in the files "foo/*" or "foo.base" will be * interpreted as annotations iff "foo" matches the regex defined by * edu.cmu.minorthird.sgmlPattern. After any SGML markup is * interpreted, FancyLoader will look for additional labels in * "foo.labels" or "foo.mixup", in that order. * * @param script the name of the bean shell script, directory, file, ... * @return TextLabels object */ public static TextLabels loadTextLabels(String script) { if ("sample1.train".equals(script)) return SampleExtractionProblem.trainLabels(); else if ("sample1.test".equals(script)) return SampleExtractionProblem.testLabels(); if ("sample2.train".equals(script)) return SampleExtractionProblem.taggerTrainLabels(); else if ("sample2.test".equals(script)) return SampleExtractionProblem.taggerTestLabels(); if ("sample3.train".equals(script)) return SampleClassificationProblem.trainLabels(); else if ("sample3.test".equals(script)) return SampleClassificationProblem.testLabels(); if ("sample3.unlabeled".equals(script)) return SampleClassificationProblem.unlabeled(); String scriptDir = getProperty(SCRIPTDIR_PROP); File f = new File(new File(scriptDir), script); if (f.exists() && !f.isDirectory()) { log.info("Loading using beanShell script "+f); try { Object obj = loadObject(script); if (obj!=null && obj instanceof TextLabels) { return (TextLabels)obj; } else { throw new IllegalArgumentException( "script "+script+" from dir "+scriptDir+" returns an invalid object: "+obj); } } catch (bsh.EvalError e) { log.info("Error running beanShell script "+f+": "+e); } catch (IOException e) { log.info("Error loading bean shell file "+f+": "+e); } } File baseFile = new File(script + ".base"); File baseDir = new File(script); boolean sgmlExpected = false; System.out.println("The script name is: " + script); String pattern = props.getProperty(SGML_MARKUP_PATTERN_PROP); try { sgmlExpected = Pattern.compile(pattern).matcher(script).matches(); log.info("Pattern '"+pattern+"' " +(sgmlExpected?"does":"does not")+" match '"+script+"' so SGML markup " +(sgmlExpected?"is":"is not")+" expected in documents"); } catch (PatternSyntaxException ex) { log.error("can't match illegal "+SGML_MARKUP_PATTERN_PROP+" regex: "+pattern); } try { TextBase base = null; TextBaseLoader tbl = null; if (baseDir.exists() && baseDir.isDirectory()) { log.info("Loading documents from files in directory "+baseDir); tbl = new TextBaseLoader( TextBaseLoader.DOC_PER_FILE,sgmlExpected); try{ base = tbl.load(baseDir); } catch(Exception e) { e.printStackTrace(); } } else if (baseFile.exists()) { log.info("Loading documents from lines in file "+baseFile); tbl = new TextBaseLoader( TextBaseLoader.DOC_PER_LINE,sgmlExpected); try { base = tbl.load(baseFile); } catch(Exception e) { e.printStackTrace(); } } if (base==null) { log.info("Expected to find beanShell script in "+scriptDir+"/"+script +" or else a file named '"+baseFile+"' or a directory named '"+baseDir+"'"); log.error("Can't find documents for key '"+script+"'"); return null; } MonotonicTextLabels labels = sgmlExpected ? tbl.getLabels() : new BasicTextLabels(base); File labelFile = new File(script + ".labels"); if (labelFile.exists()) { log.info("Loading annotations from "+labelFile); new TextLabelsLoader().importOps((MutableTextLabels)labels,base,labelFile); // frank: trying to fix this bug "ignoring 001.txt because token 0 not labeled in Span..." new TextLabelsLoader().closeLabels((MutableTextLabels)labels,TextLabelsLoader.CLOSE_ALL_TYPES); } File mixupFile = new File(script + ".mixup"); if (mixupFile.exists()) { log.info("Adding annotations with "+mixupFile); MixupInterpreter interp = new MixupInterpreter(new MixupProgram(mixupFile)); interp.eval(labels); labels = interp.getCurrentLabels(); } return labels; } catch (IOException ex) { log.error("IO error loading '"+script+"': "+ex); } catch (Mixup.ParseException ex) { log.error("Mixup error loading '"+script+"': "+ex); } /*catch (java.text.ParseException ex) { log.error("Error loading textbase '"+script+"': "+ex); }*/ log.error("no data found for key: "+script); return null; } private static Object loadObject(String script) throws bsh.EvalError,IOException { String dataDir = getProperty(DATADIR_PROP); String labelDir = getProperty(LABELDIR_PROP); String scriptDir = getProperty(SCRIPTDIR_PROP); log.debug("loading with dataDir: "+dataDir+" labelDir: "+labelDir+" scriptDir: "+scriptDir); File f = new File(new File(scriptDir),script); if (!f.exists()) throw new IllegalArgumentException("can't find file "+f.getAbsolutePath()); log.debug("loading object defined by "+f.getAbsolutePath()); bsh.Interpreter interpreter = new bsh.Interpreter(); interpreter.set("dataDir", new File(dataDir)); interpreter.set("labelDir", new File(labelDir)); return interpreter.source(f.getAbsolutePath()); } public static String getProperty(String prop) { String v = System.getProperty(prop); return v!=null ? v : props.getProperty(prop); } static public void main(String[] args) throws bsh.EvalError, IOException { Object o = FancyLoader.loadObject(args[0]); System.out.println("loaded "+o); if (o instanceof TextLabels) { TextBaseViewer.view((TextLabels) o ); } } }