package de.unihd.dbs.uima.annotator.heideltime.resources; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.LinkedList; import java.util.TreeMap; import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; /** * * This class fills the role of a manager of all the RePattern resources. * It reads the data from a file system and fills up a bunch of HashMaps * with their information. * @author jannik stroetgen * */ public class RePatternManager extends GenericResourceManager { protected static HashMap<String, RePatternManager> instances = new HashMap<String, RePatternManager>(); // STORE PATTERNS AND NORMALIZATIONS private TreeMap<String, String> hmAllRePattern; /** * Constructor calls the parent constructor that sets language/resource * parameters and collects resource repatterns. * @param language * @param load_temponym_resources */ private RePatternManager(String language, Boolean load_temponym_resources) { // calls the Generic constructor with repattern parameter super("repattern", language); // initialize the member map of all repatterns hmAllRePattern = new TreeMap<String, String>(); ////////////////////////////////////////////////////// // READ PATTERN RESOURCES FROM FILES AND STORE THEM // ////////////////////////////////////////////////////// ResourceScanner rs = ResourceScanner.getInstance(); ResourceMap hmResourcesRePattern = rs.getRepatterns(language); for (String which : hmResourcesRePattern.keySet()) { hmAllRePattern.put(which, ""); } readRePatternResources(hmResourcesRePattern, load_temponym_resources); } /** * singleton producer. * @return singleton instance of RePatternManager */ public static RePatternManager getInstance(Language language, Boolean load_temponym_resources) { if(!instances.containsKey(language.getName())) { RePatternManager nm = new RePatternManager(language.getResourceFolder(), load_temponym_resources); instances.put(language.getName(), nm); } return instances.get(language.getName()); } /** * READ THE REPATTERN FROM THE FILES. The files have to be defined in the HashMap hmResourcesRePattern. * @param hmResourcesRePattern RePattern resources to be interpreted * @param load_temponym_resources whether temponym resources are to be read */ private void readRePatternResources(ResourceMap hmResourcesRePattern, Boolean load_temponym_resources) { ////////////////////////////////////// // READ REGULAR EXPRESSION PATTERNS // ////////////////////////////////////// InputStream is = null; InputStreamReader isr = null; BufferedReader br = null; try { for (String resource : hmResourcesRePattern.keySet()) { // read pattern resources with "Temponym" only if temponym tagging is selected if ( (!(resource.contains("Temponym"))) || ((load_temponym_resources) && (resource.contains("Temponym")))){ Logger.printDetail(component, "Adding pattern resource: "+resource); // create a buffered reader for every repattern resource file is = hmResourcesRePattern.getInputStream(resource); isr = new InputStreamReader(is, "UTF-8"); br = new BufferedReader(isr); LinkedList<String> patterns = new LinkedList<String>(); for (String line; (line = br.readLine()) != null; ) { // disregard comments if (!line.startsWith("//") && !line.equals("")) { patterns.add(replaceSpaces(line)); } } // sort the repatterns by length in ascending order Collections.sort(patterns, new Comparator<String>() { @Override public int compare(String o1, String o2) { String o1effective = o1.replaceAll("\\[[^\\]]*\\]", "X") .replaceAll("\\?", "") .replaceAll("\\\\.(?:\\{([^\\}])+\\})?", "X$1"); String o2effective = o2.replaceAll("\\[[^\\]]*\\]", "X") .replaceAll("\\?", "") .replaceAll("\\\\.(?:\\{([^\\}])+\\})?", "X$1"); if(o1effective.length() < o2effective.length()) return 1; else if(o1effective.length() > o2effective.length()) return -1; else return 0; } }); StringBuilder sb = new StringBuilder(); String devPattern = ""; for(String pat : patterns) { sb.append("|"); sb.append(pat); } devPattern = sb.toString(); hmAllRePattern.put(resource, devPattern); } else { Logger.printDetail(component, "No Temponym Tagging selected. Skipping pattern resource: "+resource); } } //////////////////////////// // FINALIZE THE REPATTERN // //////////////////////////// for (String which : hmAllRePattern.keySet()) { if ( (!(which.contains("Temponym"))) || ((load_temponym_resources) && (which.contains("Temponym")))){ finalizeRePattern(which, hmAllRePattern.get(which)); } } } catch (IOException e) { e.printStackTrace(); } finally { try { if(br != null) { br.close(); } if(isr != null) { isr.close(); } if(is != null) { is.close(); } } catch(Exception e) { e.printStackTrace(); } } } /** * Pattern containing regular expression is finalized, i.e., created correctly and added to hmAllRePattern. * @param name key name * @param rePattern repattern value */ private void finalizeRePattern(String name, String rePattern) { // create correct regular expression rePattern = rePattern.replaceFirst("\\|", ""); /* this was added to reduce the danger of getting unusable groups from user-made repattern * files with group-producing parentheses (i.e. "(foo|bar)" while matching against the documents. */ rePattern = rePattern.replaceAll("\\(([^\\?])", "(?:$1"); rePattern = "(" + rePattern + ")"; rePattern = rePattern.replaceAll("\\\\", "\\\\\\\\"); // add rePattern to hmAllRePattern hmAllRePattern.put(name, rePattern); } /** * proxy method to access the hmAllRePattern member * @param key key to check for * @return whether the map contains the key */ public Boolean containsKey(String key) { return hmAllRePattern.containsKey(key); } /** * proxy method to access the hmAllRePattern member * @param key Key to retrieve data from * @return String from the map */ public String get(String key) { return hmAllRePattern.get(key); } }