package de.unihd.dbs.uima.annotator.heideltime.resources; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.HashMap; import java.util.regex.MatchResult; import java.util.regex.Pattern; import de.unihd.dbs.uima.annotator.heideltime.utilities.*; /** * * This class fills the role of a manager of all the Normalization resources. * It reads the data from a file system and fills up a bunch of HashMaps * with their information. * @author jannik stroetgen * */ public class NormalizationManager extends GenericResourceManager { protected static HashMap<String, NormalizationManager> instances = new HashMap<String, NormalizationManager>(); // PATTERNS TO READ RESOURCES "RULES" AND "NORMALIZATION" private Pattern paReadNormalizations = Pattern.compile("\"(.*?)\",\"(.*?)\""); // STORE PATTERNS AND NORMALIZATIONS private HashMap<String, RegexHashMap<String>> hmAllNormalization; // ACCESS TO SOME NORMALIZATION MAPPINGS (set internally) private HashMap<String, String> normDayInWeek; private HashMap<String, String> normNumber; private HashMap<String, String> normMonthName; private HashMap<String, String> normMonthInSeason; private HashMap<String, String> normMonthInQuarter; /** * Constructor calls the parent constructor that sets language/resource parameters, * initializes basic and collects resource normalization patterns. * @param language * @param load_temponym_resources */ private NormalizationManager(String language, Boolean load_temponym_resources) { // calls the Generic constructor with normalization-parameter super("normalization", language); // initialize the data structures hmAllNormalization = new HashMap<String, RegexHashMap<String>>(); normNumber = new HashMap<String, String>(); normDayInWeek = new HashMap<String, String>(); normMonthName = new HashMap<String, String>(); normMonthInSeason = new HashMap<String, String>(); normMonthInQuarter = new HashMap<String, String>(); // GLOBAL NORMALIZATION INFORMATION readGlobalNormalizationInformation(); //////////////////////////////////////////////////////////// // READ NORMALIZATION RESOURCES FROM FILES AND STORE THEM // //////////////////////////////////////////////////////////// ResourceScanner rs = ResourceScanner.getInstance(); ResourceMap hmResourcesNormalization = rs.getNormalizations(language); for (String which : hmResourcesNormalization.keySet()) { hmAllNormalization.put(which, new RegexHashMap<String>()); } readNormalizationResources(hmResourcesNormalization, load_temponym_resources); } /** * singleton producer. * @return singleton instance of NormalizationManager */ public static NormalizationManager getInstance(Language language, Boolean load_temponym_resources) { if(!instances.containsKey(language.getName())) { NormalizationManager nm = new NormalizationManager(language.getResourceFolder(), load_temponym_resources); instances.put(language.getName(), nm); } return instances.get(language.getName()); } /** * Read the resources (of any language) from resource files and * fill the HashMaps used for normalization tasks. * @param hmResourcesNormalization normalization patterns to be interpreted * @param load_temponym_resources whether temponym resources are loaded */ public void readNormalizationResources(ResourceMap hmResourcesNormalization, Boolean load_temponym_resources) { InputStream is = null; InputStreamReader isr = null; BufferedReader br = null; try { for (String resource : hmResourcesNormalization.keySet()) { // read normalization resources with "Temponym" only if temponym tagging is selected if ( (!(resource.contains("Temponym"))) || ((load_temponym_resources) && (resource.contains("Temponym")))){ Logger.printDetail(component, "Adding normalization resource: "+resource); // create a buffered reader for every normalization resource file is = hmResourcesNormalization.getInputStream(resource); isr = new InputStreamReader(is, "UTF-8"); br = new BufferedReader(isr); for ( String line; (line=br.readLine()) != null; ) { if (line.startsWith("//")) continue; // ignore comments // check each line for the normalization format (defined in paReadNormalizations) boolean correctLine = false; for (MatchResult r : Toolbox.findMatches(paReadNormalizations, line)) { correctLine = true; String resource_word = replaceSpaces(r.group(1)); String normalized_word = r.group(2); for (String which : hmAllNormalization.keySet()) { if (resource.equals(which)) { hmAllNormalization.get(which).put(resource_word,normalized_word); } } if ((correctLine == false) && (!(line.matches("")))) { Logger.printError("["+component+"] Cannot read one of the lines of normalization resource "+resource); Logger.printError("["+component+"] Line: "+line); } } } } else { Logger.printDetail(component, "No Temponym Tagging selected. Skipping normalization resource: "+resource); } } } catch (IOException e) { e.printStackTrace(); } finally { try { if(br != null) { br.close(); } if(isr != null) { isr.close(); } if(is != null) { is.close(); } } catch(Exception e) { e.printStackTrace(); } } } /** * sets a couple of rudimentary normalization parameters */ private void readGlobalNormalizationInformation() { // MONTH IN QUARTER normMonthInQuarter.put("01","1"); normMonthInQuarter.put("02","1"); normMonthInQuarter.put("03","1"); normMonthInQuarter.put("04","2"); normMonthInQuarter.put("05","2"); normMonthInQuarter.put("06","2"); normMonthInQuarter.put("07","3"); normMonthInQuarter.put("08","3"); normMonthInQuarter.put("09","3"); normMonthInQuarter.put("10","4"); normMonthInQuarter.put("11","4"); normMonthInQuarter.put("12","4"); // MONTH IN SEASON normMonthInSeason.put("", ""); normMonthInSeason.put("01","WI"); normMonthInSeason.put("02","WI"); normMonthInSeason.put("03","SP"); normMonthInSeason.put("04","SP"); normMonthInSeason.put("05","SP"); normMonthInSeason.put("06","SU"); normMonthInSeason.put("07","SU"); normMonthInSeason.put("08","SU"); normMonthInSeason.put("09","FA"); normMonthInSeason.put("10","FA"); normMonthInSeason.put("11","FA"); normMonthInSeason.put("12","WI"); // DAY IN WEEK normDayInWeek.put("sunday","1"); normDayInWeek.put("monday","2"); normDayInWeek.put("tuesday","3"); normDayInWeek.put("wednesday","4"); normDayInWeek.put("thursday","5"); normDayInWeek.put("friday","6"); normDayInWeek.put("saturday","7"); normDayInWeek.put("Sunday","1"); normDayInWeek.put("Monday","2"); normDayInWeek.put("Tuesday","3"); normDayInWeek.put("Wednesday","4"); normDayInWeek.put("Thursday","5"); normDayInWeek.put("Friday","6"); normDayInWeek.put("Saturday","7"); // normDayInWeek.put("sunday","7"); // normDayInWeek.put("monday","1"); // normDayInWeek.put("tuesday","2"); // normDayInWeek.put("wednesday","3"); // normDayInWeek.put("thursday","4"); // normDayInWeek.put("friday","5"); // normDayInWeek.put("saturday","6"); // normDayInWeek.put("Sunday","7"); // normDayInWeek.put("Monday","1"); // normDayInWeek.put("Tuesday","2"); // normDayInWeek.put("Wednesday","3"); // normDayInWeek.put("Thursday","4"); // normDayInWeek.put("Friday","5"); // normDayInWeek.put("Saturday","6"); // NORM MINUTE normNumber.put("0","00"); normNumber.put("00","00"); normNumber.put("1","01"); normNumber.put("01","01"); normNumber.put("2","02"); normNumber.put("02","02"); normNumber.put("3","03"); normNumber.put("03","03"); normNumber.put("4","04"); normNumber.put("04","04"); normNumber.put("5","05"); normNumber.put("05","05"); normNumber.put("6","06"); normNumber.put("06","06"); normNumber.put("7","07"); normNumber.put("07","07"); normNumber.put("8","08"); normNumber.put("08","08"); normNumber.put("9","09"); normNumber.put("09","09"); normNumber.put("10","10"); normNumber.put("11","11"); normNumber.put("12","12"); normNumber.put("13","13"); normNumber.put("14","14"); normNumber.put("15","15"); normNumber.put("16","16"); normNumber.put("17","17"); normNumber.put("18","18"); normNumber.put("19","19"); normNumber.put("20","20"); normNumber.put("21","21"); normNumber.put("22","22"); normNumber.put("23","23"); normNumber.put("24","24"); normNumber.put("25","25"); normNumber.put("26","26"); normNumber.put("27","27"); normNumber.put("28","28"); normNumber.put("29","29"); normNumber.put("30","30"); normNumber.put("31","31"); normNumber.put("32","32"); normNumber.put("33","33"); normNumber.put("34","34"); normNumber.put("35","35"); normNumber.put("36","36"); normNumber.put("37","37"); normNumber.put("38","38"); normNumber.put("39","39"); normNumber.put("40","40"); normNumber.put("41","41"); normNumber.put("42","42"); normNumber.put("43","43"); normNumber.put("44","44"); normNumber.put("45","45"); normNumber.put("46","46"); normNumber.put("47","47"); normNumber.put("48","48"); normNumber.put("49","49"); normNumber.put("50","50"); normNumber.put("51","51"); normNumber.put("52","52"); normNumber.put("53","53"); normNumber.put("54","54"); normNumber.put("55","55"); normNumber.put("56","56"); normNumber.put("57","57"); normNumber.put("58","58"); normNumber.put("59","59"); normNumber.put("60","60"); // NORM MONTH normMonthName.put("january","01"); normMonthName.put("february","02"); normMonthName.put("march","03"); normMonthName.put("april","04"); normMonthName.put("may","05"); normMonthName.put("june","06"); normMonthName.put("july","07"); normMonthName.put("august","08"); normMonthName.put("september","09"); normMonthName.put("october","10"); normMonthName.put("november","11"); normMonthName.put("december","12"); } /* * a bunch of getter methods to facilitate access to the data structures */ public final RegexHashMap<String> getFromHmAllNormalization(String key) { return hmAllNormalization.get(key); } public final String getFromNormNumber(String key) { return normNumber.get(key); } public final String getFromNormDayInWeek(String key) { return normDayInWeek.get(key); } public final String getFromNormMonthName(String key) { return normMonthName.get(key); } public final String getFromNormMonthInSeason(String key) { return normMonthInSeason.get(key); } public final String getFromNormMonthInQuarter(String key) { return normMonthInQuarter.get(key); } }