package org.opensextant.regex; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class RegexMatcher { /** The rules used by this RegexMatcher. */ List<RegexRule> rules = new ArrayList<RegexRule>(); /** The list of entity type this matcher can find. */ Set<String> types = new HashSet<String>(); /** The postprocessors to apply. */ Map<PostProcessor, Set<String>> posters = new HashMap<PostProcessor, Set<String>>(); /** * Future stuff: named groups useful? String namedGroup = * "\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>"; Pattern namedGroupPattern = * Pattern.compile(namedGroup); the pattern of a DEFINE within a RULE e.g * "<somePiece>" NOTE: should we look for existing (not DEFINEDd) capture * groups */ String elementRegex = "<[a-zA-Z0-9_]+>"; Pattern elementPattern = Pattern.compile(elementRegex); /** Has this mather been sucessfully initialized. */ boolean isInited; /** Log object. */ private static final Logger LOGGER = LoggerFactory.getLogger(RegexMatcher.class); public RegexMatcher(URL patternFile) { initialize(patternFile); } public RegexMatcher(File patternFile) { initialize(patternFile); } public List<RegexAnnotation> match(String input) { // The matches to return List<RegexAnnotation> matches = new ArrayList<RegexAnnotation>(); if (!isInited) { LOGGER.error("Tried to use RegexMatcher without initializing first"); return matches; } for (RegexRule r : rules) { String t = r.getEntityType(); Normalizer normer = r.getNormalizer(); // Do the matching, looping over the rules Matcher matcher = r.getPattern().matcher(input); while (matcher.find()) { // for each hit from the regex, create a RegexAnnotation RegexAnnotation tmp = new RegexAnnotation(t, matcher.group(0), matcher.start(), matcher.end()); // if the a normalizer has been specified, if (normer != null) { normer.normalize(tmp, r, matcher.toMatchResult()); } // check to see if the normalizer declared the match invalid if (tmp.isValid()) { // add the "hierarchy" and "isEntity" features String tmpHier = r.getTaxo(); if (tmpHier != null && tmpHier.trim().length() > 0) { tmp.getFeatures().put("hierarchy", tmpHier); tmp.getFeatures().put("isEntity", true); } tmp.setRule(r.getRuleFamily() + "-" + r.getRuleName()); matches.add(tmp); } } } // run the matches through the postprocessor(s) if any specified for (PostProcessor p : posters.keySet()) { p.postProcess(matches, posters.get(p)); } return matches; } public void initialize(URL patFile) { // the #DEFINE statements as name and regex Map<String, String> defines = new HashMap<String, String>(); // the #NORM statements as entitytype and classname Map<String, String> normalizerClassnames = new HashMap<String, String>(); // the #POST statements as entitytype and classname Map<String, Set<String>> posterClassnames = new HashMap<String, Set<String>>(); // the #TAXO statements as entitytype and taxonomy string Map<String, String> taxos = new HashMap<String, String>(); BufferedReader reader = null; try { reader = new BufferedReader(new InputStreamReader(patFile.openStream(), "UTF-8")); } catch (UnsupportedEncodingException e1) { LOGGER.error("Error when opening pattern file", e1); return; } catch (IOException e1) { LOGGER.error("Error when opening pattern file", e1); return; } String line = null; String[] fields; while (true) { try { line = reader.readLine(); } catch (IOException e) { LOGGER.error("Error when reading pattern file.", e); return; } if (line == null) { break; } line = line.trim(); // Is it a define statement? if (line.startsWith("#DEFINE")) { // line should be // #DEFINE<tab><defineName><tab><definePattern> fields = line.split("[\t ]+", 3); defines.put(fields[1].trim(), fields[2].trim()); } else if (line.startsWith("#RULE")) {// Is it a rule statement? // line should be // #RULE<tab><entityType><tab><rule_fam><tab><rule_name><tab><pattern> fields = line.split("[\t ]+", 5); String type = fields[1].trim(); String fam = fields[2].trim(); String ruleName = fields[3].trim(); String rulePattern = fields[4].trim(); RegexRule tmpRule = new RegexRule(); tmpRule.setEntityType(type); tmpRule.setRuleFamily(fam); tmpRule.setRuleName(ruleName); tmpRule.setPatternString(rulePattern); rules.add(tmpRule); types.add(type); } else if (line.startsWith("#NORM")) { fields = line.split("[\t ]+", 3); String type = fields[1].trim(); normalizerClassnames.put(type, fields[2].trim()); } else if (line.startsWith("#TAXO")) { fields = line.split("[\t ]+", 3); String type = fields[1].trim(); taxos.put(type, fields[2].trim()); } else if (line.startsWith("#POST")) { fields = line.split("[\t ]+", 3); String type = fields[1].trim(); String posterName = fields[2].trim(); if (!posterClassnames.containsKey(posterName)) { posterClassnames.put(posterName, new HashSet<String>()); } posterClassnames.get(posterName).add(type); } // Ignore everything else } // end file read loop try { if (reader != null) { reader.close(); } } catch (IOException e) { LOGGER.error("Error when closing pattern file.", e); } // defines,rules and classes should be completely populated // substitute all uses of DEFINE patterns within a RULE // with the DEFINE pattern surrounded by a numbered capture group for (RegexRule r : rules) { String tmpRulePattern = r.getPatternString(); Matcher elementMatcher = elementPattern.matcher(tmpRulePattern); // find all of the element definitions within the pattern int groupNum = 1; // add the entity type as the name for group 0 (whole match) r.getElementMap().put(0, r.getEntityType()); // find and replace any DEFINEd patterns, keeping track of the group // number // first find any DEFINEd patterns and stick it and its group number // into the // rule element map while (elementMatcher.find()) { int elementStart = elementMatcher.start(); int elementEnd = elementMatcher.end(); String elementName = tmpRulePattern.substring(elementStart + 1, elementEnd - 1); r.getElementMap().put(groupNum, elementName); groupNum++; } // now, replace each of the DEFINEd patterns with its regex // equivalent // wrapped in ( ), so it becomes a numbered capture group for (String tmpDefineName : defines.keySet()) { String tmpDefinePattern = "(" + defines.get(tmpDefineName) + ")"; tmpDefineName = "<" + tmpDefineName + ">"; tmpRulePattern = tmpRulePattern.replace(tmpDefineName, tmpDefinePattern); } // set the modified pattern on the rule and create the Pattern from // it r.setModifedPatternString(tmpRulePattern); r.setPattern(Pattern.compile(r.getModifedPatternString())); // resolve and attach the normalizer object if (normalizerClassnames.containsKey(r.getEntityType())) { String normClassName = normalizerClassnames.get(r.getEntityType()); Normalizer normer; try { normer = (Normalizer) Class.forName(normClassName).newInstance(); } catch (InstantiationException e) { normer = new NoOpNormalizer(); LOGGER.error("Cannot instantiate a " + normClassName + ", using a No Op normalizer instead.", e); } catch (IllegalAccessException e) { normer = new NoOpNormalizer(); LOGGER.error( "Cannot access a " + normClassName + " to create one, using a No Op normalizer instead.", e); } catch (ClassNotFoundException e) { normer = new NoOpNormalizer(); LOGGER.error("Normalizer Class " + normClassName + " not found,using a No Op normalizer instead.", e); } catch (java.lang.ClassCastException e) { normer = new NoOpNormalizer(); LOGGER.error("Class " + normClassName + " is not a Normalizer,using a No Op normalizer instead.", e); } r.setNormalizer(normer); } else { // nothing in file use NoOpNormalzer r.setNormalizer(new NoOpNormalizer()); } // resolve and attach the taxonomic string object if (taxos.containsKey(r.getEntityType())) { r.setTaxo(taxos.get(r.getEntityType())); } else { // nothing in file r.setTaxo(""); } } // end rule loop // create the postprocessors for (String p : posterClassnames.keySet()) { PostProcessor pstr; try { pstr = (PostProcessor) Class.forName(p).newInstance(); } catch (InstantiationException e) { pstr = new NoOpPostProcessor(); LOGGER.error("Cannot instantiate a " + p + ", using a No Op PostProcessor instead.", e); } catch (IllegalAccessException e) { pstr = new NoOpPostProcessor(); LOGGER.error("Cannot access a " + p + ", using a No Op PostProcessor instead.", e); } catch (ClassNotFoundException e) { pstr = new NoOpPostProcessor(); LOGGER.error("Class " + p + " is not a PostProcessor, using a No Op PostProcessor instead.", e); } posters.put(pstr, posterClassnames.get(p)); } isInited = true; } /** End initialize. */ public void initialize(File patFile) { try { initialize(patFile.toURI().toURL()); } catch (MalformedURLException e) { LOGGER.error("Cannot initialize the matcher using pattern file " + patFile.getName(), e); } } public List<RegexRule> getRules() { return rules; } public Set<String> getTypes() { return types; } }