/**
*
* Copyright 2012-2013 The MITRE Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
* **************************************************************************
* NOTICE This software was produced for the U. S. Government under Contract No.
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
* Software and Noncommercial Computer Software Documentation Clause
* 252.227-7014 (JUN 1995)
*
* (c) 2012 The MITRE Corporation. All Rights Reserved.
* **************************************************************************
*
* @author dlutz, MITRE creator (lutzdavp)
* @author ubaldino, MITRE adaptor
*/
package org.opensextant.extractors.flexpat;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.opensextant.extraction.TextEntity;
import org.opensextant.extraction.TextMatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* <p>This is the culmination of various date/time extraction efforts in Python
* and Java. This API makes no assumptions about input data or about execution.
* Features of the REGEX patterns file:
* <ul>
* <li>DEFINE - a component of a pattern to match</li>
* <li>RULE - a complete pattern to match</li>
* </ul>
* This work started in Java 6 and carries the limitation of Java 6 regex, mainly that named groups are not available when matching.
*
* <p>See XCoord PatternManager for a good example implementation.
*
*/
public abstract class RegexPatternManager {
/** Logger obtained for the runtime class of this instance (via {@code getClass()}). */
protected Logger log = LoggerFactory.getLogger(getClass());
/**
 * Patterns keyed by {@code RegexPattern.id}. Re-created on every call to
 * {@code initialize(InputStream)}.
 */
protected Map<String, RegexPattern> patterns = null;
/**
 * Patterns in the order their RULE directives were read. Re-created on every
 * call to {@code initialize(InputStream)}.
 */
protected List<RegexPattern> patterns_list = null;
/** Name given to the constructor for the pattern source; not otherwise read in this class. */
protected String patternFile = null;
/**
 * When true, {@code initialize(InputStream)} records configuration details that
 * {@code getConfigurationDebug()} returns. Defaults to the logger's debug setting.
 */
public boolean debug = log.isDebugEnabled();
/**
 * When true, TEST directives in the pattern source are parsed into {@link #testcases}.
 * The constructor sets this to the value of {@link #debug}.
 */
public boolean testing = false;
/**
 * Test cases created from TEST directives while {@link #testing} is true.
 */
public List<PatternTestCase> testcases = new ArrayList<PatternTestCase>();
/**
 * Creates a manager and immediately loads its patterns from the given stream.
 *
 * @param s stream of pattern definitions, handed to {@code initialize(InputStream)}
 * @param n name recorded as the pattern file name
 * @throws IOException if initialization fails
 */
public RegexPatternManager(InputStream s, String n) throws IOException {
    patternFile = n;
    // Test cases are collected only when debugging is on at construction time.
    this.testing = debug;
    this.initialize(s);
}
/**
 * Returns the loaded patterns in the order their rules were read.
 *
 * @return the internal pattern list itself (not a copy); null before initialization
 */
public Collection<RegexPattern> get_patterns() {
    return this.patterns_list;
}
/**
 * Looks up a pattern by its ID.
 *
 * @param id pattern id
 * @return the matching pattern, or null if no pattern has that id
 */
public RegexPattern get_pattern(String id) {
    final RegexPattern found = this.patterns.get(id);
    return found;
}
/**
 * Implementation must create a RegexPattern for a RULE directive of the form
 * {@code #RULE FAMILY RID REGEX}. Called by {@code initialize(InputStream)} once
 * per rule; after this returns, initialization appends the DEFINE names used by
 * the rule to {@code regex_groups}, sets the compiled {@code regex}, and, when a
 * CLASS directive exists for the family, sets {@code match_classname} and
 * {@code match_class}. The returned object is stored under its {@code id}.
 *
 * @param fam family
 * @param rule rule ID within the family
 * @param desc optional description; initialization currently passes a fixed placeholder
 * @return pattern object; must not be null and must have a usable {@code regex_groups} list
 */
protected abstract RegexPattern create_pattern(String fam, String rule, String desc);
/**
 * Implementation has the option to check a pattern. Called by
 * {@code initialize(InputStream)} after the pattern has been compiled, enabled
 * and registered; a false return makes initialization fail with an IOException.
 *
 * @param pat pattern object
 * @return true if pattern is valid
 */
protected abstract boolean validate_pattern(RegexPattern pat);
/**
 * Implementation must create TestCases given the #TEST directive, which
 * {@code initialize(InputStream)} parses as {@code #TEST FAMILY RID TEXT}.
 * Only called while {@code testing} is true. Any {@code $NL} token in the text
 * has already been replaced by a newline.
 *
 * @param id test id, built as {@code FAMILY-RID#N} where N counts all tests read so far
 * @param fam pattern family
 * @param text text for test case
 * @return test case object
 */
protected abstract PatternTestCase create_testcase(String id, String fam, String text);
/**
 * Enable an instance of a pattern based on the global settings. Called by
 * {@code initialize(InputStream)} for every pattern, after its regex has been
 * compiled and before it is added to the pattern collections.
 *
 * @param p the pattern obj to enable
 */
public abstract void enable_pattern(RegexPattern p);
/**
 * Default adapter that does nothing -- override to support enabling patterns
 * by name. This should be abstract, but not all pattern managers are required
 * to support this.
 *
 * @param name pattern name to enable.
 */
public void enable_patterns(String name) {
//throw new Exception("not implemented");
}
/**
 * Disables every registered pattern by clearing its {@code enabled} flag.
 */
public void disableAll() {
    final Collection<RegexPattern> registered = patterns.values();
    for (RegexPattern each : registered) {
        each.enabled = false;
    }
}
/**
 * Enables every registered pattern by setting its {@code enabled} flag.
 */
public void enableAll() {
    final Collection<RegexPattern> registered = patterns.values();
    for (RegexPattern each : registered) {
        each.enabled = true;
    }
}
/**
 * Buffer of configuration details appended during initialization while
 * {@code debug} is true; returned as text by {@code getConfigurationDebug()}.
 */
private StringBuilder configMessages = new StringBuilder();
/**
 * Initializes the pattern manager implementations. Reads the DEFINE, RULE,
 * TEST and CLASS directives from the pattern stream and does the requisite
 * substitutions: every {@code <name>} reference to a DEFINE inside a RULE is
 * replaced by that DEFINE's regex wrapped in a capture group. After
 * initialization the patterns map and the ordered patterns list are populated.
 * Lines that do not start with a recognized directive are ignored, and TEST
 * directives are only read while {@code testing} is true.
 *
 * @param io stream of pattern definitions, decoded as UTF-8; closed once reading finishes
 * @throws IOException if the stream cannot be read, a directive has too few
 *         fields, a rule name is duplicated within a family, a CLASS directive
 *         names a class that cannot be loaded, or a pattern fails validation
 */
public void initialize(InputStream io) throws IOException {
    patterns = new HashMap<String, RegexPattern>();
    patterns_list = new ArrayList<RegexPattern>();
    // the #DEFINE statements as name and regex
    HashMap<String, String> defines = new HashMap<String, String>();
    // the #RULE statements as name and a sequence of DEFINES and regex bits
    HashMap<String, String> rules = new HashMap<String, String>();
    // family of each rule, by rule key; kept separately so that a family name
    // containing '-' is not misparsed when the key is taken apart again
    HashMap<String, String> ruleFamilies = new HashMap<String, String>();
    HashMap<String, String> matcherClasses = new HashMap<String, String>();
    List<String> rule_order = new ArrayList<String>();
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(io, "UTF-8"))) {
        String _line;
        String[] fields;
        int testcount = 0;
        while ((_line = reader.readLine()) != null) {
            String line = _line.trim();
            if (line.startsWith("#DEFINE")) {
                // line should be
                // #DEFINE<tab><defineName><tab><definePattern>
                fields = splitDirective(line, 3);
                defines.put(fields[1].trim(), fields[2].trim());
            } else if (line.startsWith("#RULE")) {
                // line should be
                // #RULE<tab><rule_fam><tab><rule_id><tab><pattern>
                fields = splitDirective(line, 4);
                String fam = fields[1].trim();
                String ruleName = fields[2].trim();
                String rulePattern = fields[3].trim();
                // family + ruleName should be unique, use as key in rules table
                String ruleKey = fam + "-" + ruleName;
                if (rules.containsKey(ruleKey)) {
                    throw new IOException("FlexPat Config Error - Duplicate rule name " + ruleName);
                }
                rules.put(ruleKey, rulePattern);
                ruleFamilies.put(ruleKey, fam);
                rule_order.add(ruleKey);
            } else if (testing && line.startsWith("#TEST")) {
                // line should be
                // #TEST<tab><rule_fam><tab><rule_id><tab><text>
                fields = splitDirective(line, 4);
                ++testcount;
                String fam = fields[1].trim();
                String ruleName = fields[2].trim();
                String testtext = fields[3].trim().replace("$NL", "\n");
                String ruleKey = fam + "-" + ruleName;
                // testcount is a count of all tests, not just test within a rule family
                testcases.add(create_testcase(ruleKey + "#" + testcount, fam, testtext));
            } else if (line.startsWith("#CLASS")) {
                // line should be
                // #CLASS<tab><rule_fam><tab><className>
                fields = splitDirective(line, 3);
                String fam = fields[1].trim();
                matcherClasses.put(fam, fields[2].trim());
            }
            // Ignore everything else
        } // end file read loop
    } // try-with-resources closes reader.

    // defines and rules should be completely populated
    // substitute all uses of DEFINE patterns within a RULE
    // with the DEFINE pattern surrounded by a capture group

    // the pattern of a DEFINE within a RULE e.g "<somePiece>"
    Pattern elementPattern = Pattern.compile("<[a-zA-Z0-9_]+>");
    for (String tmpKey : rule_order) {
        String tmpRulePattern = rules.get(tmpKey);
        // the key is of the form <family>-<rulename>; strip the known family prefix
        String tmpFam = ruleFamilies.get(tmpKey);
        String tmpRuleName = tmpKey.substring(tmpFam.length() + 1);
        Matcher elementMatcher = elementPattern.matcher(tmpRulePattern);
        int groupNum = 1;
        if (debug) {
            configMessages.append("\nrulename=" + tmpRuleName);
            configMessages.append(", rulepattern=" + tmpRulePattern);
        }
        RegexPattern pat = create_pattern(tmpFam, tmpRuleName, "No Description yet...");
        if (matcherClasses.containsKey(tmpFam)) {
            pat.match_classname = matcherClasses.get(tmpFam);
            try {
                pat.match_class = Class.forName(pat.match_classname);
            } catch (ClassNotFoundException err) {
                throw new IOException("FlexPat initialization failed due to invalid classname", err);
            }
        }
        // record, in order of appearance, the name of each <element> in the rule;
        // the position in regex_groups is what later pairs a name with a matcher group
        while (elementMatcher.find()) {
            int elementStart = elementMatcher.start();
            int elementEnd = elementMatcher.end();
            String elementName = tmpRulePattern.substring(elementStart + 1, elementEnd - 1);
            pat.regex_groups.add(elementName);
            if (debug) {
                String subelementPattern = defines.get(elementName);
                configMessages.append("\n\t");
                configMessages.append(groupNum + " " + elementName + " = " + subelementPattern);
            }
            groupNum++;
        }
        for (Map.Entry<String, String> define : defines.entrySet()) {
            // NOTE: Use of parens, "(expr)", is required to create groups within a pattern.
            String tmpDefinePattern = "(" + define.getValue() + ")";
            String tmpDefineName = "<" + define.getKey() + ">";
            // use replace(tok, sub) not replaceAll(re, sub)
            tmpRulePattern = tmpRulePattern.replace(tmpDefineName, tmpDefinePattern);
        }
        if (debug) {
            configMessages.append("\nrulepattern=" + tmpRulePattern);
        }
        // at this point rule pattern should have had defines replaced
        // compile and insert into pattern hashmap
        pat.regex = Pattern.compile(tmpRulePattern, Pattern.CASE_INSENSITIVE);
        enable_pattern(pat);
        patterns_list.add(pat);
        patterns.put(pat.id, pat);
        if (!validate_pattern(pat)) {
            throw new IOException("Invalid Pattern @ " + pat.toString());
        }
    }
    if (debug) {
        configMessages.append("\nFound # of PATTERNS=" + patterns.values().size());
        for (RegexPattern pat : patterns_list) {
            configMessages.append("\n");
            configMessages.append(pat.id + "\t" + pat.regex.pattern());
        }
    }
}// end initialize

/**
 * Splits a directive line on runs of tabs/spaces into exactly {@code expected}
 * fields, the last field keeping any embedded whitespace.
 *
 * @param line trimmed directive line
 * @param expected number of fields the directive requires
 * @return the fields, always of length {@code expected}
 * @throws IOException if the line has fewer fields than required
 */
private static String[] splitDirective(String line, int expected) throws IOException {
    String[] fields = line.split("[\t ]+", expected);
    if (fields.length < expected) {
        throw new IOException("FlexPat Config Error - Malformed directive, expected "
                + expected + " fields: " + line);
    }
    return fields;
}
/**
 * Instead of relying on a logging API, real configuration errors are thrown
 * as exceptions, and configuration details are captured in a buffer if debug
 * is on.
 *
 * @return the captured configuration details, or a hint on how to enable them
 *         when debug is off
 */
public String getConfigurationDebug() {
    return debug
            ? configMessages.toString()
            : "Debug not enabled; Try again, set .debug = true";
}
/**
 * NOTE: We're dealing with Java6's inability to use named groups. So we have to
 * track FlexPat slots in line with Matcher fields matched. Essentially this comes down to
 * a simple Name:Offset pairing; our limitation here is no nesting.
 *
 * <p>Only as many groups as there are recorded slot names are mapped. Capture
 * groups beyond that count (for example parentheses nested inside a DEFINE)
 * are left out instead of causing an index error.
 *
 * @param p pattern
 * @param matched matcher positioned on a successful match
 * @return map containing the matched groups, as deciphered by Flexpat and the definitions in the patterns file;
 *         a value is null when its group did not take part in the match
 */
public Map<String, String> group_map(RegexPattern p, java.util.regex.Matcher matched) {
    Map<String, String> pairs = new HashMap<String, String>();
    // groupCount() reports every capture group in the compiled regex, which can
    // exceed the number of named slots; never index past the names we have.
    int cnt = Math.min(matched.groupCount(), p.regex_groups.size());
    for (int x = 0; x < cnt; ++x) {
        // Matcher groups are 1-based (group 0 is the whole match); slot names are 0-based.
        String nm = p.regex_groups.get(x);
        pairs.put(nm, matched.group(x + 1));
    }
    return pairs;
}
/**
 * Matched fields as TextEntities, keyed by FlexPat slot name. Each entity
 * carries the group's text and its start and end offsets as reported by the
 * matcher.
 *
 * <p>Only as many groups as there are recorded slot names are mapped. Capture
 * groups beyond that count (for example parentheses nested inside a DEFINE)
 * are left out instead of causing an index error.
 *
 * @param p the pattern whose slot names label the groups
 * @param matched matcher positioned on a successful match
 * @return map of slot name to the entity for that group
 */
public Map<String, TextEntity> group_matches(RegexPattern p, java.util.regex.Matcher matched) {
    Map<String, TextEntity> pairs = new HashMap<String, TextEntity>();
    // groupCount() reports every capture group in the compiled regex, which can
    // exceed the number of named slots; never index past the names we have.
    int cnt = Math.min(matched.groupCount(), p.regex_groups.size());
    for (int x = 0; x < cnt; ++x) {
        // Matcher groups are 1-based (group 0 is the whole match); slot names are 0-based.
        int group = x + 1;
        String nm = p.regex_groups.get(x);
        TextEntity e = new TextEntity();
        e.setText(matched.group(group));
        e.start = matched.start(group);
        // Record the end offset as well so the entity describes a complete span.
        e.end = matched.end(group);
        pairs.put(nm, e);
    }
    return pairs;
}
/**
 * Compares every pair of matches in the list and flags each one as a
 * duplicate of, overlapping with, or completely contained within another
 * match. Matches whose spans do not touch are left unflagged. The list itself
 * is not reordered or shortened; only the flags on its elements change.
 *
 * @param matches a list of related matches from a single text
 */
public static void reduce_matches(List<TextMatch> matches) {
    final int total = matches.size();
    for (int outer = 0; outer < total; ++outer) {
        TextMatch first = matches.get(outer);
        long firstStart = first.start;
        long firstEnd = first.end;
        // Each unordered pair is visited once: compare only against later entries.
        for (int inner = outer + 1; inner < total; ++inner) {
            TextMatch second = matches.get(inner);
            long secondStart = second.start;
            long secondEnd = second.end;

            // One span lies entirely before the other: nothing to flag.
            boolean disjoint = firstEnd < secondStart || firstStart > secondEnd;
            if (disjoint) {
                continue;
            }

            if (secondStart == firstStart && secondEnd == firstEnd) {
                // Identical span: the later entry is the duplicate.
                second.is_duplicate = true;
                first.is_overlap = true;
            } else if (secondStart <= firstStart && firstEnd <= secondEnd) {
                // Earlier entry sits entirely inside the later one.
                first.is_submatch = true;
                second.is_overlap = true;
            } else if (secondStart >= firstStart && firstEnd >= secondEnd) {
                // Later entry sits entirely inside the earlier one.
                first.is_overlap = true;
                second.is_submatch = true;
            } else {
                // Partial overlap of the two spans.
                first.is_overlap = true;
                second.is_overlap = true;
            }
        }
    }
}
}