/**
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
*
* **************************************************************************
* NOTICE This software was produced for the U. S. Government under Contract No.
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
* Software and Noncommercial Computer Software Documentation Clause
* 252.227-7014 (JUN 1995)
*
* (c) 2012 The MITRE Corporation. All Rights Reserved.
* **************************************************************************
*
* Continue contributions:
* Copyright 2013-2015 The MITRE Corporation.
*/
///** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
//
// _____ ____ __ __
///\ __`\ /\ _`\ /\ \__ /\ \__
//\ \ \/\ \ _____ __ ___ \ \,\L\_\ __ __ _\ \ ,_\ __ ___ \ \ ,_\
//\ \ \ \ \ /\ '__`\ /'__`\ /' _ `\ \/_\__ \ /'__`\/\ \/'\\ \ \/ /'__`\ /' _ `\\ \ \/
//\ \ \_\ \\ \ \L\ \/\ __/ /\ \/\ \ /\ \L\ \ /\ __/\/> </ \ \ \_ /\ \L\.\_ /\ \/\ \\ \ \_
// \ \_____\\ \ ,__/\ \____\\ \_\ \_\ \ `\____\\ \____\/\_/\_\ \ \__\\ \__/.\_\\ \_\ \_\\ \__\
// \/_____/ \ \ \/ \/____/ \/_/\/_/ \/_____/ \/____/\//\/_/ \/__/ \/__/\/_/ \/_/\/_/ \/__/
// \ \_\
// \/_/
//
// OpenSextant PoLi - Patterns extractor
//* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
//*/
package org.opensextant.extractors.poli;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import org.opensextant.data.TextInput;
import org.opensextant.extraction.TextMatch;
import org.opensextant.extractors.flexpat.AbstractFlexPat;
import org.opensextant.extractors.flexpat.RegexPattern;
import org.opensextant.extractors.flexpat.RegexPatternManager;
import org.opensextant.extractors.flexpat.TextMatchResult;
import org.opensextant.util.TextUtils;
/**
*
* @author Marc C. Ubaldino, MITRE, ubaldino at mitre dot org
*/
public class PatternsOfLife extends AbstractFlexPat {
public static final String DEFAULT_POLI_CFG = "/poli_patterns.cfg";
public PatternsOfLife(boolean debugmode) {
super(debugmode);
patterns_file = DEFAULT_POLI_CFG;
}
/**
* Default constructor, debugging off.
*/
public PatternsOfLife() {
this(false);
}
/**
* Extractor interface: extractors are responsible for cleaning up after
* themselves.
*/
@Override
public void cleanup() {
}
/**
* Extractor interface: getName
*
* @return
*/
@Override
public String getName() {
return "PoLi";
}
@Override
protected RegexPatternManager createPatternManager(InputStream s, String n) throws IOException {
return new PoliPatternManager(s, n);
}
/**
* Support the standard Extractor interface. This provides access to the
* most common extraction; For PoLi extraction, you would process ALL
* patterns in your configuration file, or if you enable only certain
* patterns -- those enabled at the time of this call would be executed.
* extract_patterns( family = null ) implies ALL patterns.
*/
@Override
public List<TextMatch> extract(TextInput input) {
TextMatchResult results = extract_patterns(input.buffer, input.id, null);
return results.matches;
}
public List<TextMatch> extract(TextInput input, String family) {
TextMatchResult results = extract_patterns(input.buffer, input.id, family);
return results.matches;
}
@Override
public List<TextMatch> extract(String input_buf) {
TextMatchResult results = extract_patterns(input_buf, NO_DOC_ID, null);
return results.matches;
}
/**
* Extract patterns of a certain family from a block of text.
*
* @param text
* - data to process
* @param text_id
* - identifier for the data
* @param family
* - optional filter; to reuse the same PatManager but extract
* certain patterns only.
*
* @return PoliResult
*/
public TextMatchResult extract_patterns(String text, String text_id, String family) {
TextMatchResult results = new TextMatchResult();
results.result_id = text_id;
results.matches = new ArrayList<TextMatch>();
int bufsize = text.length();
PoliMatch poliMatch = null;
int found = 0;
int patternsComplete = 0;
for (RegexPattern repat : patterns.get_patterns()) {
if (!repat.enabled) {
continue;
}
if (family != null && !repat.id.startsWith(family)) {
continue;
}
Matcher match = repat.regex.matcher(text);
results.evaluated = true;
while (match.find()) {
++found;
Map<String, String> fields = patterns.group_map(repat, match);
if (repat.match_class == null) {
poliMatch = new PoliMatch(fields, match.group());
} else {
try {
poliMatch = (PoliMatch) repat.match_class.newInstance();
poliMatch.setText(match.group());
poliMatch.setGroups(fields);
} catch (InstantiationException classErr1) {
poliMatch = null;
log.error("Could not create... ", classErr1);
} catch (IllegalAccessException classErr2) {
poliMatch = null;
log.error("Could not create... ", classErr2);
}
}
if (poliMatch == null) {
// This would have been thrown at init.
log.error("Could not find pattern family for " + repat.id);
continue;
}
poliMatch.setType(repat.family);
poliMatch.pattern_id = repat.id;
poliMatch.start = match.start();
poliMatch.end = match.end();
poliMatch.normalize();
// Filter -- trivial filter is to filter out any coord that
// cannot
// TODO: Assess filters?
// returns indices for window around text match
int[] slices = TextUtils.get_text_window(poliMatch.start, bufsize, match_width);
// left l1 to left l2
poliMatch.setContext(TextUtils.delete_eol(text.substring(slices[0], slices[1])));
set_match_id(poliMatch, found);
results.matches.add(poliMatch);
}
patternsComplete++;
updateProgress(patternsComplete / (double) patterns.get_patterns().size() + 1);
}
results.pass = !results.matches.isEmpty();
PoliPatternManager.reduce_matches(results.matches);
return results;
}
}