/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.opensextant.extractors.flexpat;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import org.opensextant.ConfigException;
import org.opensextant.extraction.Extractor;
import org.opensextant.extraction.TextMatch;
import org.opensextant.util.TextUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Base class for pattern-driven extractors. It holds the
 * {@link RegexPatternManager} in use, offers several ways to load a patterns
 * definition (classpath resource, file path, URL or caller-supplied stream),
 * and provides a helper for assigning identifiers to matches.
 *
 * @author ubaldino
 */
public abstract class AbstractFlexPat implements Extractor {

    /**
     * CHARS. SHP DBF limit is 255 bytes, so SHP file outputters should assess
     * at that time how/when to curtail match width. The max pre/post text seen
     * useful has typically been about 200-250 characters.
     */
    protected int match_width = 100;

    /** Logger named after the concrete subclass. */
    protected Logger log = LoggerFactory.getLogger(getClass());

    /** Debug flag; defaults to whether the logger has debug enabled. */
    protected boolean debug = false;

    /** The loaded pattern manager; {@code null} until a configure call succeeds. */
    protected RegexPatternManager patterns = null;

    /** Classpath resource path used by the no-argument {@link #configure()}. */
    protected String patterns_file = null;

    /**
     * Creates an extractor whose debug flag mirrors the logger's debug level.
     */
    public AbstractFlexPat() {
        debug = log.isDebugEnabled();
    }

    /**
     * Creates an extractor with an explicit debug flag.
     *
     * @param b debug flag; overrides the value derived from the logger
     */
    public AbstractFlexPat(boolean b) {
        this();
        debug = b;
    }

    /**
     * Create a pattern manager given the input stream and the file name.
     * <p>
     * The configure methods of this class that open a stream themselves
     * (resource, file path, URL) close it once this method returns, so an
     * implementation must finish reading {@code s} before returning.
     *
     * @param s    stream of pattern definitions
     * @param name name of the patterns source (file path, URL file part or
     *             resource path)
     * @return the regex pattern manager
     * @throws IOException if the pattern definitions cannot be read
     */
    protected abstract RegexPatternManager createPatternManager(InputStream s, String name)
            throws IOException;

    /**
     * @return the current pattern manager, or {@code null} if this extractor
     *         has not been configured yet
     */
    public RegexPatternManager getPatternManager() {
        return patterns;
    }

    /**
     * Configures whatever default patterns file is named. The
     * {@code patterns_file} field is resolved as a classpath resource relative
     * to the concrete class.
     *
     * @throws ConfigException config error, pattern file not set or not found
     */
    @Override
    public void configure() throws ConfigException {
        if (patterns_file == null) {
            throw new ConfigException(
                    "Default configure() requires you set .patterns_file with a resource path to the item");
        }
        InputStream resource = getClass().getResourceAsStream(patterns_file);
        if (resource == null) {
            // getResourceAsStream signals a missing resource with null, not an exception.
            throw new ConfigException("Patterns resource not found, RESOURCE=" + patterns_file);
        }
        // This method opened the stream, so it is responsible for closing it.
        try (InputStream strm = resource) {
            configure(strm, patterns_file);
        } catch (IOException closeErr) {
            throw new ConfigException("Could not close patterns resource =" + patterns_file,
                    closeErr);
        }
    }

    /**
     * Configure using a particular pattern file.
     *
     * @param patfile a pattern file.
     * @throws ConfigException if pattern file is null, not found or not loadable
     */
    @Override
    public void configure(String patfile) throws ConfigException {
        if (patfile == null) {
            throw new ConfigException("Null path not allowed. no defaults.");
        }
        // try-with-resources: the file handle is released on success and on failure.
        try (InputStream strm = new FileInputStream(patfile)) {
            patterns = createPatternManager(strm, patfile);
        } catch (Exception loaderr) {
            String msg = "Could not load patterns file FILE=" + patfile;
            throw new ConfigException(msg, loaderr);
        }
    }

    /**
     * Configure using a URL pointer to the pattern file.
     *
     * @param patfile patterns file URL
     * @throws ConfigException if URL is null or the patterns cannot be loaded
     */
    @Override
    public void configure(URL patfile) throws ConfigException {
        if (patfile == null) {
            throw new ConfigException("URL for pattern defs not found. ");
        }
        // try-with-resources: the connection stream is released on success and on failure.
        try (InputStream strm = patfile.openStream()) {
            patterns = createPatternManager(strm, patfile.getFile());
        } catch (Exception loaderr) {
            throw new ConfigException("Could not load patterns file URL=" + patfile, loaderr);
        }
    }

    /**
     * Configure using a caller-supplied stream. The stream is handed to
     * {@link #createPatternManager(InputStream, String)} and is not closed
     * here; the caller keeps ownership of it.
     *
     * @param strm stream of pattern definitions
     * @param name name to associate with the patterns source
     * @throws ConfigException if the pattern manager cannot be created
     */
    public void configure(InputStream strm, String name) throws ConfigException {
        try {
            patterns = createPatternManager(strm, name);
        } catch (Exception loaderr) {
            throw new ConfigException("Could not load patterns file =" + name, loaderr);
        }
    }

    /**
     * Match Width is the text buffer before and after a TextMatch. Match
     * buffers are used to create a match ID
     *
     * @param w width
     */
    public void setMatchWidth(int w) {
        match_width = w;
    }

    /**
     * Optional. Assign an identifier to each Text Match found, computed with
     * {@code TextUtils.text_id} over a key chosen as follows:
     * <ul>
     * <li>no context-before on the match: pattern ID + text value;</li>
     * <li>context present and {@code count >= 0}: pattern ID + text value +
     * count;</li>
     * <li>context present and {@code count < 0}: context-before + text +
     * context-after, i.e. the match in-situ.</li>
     * </ul>
     * Any failure while computing the identifier is logged and the match ID is
     * left as it was.
     *
     * @param m a TextMatch
     * @param count incrementor used for uniqueness; pass a negative value to
     *              derive the ID from the surrounding context instead
     */
    protected void set_match_id(TextMatch m, int count) {
        try {
            if (m.getContextBefore() == null) {
                m.match_id = TextUtils.text_id(String.format("%s,%s", m.pattern_id, m.getText()));
            } else if (count >= 0) {
                m.match_id = TextUtils.text_id(String.format("%s,%s,%d", m.pattern_id, m.getText(),
                        count));
            } else {
                StringBuilder abc = new StringBuilder();
                abc.append(m.getContextBefore());
                abc.append(m.getText());
                abc.append(m.getContextAfter());
                m.match_id = TextUtils.text_id(abc.toString());
            }
        } catch (Exception hashErr) {
            log.error("Rare Java cryptologic err", hashErr);
        }
    }

    /**
     * Enables all patterns in the current pattern manager.
     *
     * @throws IllegalStateException if no patterns have been configured yet
     */
    public void enableAll() {
        requirePatterns().enableAll();
    }

    /**
     * Disables all patterns in the current pattern manager.
     *
     * @throws IllegalStateException if no patterns have been configured yet
     */
    public void disableAll() {
        requirePatterns().disableAll();
    }

    /**
     * Progress hook; intentionally a no-op in this base class.
     *
     * @param progress progress value, ignored here
     */
    public void updateProgress(double progress) {
    }

    /**
     * Completion hook; intentionally a no-op in this base class.
     */
    public void markComplete() {
    }

    /** Returns the pattern manager, failing with a clear message when unconfigured. */
    private RegexPatternManager requirePatterns() {
        if (patterns == null) {
            throw new IllegalStateException(
                    "Patterns are not loaded; call one of the configure() methods first");
        }
        return patterns;
    }
}