//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.resources.gazetteer;
import java.io.File;
import java.io.IOException;
import java.util.Map;
import org.apache.uima.resource.Resource;
import uk.gov.dstl.baleen.exceptions.BaleenException;
import uk.gov.dstl.baleen.exceptions.InvalidParameterException;
import uk.gov.dstl.baleen.resources.SharedFileResource;
/**
* Read from a file as the back-end of a gazetteer
*
*
*/
public class FileGazetteer extends AbstractMultiMapGazetteer<Integer> {
public static final String CONFIG_FILE = "fileName";
public static final String CONFIG_TERM_SEPARATOR = "termSeparator";
private File file;
private String termSeparator = ",";
/**
* Configure a new instance of FileGazetteer. The following config parameters are expected/allowed:
* <ul>
* <li><b>fileName</b> - What file are we using for the gazetteer; defaults to gazetteer.txt</li>
* <li><b>termSeparator</b> - The string that separates aliases of the same entity on a single line in the gazetteer. Defaults to ","</li>
* </ul>
*
* @param connection A SharedFileResource object to read the file with
* @param config A map of additional configuration options
*/
@Override
public void init(Resource connection, Map<String, Object> config) throws BaleenException {
if (config.containsKey(CONFIG_FILE)) {
file = new File(config.get(CONFIG_FILE).toString());
} else {
file = new File("gazetteer.txt");
}
if (!file.exists() || !file.canRead()) {
throw new InvalidParameterException("Unable to read file " + file.getPath());
}
super.init(connection, config);
}
@Override
public void destroy() {
file = null;
super.destroy();
}
@Override
public void reloadValues() throws BaleenException {
reset();
String[] content;
try {
content = SharedFileResource.readFileLines(file);
} catch (IOException e) {
throw new BaleenException(e);
}
int lineNumber = 0;
for (String line : content) {
lineNumber++;
if (line.trim().isEmpty()) {
continue;
}
if (!caseSensitive) {
line = line.toLowerCase();
}
String[] termsArray = line.split(termSeparator);
for (String t : termsArray) {
if (t.trim().isEmpty()) {
continue;
}
addTerm(lineNumber, t.trim());
}
}
}
}