/*
* HeidelTimeStandalone.java
*
* Copyright (c) 2011, Database Research Group, Institute of Computer Science, University of Heidelberg.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the GNU General Public License.
*
* authors: Andreas Fay, Jannik Strötgen
* email: fay@stud.uni-heidelberg.de, stroetgen@uni-hd.de
*
* HeidelTime is a multilingual, cross-domain temporal tagger.
* For details, see http://dbs.ifi.uni-heidelberg.de/heideltime
*/
package de.unihd.dbs.heideltime.standalone;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.Locale;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.uima.UIMAFramework;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.util.XMLInputSource;
import de.unihd.dbs.heideltime.standalone.components.JCasFactory;
import de.unihd.dbs.heideltime.standalone.components.ResultFormatter;
import de.unihd.dbs.heideltime.standalone.components.PartOfSpeechTagger;
import de.unihd.dbs.heideltime.standalone.components.impl.AllLanguagesTokenizerWrapper;
import de.unihd.dbs.heideltime.standalone.components.impl.HunPosTaggerWrapper;
import de.unihd.dbs.heideltime.standalone.components.impl.IntervalTaggerWrapper;
import de.unihd.dbs.heideltime.standalone.components.impl.JCasFactoryImpl;
import de.unihd.dbs.heideltime.standalone.components.impl.JVnTextProWrapper;
import de.unihd.dbs.heideltime.standalone.components.impl.StanfordPOSTaggerWrapper;
import de.unihd.dbs.heideltime.standalone.components.impl.TimeMLResultFormatter;
import de.unihd.dbs.heideltime.standalone.components.impl.TreeTaggerWrapper;
import de.unihd.dbs.heideltime.standalone.components.impl.UimaContextImpl;
import de.unihd.dbs.heideltime.standalone.components.impl.XMIResultFormatter;
import de.unihd.dbs.heideltime.standalone.exceptions.DocumentCreationTimeMissingException;
import de.unihd.dbs.uima.annotator.heideltime.HeidelTime;
import de.unihd.dbs.uima.annotator.heideltime.resources.Language;
import de.unihd.dbs.uima.annotator.heideltime.resources.ResourceScanner;
import de.unihd.dbs.uima.annotator.intervaltagger.IntervalTagger;
import de.unihd.dbs.uima.types.heideltime.Dct;
/**
* Execution class for UIMA-Component HeidelTime. Note: this class is not a
* singleton; it exposes public constructors and can be instantiated freely.
*
* @author Andreas Fay, Jannik Strötgen, Heidelberg University
* @version 1.01
*/
public class HeidelTimeStandalone {
/**
* Document type (domain) of the texts to process; checked when deciding
* whether a missing document creation time is an error
*/
private DocumentType documentType;
/**
* HeidelTime annotator instance, created in initialize()
*/
private HeidelTime heidelTime;
/**
* Factory for the JCas objects that documents are loaded into; built in
* initialize() from the type system description named in the configuration
*/
private JCasFactory jcasFactory;
/**
* Language of the documents to process; selects the preprocessing tools
*/
private Language language;
/**
* Output format; selects the default result formatter
*/
private OutputType outputType;
/**
* POS tagger used for preprocessing
*/
private POSTagger posTagger;
/**
* Whether or not to do Interval Tagging
*/
private Boolean doIntervalTagging;
/**
* Logging engine, shared by all instances
*/
private static Logger logger = Logger.getLogger("HeidelTimeStandalone");
/**
* Empty constructor; sets no fields and performs no initialization.
*
* Call one of the initialize() methods before processing documents.
*/
public HeidelTimeStandalone() {
}
/**
* Constructor without an explicit configuration path. Delegates to the
* four-argument constructor, passing null as configPath.
*
* @param language Language to be processed
* @param typeToProcess Domain type to be processed
* @param outputType Output type
*/
public HeidelTimeStandalone(Language language, DocumentType typeToProcess, OutputType outputType) {
this(language, typeToProcess, outputType, null);
}
/**
* Constructor with configPath. Used primarily for WebUI.
* Stores the given settings and immediately runs the four-argument
* initialize(), which selects TreeTagger as POS tagger.
*
* @param language Language to be processed
* @param typeToProcess Domain type to be processed
* @param outputType Output type
* @param configPath Path to the configuration file; if null, the value of
* the CONFIGFILE command line switch is used
*/
public HeidelTimeStandalone(Language language, DocumentType typeToProcess, OutputType outputType, String configPath) {
this.language = language;
this.documentType = typeToProcess;
this.outputType = outputType;
this.initialize(language, typeToProcess, outputType, configPath);
}
/**
* Constructor with configPath and POS tagger.
* Stores the given settings and immediately runs the five-argument
* initialize(), which disables interval tagging.
*
* @param language Language to be processed
* @param typeToProcess Domain type to be processed
* @param outputType Output type
* @param configPath Path to the configuration file; if null, the value of
* the CONFIGFILE command line switch is used
* @param posTagger POS tagger to use for preprocessing
*/
public HeidelTimeStandalone(Language language, DocumentType typeToProcess, OutputType outputType, String configPath, POSTagger posTagger) {
this.language = language;
this.documentType = typeToProcess;
this.outputType = outputType;
this.initialize(language, typeToProcess, outputType, configPath, posTagger);
}
/**
* Constructor with configPath, POS tagger and interval tagging flag.
* Stores the given settings and immediately runs the six-argument
* initialize().
*
* @param language Language to be processed
* @param typeToProcess Domain type to be processed
* @param outputType Output type
* @param configPath Path to the configuration file; if null, the value of
* the CONFIGFILE command line switch is used
* @param posTagger POS tagger to use for preprocessing
* @param doIntervalTagging Whether or not to invoke the IntervalTagger
*/
public HeidelTimeStandalone(Language language, DocumentType typeToProcess, OutputType outputType, String configPath, POSTagger posTagger, Boolean doIntervalTagging) {
this.language = language;
this.documentType = typeToProcess;
this.outputType = outputType;
this.doIntervalTagging = doIntervalTagging;
this.initialize(language, typeToProcess, outputType, configPath, posTagger, doIntervalTagging);
}
/**
* Method that initializes all vital prerequisites.
* Delegates to the five-argument initialize() with TreeTagger as POS tagger.
*
* @param language Language to be processed with this copy of HeidelTime
* @param typeToProcess Domain type to be processed
* @param outputType Output type
* @param configPath Path to the configuration file for HeidelTimeStandalone
*/
public void initialize(Language language, DocumentType typeToProcess, OutputType outputType, String configPath) {
initialize(language, typeToProcess, outputType, configPath, POSTagger.TREETAGGER);
}
/**
* Method that initializes all vital prerequisites, including POS Tagger.
* Delegates to the six-argument initialize() with interval tagging disabled.
*
* @param language Language to be processed with this copy of HeidelTime
* @param typeToProcess Domain type to be processed
* @param outputType Output type
* @param configPath Path to the configuration file for HeidelTimeStandalone
* @param posTagger POS Tagger to use for preprocessing
*/
public void initialize(Language language, DocumentType typeToProcess, OutputType outputType, String configPath, POSTagger posTagger) {
initialize(language, typeToProcess, outputType, configPath, posTagger, false);
}
/**
 * Method that initializes all vital prerequisites, including POS Tagger.
 *
 * Stores all settings on this instance, reads the configuration file if the
 * configuration has not been initialized yet, and creates the HeidelTime
 * annotator and the JCas factory. Failures while creating the annotator or
 * the factory are logged as warnings and do not abort initialization.
 *
 * @param language Language to be processed with this copy of HeidelTime
 * @param typeToProcess Domain type to be processed
 * @param outputType Output type
 * @param configPath Path to the configuration file for HeidelTimeStandalone;
 *            if null, the value of the CONFIGFILE command line switch is used
 * @param posTagger POS Tagger to use for preprocessing
 * @param doIntervalTagging Whether or not to invoke the IntervalTagger;
 *            null is treated as false
 */
public void initialize(Language language, DocumentType typeToProcess, OutputType outputType, String configPath, POSTagger posTagger, Boolean doIntervalTagging) {
    // Store the settings on the instance: when the empty constructor was used,
    // nothing else sets them, and processing relies on these fields.
    this.language = language;
    this.documentType = typeToProcess;
    this.outputType = outputType;

    // Log the parameter rather than relying on a previously set field.
    logger.log(Level.INFO, "HeidelTimeStandalone initialized with language " + language.getName());

    // set the POS tagger
    this.posTagger = posTagger;

    // set doIntervalTagging flag; normalize null to false so that later
    // unboxing of the field cannot fail
    this.doIntervalTagging = Boolean.TRUE.equals(doIntervalTagging);

    // read in configuration in case it's not yet initialized
    if (!Config.isInitialized()) {
        if (configPath == null) {
            readConfigFile(CLISwitch.CONFIGFILE.getValue().toString());
        } else {
            readConfigFile(configPath);
        }
    }

    try {
        heidelTime = new HeidelTime();
        heidelTime.initialize(new UimaContextImpl(language, typeToProcess, CLISwitch.VERBOSITY2.getIsActive()));
        logger.log(Level.INFO, "HeidelTime initialized");
    } catch (Exception e) {
        e.printStackTrace();
        logger.log(Level.WARNING, "HeidelTime could not be initialized");
    }

    // Initialize JCas factory -------------
    logger.log(Level.FINE, "Initializing JCas factory...");
    try {
        // The type system description is looked up on the classpath under
        // the name given by the TYPESYSTEMHOME configuration entry.
        TypeSystemDescription[] descriptions = new TypeSystemDescription[] {
                UIMAFramework
                        .getXMLParser()
                        .parseTypeSystemDescription(
                                new XMLInputSource(
                                        this.getClass()
                                                .getClassLoader()
                                                .getResource(
                                                        Config.get(Config.TYPESYSTEMHOME)))) };
        jcasFactory = new JCasFactoryImpl(descriptions);
        logger.log(Level.INFO, "JCas factory initialized");
    } catch (Exception e) {
        e.printStackTrace();
        logger.log(Level.WARNING, "JCas factory could not be initialized");
    }
}
/**
 * Applies the IntervalTagger to the given JCas and logs, at the finest
 * level, how many annotations it added.
 *
 * @param jcas jcas object to be tagged
 */
private void runIntervalTagger(JCas jcas) {
    logger.log(Level.FINEST, "Running Interval Tagger...");

    // Remember the annotation count so the delta can be reported afterwards
    final int countBefore = jcas.getAnnotationIndex().size();

    // Options for this IntervalTagger run
    Properties taggerOptions = new Properties();
    taggerOptions.put(IntervalTagger.PARAM_LANGUAGE, language.getResourceFolder());
    taggerOptions.put(IntervalTagger.PARAM_INTERVALS, true);
    taggerOptions.put(IntervalTagger.PARAM_INTERVAL_CANDIDATES, false);

    // Set up the wrapper and let it annotate the document
    IntervalTaggerWrapper wrapper = new IntervalTaggerWrapper();
    wrapper.initialize(taggerOptions);
    wrapper.process(jcas);

    // debug output
    final int countAfter = jcas.getAnnotationIndex().size();
    logger.log(Level.FINEST, "Annotation delta: " + (countAfter - countBefore));
}
/**
 * Adds a document creation time annotation to the jcas object if
 * <code>documentCreationTime</code> is not null.
 *
 * @param jcas
 *            jcas object that receives the Dct annotation
 * @param documentCreationTime
 *            creation time of the document, may be null
 * @throws DocumentCreationTimeMissingException
 *             If document creation time is missing when processing a
 *             document of type {@link DocumentType#NEWS} or
 *             {@link DocumentType#COLLOQUIAL}.
 */
private void provideDocumentCreationTime(JCas jcas, Date documentCreationTime)
        throws DocumentCreationTimeMissingException {
    if (documentCreationTime != null) {
        // Translate the date to the expected string format ...
        String dctValue = new SimpleDateFormat("yyyy.MM.dd'T'HH:mm").format(documentCreationTime);

        // ... and register it as a Dct annotation on the jcas
        Dct dctAnnotation = new Dct(jcas);
        dctAnnotation.setValue(dctValue);
        dctAnnotation.addToIndexes();
        return;
    }

    // No creation time given: only news and colloquial documents require one
    boolean dctRequired = documentType == DocumentType.NEWS
            || documentType == DocumentType.COLLOQUIAL;
    if (dctRequired) {
        throw new DocumentCreationTimeMissingException();
    }
}
/**
* Establishes preconditions for jcas to be processed by HeidelTime.
* Currently this only runs the preprocessing that adds part of speech
* information.
*
* @param jcas jcas object to be prepared
*/
private void establishHeidelTimePreconditions(JCas jcas) {
// Token information & sentence structure
establishPartOfSpeechInformation(jcas);
}
/**
 * Establishes part of speech information for cas object.
 *
 * Arabic, Vietnamese and Croatian always use their dedicated preprocessing
 * tool unless the POS tagger setting is {@link POSTagger#NO}, in which case
 * the AllLanguagesTokenizer is used. For every other language the tool is
 * chosen by the POS tagger setting.
 *
 * @param jcas jcas object to be preprocessed
 * @throws IllegalStateException
 *             if no preprocessing tool can be selected for the current POS
 *             tagger setting (for example when it is null)
 */
private void establishPartOfSpeechInformation(JCas jcas) {
    logger.log(Level.FINEST, "Establishing part of speech information...");

    PartOfSpeechTagger partOfSpeechTagger = null;
    Properties settings = new Properties();

    switch (language) {
        case ARABIC:
            if (POSTagger.NO.equals(posTagger)) {
                partOfSpeechTagger = new AllLanguagesTokenizerWrapper();
                logger.log(Level.INFO, "Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for Arabic. "
                        + "Thus, tagging results might be very different (and worse).");
            } else {
                partOfSpeechTagger = new StanfordPOSTaggerWrapper();
                putStanfordPosTaggerSettings(settings);
            }
            break;
        case VIETNAMESE:
            if (POSTagger.NO.equals(posTagger)) {
                partOfSpeechTagger = new AllLanguagesTokenizerWrapper();
                logger.log(Level.INFO, "Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for Vietnamese. "
                        + "Thus, tagging results might be very different (and worse).");
            } else {
                partOfSpeechTagger = new JVnTextProWrapper();
                settings.put(PartOfSpeechTagger.JVNTEXTPRO_ANNOTATE_TOKENS, true);
                settings.put(PartOfSpeechTagger.JVNTEXTPRO_ANNOTATE_SENTENCES, true);
                settings.put(PartOfSpeechTagger.JVNTEXTPRO_ANNOTATE_POS, true);
                settings.put(PartOfSpeechTagger.JVNTEXTPRO_WORD_MODEL_PATH, Config.get(Config.JVNTEXTPRO_WORD_MODEL_PATH));
                settings.put(PartOfSpeechTagger.JVNTEXTPRO_SENT_MODEL_PATH, Config.get(Config.JVNTEXTPRO_SENT_MODEL_PATH));
                settings.put(PartOfSpeechTagger.JVNTEXTPRO_POS_MODEL_PATH, Config.get(Config.JVNTEXTPRO_POS_MODEL_PATH));
            }
            break;
        case CROATIAN:
            if (POSTagger.NO.equals(posTagger)) {
                partOfSpeechTagger = new AllLanguagesTokenizerWrapper();
                logger.log(Level.INFO, "Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for Croatian. "
                        + "Thus, tagging results might be very different (and worse).");
            } else {
                partOfSpeechTagger = new HunPosTaggerWrapper();
                putHunPosSettings(settings);
            }
            break;
        default:
            if (POSTagger.STANFORDPOSTAGGER.equals(posTagger)) {
                partOfSpeechTagger = new StanfordPOSTaggerWrapper();
                putStanfordPosTaggerSettings(settings);
            } else if (POSTagger.TREETAGGER.equals(posTagger)) {
                partOfSpeechTagger = new TreeTaggerWrapper();
                settings.put(PartOfSpeechTagger.TREETAGGER_LANGUAGE, language);
                settings.put(PartOfSpeechTagger.TREETAGGER_ANNOTATE_TOKENS, true);
                settings.put(PartOfSpeechTagger.TREETAGGER_ANNOTATE_SENTENCES, true);
                settings.put(PartOfSpeechTagger.TREETAGGER_ANNOTATE_POS, true);
                settings.put(PartOfSpeechTagger.TREETAGGER_IMPROVE_GERMAN_SENTENCES, (language == Language.GERMAN));
                settings.put(PartOfSpeechTagger.TREETAGGER_CHINESE_TOKENIZER_PATH, Config.get(Config.CHINESE_TOKENIZER_PATH));
            } else if (POSTagger.HUNPOS.equals(posTagger)) {
                partOfSpeechTagger = new HunPosTaggerWrapper();
                putHunPosSettings(settings);
            } else if (POSTagger.NO.equals(posTagger)) {
                partOfSpeechTagger = new AllLanguagesTokenizerWrapper();
                logger.log(Level.INFO, "Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for the selected language. "
                        + "If proper preprocessing for the specified language (." + language.getName() + ") is available, this might results in better "
                        + "temporal tagging quality.");
            }
            // any other (or missing) setting leaves the tagger unset; handled below
            break;
    }

    if (partOfSpeechTagger == null) {
        // Fail with a meaningful message instead of dereferencing null below.
        logger.log(Level.WARNING, "Sorry, but you can't use that tagger.");
        throw new IllegalStateException("No preprocessing available for POS tagger setting '"
                + posTagger + "' and language " + language.getName());
    }

    partOfSpeechTagger.initialize(settings);
    partOfSpeechTagger.process(jcas);
    partOfSpeechTagger.reset();

    logger.log(Level.FINEST, "Part of speech information established");
}

/**
 * Adds the settings needed by the StanfordPOSTaggerWrapper to the given properties.
 */
private void putStanfordPosTaggerSettings(Properties settings) {
    settings.put(PartOfSpeechTagger.STANFORDPOSTAGGER_ANNOTATE_TOKENS, true);
    settings.put(PartOfSpeechTagger.STANFORDPOSTAGGER_ANNOTATE_SENTENCES, true);
    settings.put(PartOfSpeechTagger.STANFORDPOSTAGGER_ANNOTATE_POS, true);
    settings.put(PartOfSpeechTagger.STANFORDPOSTAGGER_MODEL_PATH, Config.get(Config.STANFORDPOSTAGGER_MODEL_PATH));
    settings.put(PartOfSpeechTagger.STANFORDPOSTAGGER_CONFIG_PATH, Config.get(Config.STANFORDPOSTAGGER_CONFIG_PATH));
}

/**
 * Adds the settings needed by the HunPosTaggerWrapper to the given properties.
 */
private void putHunPosSettings(Properties settings) {
    settings.put(PartOfSpeechTagger.HUNPOS_LANGUAGE, language);
    settings.put(PartOfSpeechTagger.HUNPOS_ANNOTATE_TOKENS, true);
    settings.put(PartOfSpeechTagger.HUNPOS_ANNOTATE_POS, true);
    settings.put(PartOfSpeechTagger.HUNPOS_ANNOTATE_SENTENCES, true);
    settings.put(PartOfSpeechTagger.HUNPOS_MODEL_PATH, Config.get(Config.HUNPOS_MODEL_PATH));
}
/**
 * Chooses the default result formatter for the configured output type.
 *
 * @return an XMI formatter if the output type's string form is "xmi",
 *         a TimeML formatter otherwise
 */
private ResultFormatter getFormatter() {
    // Everything that is not explicitly XMI falls back to TimeML
    if (!outputType.toString().equals("xmi")) {
        return new TimeMLResultFormatter();
    }
    return new XMIResultFormatter();
}
/**
* Processes document with HeidelTime, without a document creation time and
* with the default formatter for the configured output type.
*
* @param document Text of the document to annotate
* @return Annotated document
* @throws DocumentCreationTimeMissingException
* Declared for a missing document creation time on document
* types that require one; whether it reaches the caller is
* determined by the three-argument process method this
* delegates to. Use {@link #process(String, Date)} instead to
* provide document creation time!
*/
public String process(String document)
throws DocumentCreationTimeMissingException {
return process(document, null, getFormatter());
}
/**
* Processes document with HeidelTime, using the default formatter for the
* configured output type.
*
* @param document Text of the document to annotate
* @param documentCreationTime
* Date when document was created, may be null
* @return Annotated document
* @throws DocumentCreationTimeMissingException
* Declared for a missing document creation time on document
* types that require one; whether it reaches the caller is
* determined by the three-argument process method this
* delegates to.
*/
public String process(String document, Date documentCreationTime)
throws DocumentCreationTimeMissingException {
return process(document, documentCreationTime, getFormatter());
}
/**
* Processes document with HeidelTime, without a document creation time and
* with a caller-supplied result formatter.
*
* @param document Text of the document to annotate
* @param resultFormatter Formatter used to render the annotated document
* @return Annotated document
* @throws DocumentCreationTimeMissingException
* Declared for a missing document creation time on document
* types that require one; whether it reaches the caller is
* determined by the three-argument process method this
* delegates to.
*/
public String process(String document, ResultFormatter resultFormatter)
throws DocumentCreationTimeMissingException {
return process(document, null, resultFormatter);
}
/**
 * Processes document with HeidelTime.
 *
 * Creates a CAS for the document, adds the document creation time, runs
 * preprocessing and HeidelTime, optionally runs the IntervalTagger, and
 * formats the result. Errors other than a missing document creation time
 * are logged and do not propagate to the caller.
 *
 * @param document
 *            Text of the document to annotate
 * @param documentCreationTime
 *            Date when document was created - especially important if
 *            document is of type {@link DocumentType#NEWS}
 * @param resultFormatter
 *            Formatter used to render the annotated document
 * @return Annotated document, or null if the CAS could not be created or
 *         the result could not be formatted
 * @throws DocumentCreationTimeMissingException
 *             If document creation time is missing when processing a
 *             document of type {@link DocumentType#NEWS} or
 *             {@link DocumentType#COLLOQUIAL}
 */
public String process(String document, Date documentCreationTime, ResultFormatter resultFormatter)
        throws DocumentCreationTimeMissingException {
    logger.log(Level.INFO, "Processing started");

    // Generate jcas object ----------
    logger.log(Level.FINE, "Generate CAS object");
    JCas jcas = null;
    try {
        jcas = jcasFactory.createJCas();
        jcas.setDocumentText(document);
        logger.log(Level.FINE, "CAS object generated");
    } catch (Exception e) {
        e.printStackTrace();
        logger.log(Level.WARNING, "Cas object could not be generated");
    }
    if (jcas == null) {
        // Nothing can be annotated or formatted without a CAS; every later
        // step would only fail on the null reference.
        return null;
    }

    // Process jcas object -----------
    try {
        logger.log(Level.FINER, "Establishing preconditions...");
        provideDocumentCreationTime(jcas, documentCreationTime);
        establishHeidelTimePreconditions(jcas);
        logger.log(Level.FINER, "Preconditions established");

        heidelTime.process(jcas);

        logger.log(Level.INFO, "Processing finished");
    } catch (DocumentCreationTimeMissingException e) {
        // Part of the documented contract: let the caller know that a
        // creation time is required instead of swallowing it below.
        logger.log(Level.WARNING, "Processing aborted due to errors");
        throw e;
    } catch (Exception e) {
        e.printStackTrace();
        logger.log(Level.WARNING, "Processing aborted due to errors");
    }

    // process interval tagging ---
    // null-safe: the flag is unset if this instance was never initialized with one
    if (Boolean.TRUE.equals(doIntervalTagging)) {
        runIntervalTagger(jcas);
    }

    // Process results ---------------
    logger.log(Level.FINE, "Formatting result...");
    String result = null;
    try {
        result = resultFormatter.format(jcas);
        logger.log(Level.INFO, "Result formatted");
    } catch (Exception e) {
        e.printStackTrace();
        logger.log(Level.WARNING, "Result could not be formatted");
    }

    return result;
}
/**
* Command line entry point.
*
* Parses the command line switches, configures logging, resolves encoding,
* output type, language, document type, document creation time, locale,
* configuration file, POS tagger and interval tagging, then reads the input
* document, processes it and prints the result to standard output as UTF-8.
* Invalid arguments terminate the JVM via System.exit(-1).
*
* @param args command line arguments: the input document path plus switches
*/
public static void main(String[] args) {
String docPath = null;
for(int i = 0; i < args.length; i++) { // iterate over cli parameter tokens
if(args[i].startsWith("-")) { // assume we found a switch
// get the relevant enum
CLISwitch sw = CLISwitch.getEnumFromSwitch(args[i]);
if(sw == null) { // unsupported CLI switch
logger.log(Level.WARNING, "Unsupported switch: "+args[i]+". Quitting.");
System.exit(-1);
}
if(sw.getHasFollowingValue()) { // handle values for switches
if(args.length > i+1 && !args[i+1].startsWith("-")) { // we still have an array index after this one and it's not a switch
sw.setValue(args[++i]);
} else { // value is missing or malformed
logger.log(Level.WARNING, "Invalid or missing parameter after "+args[i]+". Quitting.");
System.exit(-1);
}
} else { // activate the value-less switches
sw.setValue(null);
}
} else { // assume we found the document's path/name
// if several non-switch tokens are given, the last one wins
docPath = args[i];
}
}
// display help dialog if HELP-switch is given
if(CLISwitch.HELP.getIsActive()) {
printHelp();
System.exit(0);
}
// start off with the verbosity recognition -- lots of the other
// stuff can be skipped if this is set too high
if(CLISwitch.VERBOSITY2.getIsActive()) {
logger.setLevel(Level.ALL);
logger.log(Level.INFO, "Verbosity: '-vv'; Logging level set to ALL.");
// output the found language resource folders
String languagesList = "";
for(String language : ResourceScanner.getInstance().getDetectedResourceFolders()) {
languagesList += System.getProperty("line.separator") + "- " + language;
}
logger.log(Level.INFO, "Listing detected language folders:" + languagesList);
} else if(CLISwitch.VERBOSITY.getIsActive()) {
logger.setLevel(Level.INFO);
logger.log(Level.INFO, "Verbosity: '-v'; Logging level set to INFO and above.");
} else {
logger.setLevel(Level.WARNING);
// logged at INFO after the level was set to WARNING
logger.log(Level.INFO, "Verbosity -v/-vv NOT FOUND OR RECOGNIZED; Logging level set to WARNING and above.");
}
// Check input encoding
String encodingType = null;
if(CLISwitch.ENCODING.getIsActive()) {
encodingType = CLISwitch.ENCODING.getValue().toString();
logger.log(Level.INFO, "Encoding '-e': "+encodingType);
} else {
// Encoding type not found
// NOTE(review): the message below hard-codes 'UTF-8' while the value used is
// whatever the switch returns when inactive -- confirm that default is UTF-8
encodingType = CLISwitch.ENCODING.getValue().toString();
logger.log(Level.INFO, "Encoding '-e': NOT FOUND OR RECOGNIZED; set to 'UTF-8'");
}
// Check output format
OutputType outputType = null;
if(CLISwitch.OUTPUTTYPE.getIsActive()) {
// NOTE(review): unlike the document type and POS tagger below, an unknown
// value is not caught here, so valueOf's IllegalArgumentException propagates
outputType = OutputType.valueOf(CLISwitch.OUTPUTTYPE.getValue().toString().toUpperCase());
logger.log(Level.INFO, "Output '-o': "+outputType.toString().toUpperCase());
} else {
// Output type not found
outputType = (OutputType) CLISwitch.OUTPUTTYPE.getValue();
logger.log(Level.INFO, "Output '-o': NOT FOUND OR RECOGNIZED; set to "+outputType.toString().toUpperCase());
}
// Check language
Language language = null;
if(CLISwitch.LANGUAGE.getIsActive()) {
language = Language.getLanguageFromString((String) CLISwitch.LANGUAGE.getValue());
// a WILDCARD language is only accepted if a resource folder with its name was detected
if(language == Language.WILDCARD && !ResourceScanner.getInstance().getDetectedResourceFolders().contains(language.getName())) {
logger.log(Level.SEVERE, "Language '-l': "+CLISwitch.LANGUAGE.getValue()+" NOT RECOGNIZED; aborting.");
printHelp();
System.exit(-1);
} else {
logger.log(Level.INFO, "Language '-l': "+language.getName());
}
} else {
// Language not found
language = Language.getLanguageFromString((String) CLISwitch.LANGUAGE.getValue());
logger.log(Level.INFO, "Language '-l': NOT FOUND; set to "+language.toString().toUpperCase());
}
// Check type
DocumentType type = null;
if(CLISwitch.DOCTYPE.getIsActive()) {
try {
if(CLISwitch.DOCTYPE.getValue().equals("narrative")) { // redirect "narrative" to "narratives"
CLISwitch.DOCTYPE.setValue("narratives");
}
type = DocumentType.valueOf(CLISwitch.DOCTYPE.getValue().toString().toUpperCase());
} catch(IllegalArgumentException e) {
logger.log(Level.WARNING, "Type '-t': NOT RECOGNIZED. These are the available options: " + Arrays.asList(DocumentType.values()));
System.exit(-1);
}
logger.log(Level.INFO, "Type '-t': "+type.toString().toUpperCase());
} else {
// Type not found
type = (DocumentType) CLISwitch.DOCTYPE.getValue();
logger.log(Level.INFO, "Type '-t': NOT FOUND; set to "+type.toString().toUpperCase());
}
// Check document creation time
Date dct = null;
if(CLISwitch.DCT.getIsActive()) {
try {
// expected input format is year-month-day
DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
dct = formatter.parse(CLISwitch.DCT.getValue().toString());
logger.log(Level.INFO, "Document Creation Time '-dct': "+dct.toString());
} catch (Exception e) {
// DCT was not parseable
logger.log(Level.WARNING, "Document Creation Time '-dct': NOT RECOGNIZED. Quitting.");
printHelp();
System.exit(-1);
}
} else {
if ((type == DocumentType.NEWS) || (type == DocumentType.COLLOQUIAL)) {
// Dct needed
// the switch's value when inactive is used as fallback date
dct = (Date) CLISwitch.DCT.getValue();
logger.log(Level.INFO, "Document Creation Time '-dct': NOT FOUND; set to local date ("
+ dct.toString() + ").");
} else {
logger.log(Level.INFO, "Document Creation Time '-dct': NOT FOUND; skipping.");
}
}
// Handle locale switch
String locale = (String) CLISwitch.LOCALE.getValue();
Locale myLocale = null;
if(CLISwitch.LOCALE.getIsActive()) {
// check if the requested locale is available
for(Locale l : Locale.getAvailableLocales()) {
if(l.toString().toLowerCase().equals(locale.toLowerCase()))
myLocale = l;
}
try {
// if no available locale matched, myLocale is still null and setDefault
// throws; the catch below turns that into the error message
Locale.setDefault(myLocale); // try to set the locale
logger.log(Level.INFO, "Locale '-locale': "+myLocale.toString());
} catch(Exception e) { // if the above fails, spit out error message and available locales
logger.log(Level.WARNING, "Supplied locale parameter couldn't be resolved to a working locale. Try one of these:");
logger.log(Level.WARNING, Arrays.asList(Locale.getAvailableLocales()).toString()); // list available locales
printHelp();
System.exit(-1);
}
} else {
// no -locale parameter supplied: just show default locale
logger.log(Level.INFO, "Locale '-locale': NOT FOUND, set to environment locale: "+Locale.getDefault().toString());
}
// Read configuration from file
String configPath = CLISwitch.CONFIGFILE.getValue().toString();
try {
logger.log(Level.INFO, "Configuration path '-c': "+configPath);
readConfigFile(configPath);
logger.log(Level.FINE, "Config initialized");
} catch (Exception e) {
// readConfigFile handles file-not-found and I/O errors itself, so only
// other (unchecked) exceptions can arrive here
e.printStackTrace();
logger.log(Level.WARNING, "Config could not be initialized! Please supply the -c switch or "
+ "put a config.props into this directory.");
printHelp();
System.exit(-1);
}
// Set the preprocessing POS tagger
POSTagger posTagger = null;
if(CLISwitch.POSTAGGER.getIsActive()) {
try {
posTagger = POSTagger.valueOf(CLISwitch.POSTAGGER.getValue().toString().toUpperCase());
} catch(IllegalArgumentException e) {
logger.log(Level.WARNING, "Given POS Tagger doesn't exist. Please specify a valid one as listed in the help.");
printHelp();
System.exit(-1);
}
logger.log(Level.INFO, "POS Tagger '-pos': "+posTagger.toString().toUpperCase());
} else {
// Type not found
posTagger = (POSTagger) CLISwitch.POSTAGGER.getValue();
logger.log(Level.INFO, "POS Tagger '-pos': NOT FOUND OR RECOGNIZED; set to "+posTagger.toString().toUpperCase());
}
// Set whether or not to use the Interval Tagger
Boolean doIntervalTagging = false;
if(CLISwitch.INTERVALS.getIsActive()) {
doIntervalTagging = CLISwitch.INTERVALS.getIsActive();
logger.log(Level.INFO, "Interval Tagger '-it': " + doIntervalTagging.toString());
} else {
logger.log(Level.INFO, "Interval Tagger '-it': NOT FOUND OR RECOGNIZED; set to " + doIntervalTagging.toString());
}
// make sure we have a document path
if (docPath == null) {
logger.log(Level.WARNING, "No input file given; aborting.");
printHelp();
System.exit(-1);
}
// Run HeidelTime
RandomAccessFile aFile = null;
MappedByteBuffer buffer = null;
FileChannel inChannel = null;
PrintWriter pwOut = null;
try {
logger.log(Level.INFO, "Reading document using charset: " + encodingType);
// read the whole input file into memory through a read-only mapping
aFile = new RandomAccessFile(docPath, "r");
inChannel = aFile.getChannel();
buffer = inChannel.map(FileChannel.MapMode.READ_ONLY, 0, inChannel.size());
buffer.load();
// NOTE(review): the int cast limits the supported input size to what fits in an int
byte[] inArr = new byte[(int) inChannel.size()];
for(int i = 0; i < buffer.limit(); i++) {
inArr[i] = buffer.get();
}
// double-newstring should not be necessary, but without this, it's not running on Windows (?)
String input = new String(new String(inArr, encodingType).getBytes("UTF-8"), "UTF-8");
// configPath is passed as null here; the configuration was already read above
HeidelTimeStandalone standalone = new HeidelTimeStandalone(language, type, outputType, null, posTagger, doIntervalTagging);
String out = standalone.process(input, dct);
// Print output always as UTF-8
pwOut = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"));
pwOut.println(out);
} catch (Exception e) {
// NOTE(review): failures are only printed; no non-zero exit status is set here
e.printStackTrace();
} finally {
// release everything that was opened, in reverse order of acquisition
if(pwOut != null) {
pwOut.close();
}
if(buffer != null) {
buffer.clear();
}
if(inChannel != null) {
try {
inChannel.close();
} catch (IOException e) { }
}
if(aFile != null) {
try {
aFile.close();
} catch (IOException e) { }
}
}
}
/**
 * Reads the configuration file at the given path and hands its properties
 * to {@link Config}.
 *
 * If the file cannot be opened, a warning is logged and the JVM is
 * terminated with exit status -1. If reading fails, a warning is logged and
 * the configuration is left unchanged. The file handle is closed in every
 * case.
 *
 * @param configPath path to the configuration (properties) file
 */
public static void readConfigFile(String configPath) {
    InputStream configStream = null;
    try {
        logger.log(Level.INFO, "trying to read in file "+configPath);
        configStream = new FileInputStream(configPath);

        Properties props = new Properties();
        props.load(configStream);

        Config.setProps(props);
    } catch (FileNotFoundException e) {
        logger.log(Level.WARNING, "couldn't open configuration file \""+configPath+"\". quitting.");
        System.exit(-1);
    } catch (IOException e) {
        // reading failed; closing is handled (and reported) separately below
        logger.log(Level.WARNING, "couldn't read configuration file \""+configPath+"\"");
        e.printStackTrace();
    } finally {
        // Properties.load leaves the stream open, so close it on every path
        if (configStream != null) {
            try {
                configStream.close();
            } catch (IOException e) {
                logger.log(Level.WARNING, "couldn't close config file handle");
                e.printStackTrace();
            }
        }
    }
}
/**
 * Prints the command line help to standard output: a usage line with the
 * name of the running jar file, followed by all available switches. For the
 * language, POS tagger and document type switches the accepted values are
 * listed as well.
 */
private static void printHelp() {
    // Determine the name of the file this class was loaded from. The code
    // source may be unavailable, so fall back to a placeholder in that case.
    String filename = "<heideltime-standalone-jar>";
    java.security.CodeSource codeSource = HeidelTimeStandalone.class.getProtectionDomain().getCodeSource();
    if (codeSource != null && codeSource.getLocation() != null) {
        String path = codeSource.getLocation().getFile();
        // The location is a URL, whose path is separated by '/' on every
        // platform; '\\' is accepted as well to be lenient.
        int lastSeparator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        filename = path.substring(lastSeparator + 1);
    }

    System.out.println("HeidelTime Standalone");
    System.out.println("Copyright © 2011-2016 Jannik Strötgen");
    System.out.println("This software is free. See the COPYING file for copying conditions.");
    System.out.println();

    System.out.println("Usage:");
    System.out.println("  java -jar "
            + filename
            + " <input-document> [-param1 <value1> ...]");
    System.out.println();
    System.out.println("Parameters and expected values:");
    for (CLISwitch c : CLISwitch.values()) {
        // short switches get a second tab so the descriptions line up
        System.out.println("  "
                + c.getSwitchString()
                + "\t"
                + ((c.getSwitchString().length() > 4) ? "" : "\t")
                + c.getName()
                );

        if (c == CLISwitch.LANGUAGE) {
            System.out.print("\t\t" + "Available languages: [ ");
            for (Language l : Language.values()) {
                if (l != Language.WILDCARD) {
                    System.out.print(l.getName().toLowerCase()+" ");
                }
            }
            System.out.println("]");
        }

        if (c == CLISwitch.POSTAGGER) {
            System.out.print("\t\t" + "Available taggers: [ ");
            for (POSTagger p : POSTagger.values()) {
                System.out.print(p.toString().toLowerCase()+" ");
            }
            System.out.println("]");
        }

        if (c == CLISwitch.DOCTYPE) {
            System.out.print("\t\t" + "Available types: [ ");
            for (DocumentType t : DocumentType.values()) {
                System.out.print(t.toString().toLowerCase()+" ");
            }
            System.out.println("]");
        }
    }

    System.out.println();
}
/**
* @return the document type currently set on this instance
*/
public DocumentType getDocumentType() {
return documentType;
}
/**
* Sets the document type field. Only the field is updated; initialize() is
* not re-run.
*
* @param documentType the document type to use
*/
public void setDocumentType(DocumentType documentType) {
this.documentType = documentType;
}
/**
* @return the language currently set on this instance
*/
public Language getLanguage() {
return language;
}
/**
* Sets the language field. Only the field is updated; initialize() is not
* re-run, so an already created HeidelTime instance is left as it is.
*
* @param language the language to use
*/
public void setLanguage(Language language) {
this.language = language;
}
/**
* @return the output type currently set on this instance
*/
public OutputType getOutputType() {
return outputType;
}
/**
* Sets the output type, which selects the default result formatter.
*
* @param outputType the output type to use
*/
public void setOutputType(OutputType outputType) {
this.outputType = outputType;
}
/**
* @return the POS tagger setting currently used for preprocessing
*/
public final POSTagger getPosTagger() {
return posTagger;
}
/**
* Sets the POS tagger setting used for preprocessing.
*
* @param posTagger the POS tagger to use
*/
public final void setPosTagger(POSTagger posTagger) {
this.posTagger = posTagger;
}
}