/**
 * Settings class with variables and helper methods to use with TreeTaggerWrapper
 */
package de.unihd.dbs.uima.annotator.treetagger;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;

/**
 * @author Julian Zell
 */
public class TreeTaggerProperties {
	public static final String FLUSH_SEQUENCE = "\n.\n.\n.\n.\n.\n(\n)\n.\n.\n.\n.\n";
	public static final String STARTOFTEXT = "<This-is-the-start-of-the-text />";
	public static final String ENDOFTEXT = "<This-is-the-end-of-the-text />";

	// TreeTagger language name for the par files
	public String languageName = null;

	// absolute path of the TreeTagger installation
	public String rootPath = null;

	// files for the tokenizer and the part-of-speech tagger (standard values)
	public String tokScriptName = null;
	public String parFileName = null;
	public String abbFileName = null;

	// English, Italian, and French tagger models require additional splits (see the tagger README)
	public String languageSwitch = null;

	// Perl needs a special switch for UTF-8-encoded input/output (see http://perldoc.perl.org/perlrun.html#Command-Switches -C).
	// The input text is read in HeidelTimeStandalone.java and always translated into UTF-8,
	// i.e., the switch is always "-CSD".
	public String utf8Switch = "-CSD";

	// save system-specific separators for string generation
	public String newLineSeparator = System.getProperty("line.separator");
	public String fileSeparator = System.getProperty("file.separator");

	// Chinese tokenizer path
	public File chineseTokenizerPath = null;

	/**
	 * Creates a process that runs the TreeTagger tokenizer script with the configured parameters on the given file.
	 *
	 * Deprecated: We use TreeTaggerTokenizer in the same package nowadays, which implements the utf8-tokenize.perl
	 * script from the TreeTagger package. This fixes some issues with Perl's Unicode handling.
	 * @param inputFile file containing the text to be tokenized
	 * @return the started tokenizer process
	 * @throws IOException if the process cannot be started
	 */
	@Deprecated
	public Process getTokenizationProcess(File inputFile) throws IOException {
		// assemble a command line for the tokenization script and execute it
		ArrayList<String> command = new ArrayList<String>();

		command.add("perl");
		if(this.utf8Switch != null && !this.utf8Switch.isEmpty())
			command.add(this.utf8Switch);
		command.add(this.rootPath + this.fileSeparator + "cmd" + this.fileSeparator + this.tokScriptName);
		if(this.languageSwitch != null && !this.languageSwitch.isEmpty())
			command.add(this.languageSwitch);
		if(new File(this.rootPath + this.fileSeparator + "lib" + this.fileSeparator, this.abbFileName).exists()) {
			command.add("-a");
			command.add(this.rootPath + this.fileSeparator + "lib" + this.fileSeparator + this.abbFileName);
		}
		command.add(inputFile.getAbsolutePath());

		String[] commandStr = new String[command.size()];
		command.toArray(commandStr);

		return Runtime.getRuntime().exec(commandStr);
	}

	public Process getChineseTokenizationProcess() throws IOException {
		// assemble a command line for the tokenization script and execute it
		ArrayList<String> command = new ArrayList<String>();

		// read the segmenter script from disk so it can be patched and handed to perl via -e
		BufferedReader br = new BufferedReader(new InputStreamReader(
				new FileInputStream(new File(this.chineseTokenizerPath, "segment-zh.pl"))));
		String segmenterScript = "";
		String buf = null;
		boolean firstLine = true;
		// this dirty hack is to force the script to autoflush its buffers. thanks, Perl
		while((buf = br.readLine()) != null) {
			// omit comment lines
			if(buf.startsWith("#"))
				continue;

			// set the lexicon variables to absolute paths inside the tokenizer directory
			if(buf.startsWith("$lexicon="))
				buf = "$lexicon=\"" + new File(this.chineseTokenizerPath, "lcmc-uni2.dat").getAbsolutePath().replaceAll("\\\\", "/") + "\";";
			if(buf.startsWith("$lexicon2="))
				buf = "$lexicon2=\"" + new File(this.chineseTokenizerPath, "lcmc-bigrams2.dat").getAbsolutePath().replaceAll("\\\\", "/") + "\";";

			// replace double quotes with single quotes; since single-quoted Perl strings do not
			// interpolate \n, also turn '\n' literals into chr(10)
			buf = buf.replaceAll("\"", "'");
			buf = buf.replaceAll("'\\\\n'", "chr(10)");

			// add the autoflush variable ($| = 1) at the very beginning of the script
			if(firstLine) {
				segmenterScript += "$| = 1;";
				firstLine = false;
			}

			segmenterScript += buf;
		}
		br.close();

		command.add("perl");
		command.add("-X");
		command.add("-e");
		command.add(segmenterScript);

		String[] commandStr = new String[command.size()];
		command.toArray(commandStr);

		ProcessBuilder builder = new ProcessBuilder(commandStr);
		builder.directory(this.chineseTokenizerPath);
		return builder.start();
	}

	public Process getTreeTaggingProcess(File inputFile) throws IOException {
		// assemble a command line based on the configuration and execute the POS tagging
		ArrayList<String> command = new ArrayList<String>();

		command.add(this.rootPath + this.fileSeparator + "bin" + this.fileSeparator + "tree-tagger");
		command.add(this.rootPath + this.fileSeparator + "lib" + this.fileSeparator + this.parFileName);
		command.add(inputFile.getAbsolutePath());
		command.add("-no-unknown");

		String[] commandStr = new String[command.size()];
		command.toArray(commandStr);

		return Runtime.getRuntime().exec(commandStr);
	}

	public Process getTreeTaggingProcess() throws IOException {
		// assemble a command line based on the configuration and execute the POS tagging
		ArrayList<String> command = new ArrayList<String>();

		command.add(this.rootPath + this.fileSeparator + "bin" + this.fileSeparator + "tree-tagger");
		command.add(this.rootPath + this.fileSeparator + "lib" + this.fileSeparator + this.parFileName);
		command.add("-no-unknown");
		command.add("-quiet");
		command.add("-token");
		command.add("-sgml");

		String[] commandStr = new String[command.size()];
		command.toArray(commandStr);

		ProcessBuilder builder = new ProcessBuilder(commandStr);
		builder.redirectErrorStream(true);
		return builder.start();
	}
}
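
/*
 * Hypothetical usage sketch (illustration only, not part of the original class): a caller such as
 * the TreeTaggerWrapper annotator would fill in the public fields, start the tagging process and
 * then stream tokens through it. Only the field, constant and method names are taken from this
 * class; the paths, the sample tokens and the stream handling below are assumptions.
 *
 *   TreeTaggerProperties props = new TreeTaggerProperties();
 *   props.rootPath = "/opt/treetagger";   // assumed TreeTagger installation directory
 *   props.parFileName = "english.par";    // assumed parameter file under <rootPath>/lib
 *
 *   Process tagger = props.getTreeTaggingProcess();
 *   try (BufferedWriter toTagger = new BufferedWriter(
 *            new OutputStreamWriter(tagger.getOutputStream(), StandardCharsets.UTF_8));
 *        BufferedReader fromTagger = new BufferedReader(
 *            new InputStreamReader(tagger.getInputStream(), StandardCharsets.UTF_8))) {
 *       toTagger.write(TreeTaggerProperties.STARTOFTEXT + "\n");
 *       toTagger.write("Hello\nworld\n");                     // one token per line
 *       toTagger.write(TreeTaggerProperties.ENDOFTEXT + "\n");
 *       toTagger.write(TreeTaggerProperties.FLUSH_SEQUENCE);  // presumably forces tree-tagger to flush its output
 *       toTagger.close();                                     // end of input; the tagger terminates
 *       String line;
 *       while ((line = fromTagger.readLine()) != null) {
 *           // with -token and -sgml, each output line is expected to be "token<TAB>tag",
 *           // with the SGML start/end markers echoed unchanged
 *           System.out.println(line);
 *       }
 *   }
 */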