/* * #%L * NICTA t3as MetaMap Tagger * %% * Copyright (C) 2014 NICTA * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program. If not, see * <http://www.gnu.org/licenses/gpl-3.0.html>. * * Additional permission under GNU GPL version 3 section 7 * * If you modify this Program, or any covered work, by linking or combining * it with H2, GWT, or JavaBeans Activation Framework (JAF) (or a * modified version of those libraries), containing parts covered by the * terms of the H2 License, the GWT Terms, or the Common Development and * Distribution License (CDDL) version 1.0 ,the licensors of this Program * grant you additional permission to convey the resulting work. * #L% */ package org.t3as.metamap; import com.google.common.collect.ImmutableList; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; import org.t3as.metamap.options.Option; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.List; public final class MetaMap { private final File publicMm; private final Collection<Option> opts; public MetaMap(final File publicMm, final Collection<Option> opts) { this.publicMm = publicMm; this.opts = ImmutableList.copyOf(opts); } public boolean process(final File input, final File output) throws IOException, InterruptedException { // put the options together final List<String> o = new ArrayList<>(); o.add(publicMm.getAbsolutePath() + "/bin/metamap13"); for (final Option opt : opts) { o.add(opt.toMmOptStr()); } // set the input and output o.add("--XMLf1"); o.add(input.getAbsolutePath()); o.add(output.getAbsolutePath()); // run MetaMap to produce XML output final ProcessBuilder pb = new ProcessBuilder(); pb.command(o.toArray(new String[o.size()])); pb.directory(publicMm); pb.inheritIO(); final long start = System.currentTimeMillis(); final Process mm = pb.start(); final int exitCode = mm.waitFor(); final long end = System.currentTimeMillis(); System.out.printf("MetaMap processing finished in %,d milliseconds.\n", end - start); return exitCode == 0; } /** * Takes a Unicode string and tries to decompose non-7bit-ascii (Unicode Basic Latin) characters into 7bit ascii. * For example, the string 'âåäöốở' is turned into 'aaaooo'. * Note that it doesn't always succeed for some of the much more complicated characters (e.g. 'µ'). * Occasionally some complicated characters end up as two characters when the ASCIIFoldingFilter is used... * Perhaps we want to adopt this library: * http://www.ippatsuman.com/projects/junidecode/ */ public static String decomposeToAscii(final String s) { /* pure java version, doesn't work all the time: String normalized = Normalizer.normalize(s, Normalizer.Form.NFD); return normalized.replaceAll("\\p{InCombiningDiacriticalMarks}+", ""); */ // this works on more cases final char[] input = new char[s.length()]; s.getChars(0, s.length(), input, 0); final char[] output = new char[input.length * 4]; final int numChars = ASCIIFoldingFilter.foldToASCII(input, 0, output, 0, input.length); // now remove anything not in the printable US-ASCII range, but keep newlines final StringBuilder sb = new StringBuilder(numChars); for (int i = 0; i < numChars; i++) { final char c = output[i]; // printable US-ASCII is from 32 to 126 if ((32 <= c && c <= 126) || '\n' == c) sb.append(c); } return sb.toString(); } }