/******************************************************************************* * Copyright 2013 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package de.tudarmstadt.ukp.csniper.webapp.analysis.uima; import static org.apache.uima.util.CasCreationUtils.mergeTypeSystems; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createPrimitive; import static org.apache.uima.fit.factory.TypePrioritiesFactory.createTypePriorities; import static org.apache.uima.fit.factory.TypeSystemDescriptionFactory.createTypeSystemDescription; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.Serializable; import java.net.MalformedURLException; import java.util.ArrayList; import java.util.List; import javax.xml.parsers.ParserConfigurationException; import org.apache.commons.io.FileUtils; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.SystemUtils; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.cas.CAS; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.metadata.TypePriorities; import org.apache.uima.resource.metadata.TypeSystemDescription; import org.apache.uima.util.CasCreationUtils; import org.apache.wicket.spring.injection.annot.SpringBean; import org.dom4j.io.SAXContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; import de.tudarmstadt.ukp.csniper.webapp.support.uima.AnalysisEngineFactory; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; //import de.tudarmstadt.ukp.csniper.textmarker.TextmarkerDescriptorCreator; //import de.uniwue.tm.textmarker.engine.TextMarkerEngine; //import de.tudarmstadt.ukp.dkpro.core.stanford.tsurgeon.TsurgeonTransformer; //import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordParser; //import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter; public class ParsingPipeline implements Serializable { private static final long serialVersionUID = 3411626870840060929L; private static final int TRANSFORMATION_TREGEX = 0; private static final String TEXTMARKER_FILENAME = "tmScript"; private static final String TREGEX_FILENAME = "tregexScript.xml"; private File output_html; private File output_dump; private String CLASSPATH; private String TEXTMARKER_BASE; private String TREGEX_BASE; private String PACKAGE; @SpringBean(name = "customAnalysisEngineFactory") private AnalysisEngineFactory aef; public ParsingPipeline() { try { PACKAGE = this.getClass().getPackage().getName(); CLASSPATH = ResourceUtils.getUrlAsFile( ResourceUtils.resolveLocation("classpath:/", this, null), false) .getAbsolutePath(); TEXTMARKER_BASE = CLASSPATH + "/textmarker"; TREGEX_BASE = CLASSPATH + "/tregex"; output_html = File.createTempFile("output", ".html"); output_dump = File.createTempFile("output", ".txt"); } catch (IOException e) { e.printStackTrace(); } } // public ParsingPipeline(String parser, String language, String // scriptingEngine, // String inputText, String script, String[] markedAnnotations, String[] // markerColors) // { // try { // PACKAGE = this.getClass().getPackage().getName(); // = // "de.tudarmstadt.ukp.experiments.erik.wicket" // // CLASSPATH = System.getProperty("loewe.ncc.home"); // CLASSPATH = // ResourceUtils.getUrlAsFile(ResourceUtils.resolveLocation("classpath:/", // this, null), false).getAbsolutePath(); // TEXTMARKER_BASE = CLASSPATH + "/textmarker"; // TREGEX_BASE = CLASSPATH + "/tregex"; // // output_html = File.createTempFile("output", ".html"); // output_dump = File.createTempFile("output", ".txt"); // output_png = File.createTempFile("output", ".png"); // // TODO resolve markedAnnotations so that short names can be used in // // scripts instead of fully-qualified names // // // shortNames = new HashMap<String,Type>(); // // Iterator<Type> ti = ts.getTypeIterator(); // // while(ti.hasNext()) { // // Type t = ti.next(); // // if(!shortNames.containsKey(t.getShortName())) { // // shortNames.put(t.getShortName(), t); // // // System.out.println("OK: " + t.getName() + " [" + // // t.getShortName() + "]"); // // } else { // // System.out.println("WARNING: " + t.getName() + " [" + // // t.getShortName() + "] is already in the map as " + // // shortNames.get(t.getShortName()).getName()); // // } // // } // // // // // // if(tokenType == null) { // // System.out.println("ERROR [token]"); // // } else { // // System.out.println("GOOD [token]: " + tokenType.getName() + " [" // // + tokenType.getShortName()); // // } // // if(sentenceType == null) { // // System.out.println("ERROR [token]"); // // } else { // // System.out.println("GOOD [token]: " + sentenceType.getName() + // // " [" + sentenceType.getShortName()); // // } // // types = new ArrayList<Type>(); // // for(String shortName : markedAnnotations) { // // Type type = shortNames.get(shortName); // // if(type != null) { // // if(!shortNames.containsKey(shortName)) // // System.out.println("rofl."); // // types.add(type); // // } else { // // // TODO: some message, proceed with other types // // // throw new XYZ // // System.out.println("Type [" + shortName + // // "] could not be found (use shortname!)"); // // } // // } // // // JCas jcas = parseInput(parser, language, inputText); // // runScript(jcas, scriptingEngine, script, markedAnnotations, // markerColors); // } // catch (Exception e) { // e.printStackTrace(); // } // } public CAS parseInput(String parser, String language, String inputText) { if (inputText == null) { inputText = ""; } try { AnalysisEngine seg = AnalysisEngineFactory.createAnalysisEngine( AnalysisEngineFactory.SEGMENTER, "createSentences", true); AnalysisEngine par = AnalysisEngineFactory.createAnalysisEngine( AnalysisEngineFactory.PARSER, "language", language); // fill cas and create DocumentMetaData CAS cas = CasCreationUtils.createCas(createTypeSystemDescription(), null, null); DocumentMetaData.create(cas); cas.setDocumentText(inputText); cas.setDocumentLanguage(language); // tokenize seg.process(cas); // parse par.process(cas); return cas; } catch (Exception e) { throw new RuntimeException(e); } } // public void runScript(CAS cas, String tool, String script, String[] types, String[] colors) // { // try { // // merge old and new (script) typesystem // TypeSystemDescription scriptTsd = getTypeSystemDescription(tool, script); // TypeSystemDescription oldTsd = TypeSystemUtil.typeSystem2TypeSystemDescription(cas // .getTypeSystem()); // TypeSystemDescription merged = mergeTypeSystems(Arrays.asList(scriptTsd, oldTsd)); // // // create a new cas with the appropriate merged typesystem // CAS dest = CasCreationUtils.createCas(merged, null, null); // DocumentMetaData.create(dest); // // copy the contents of the old cas into the newly created one // CasCopier.copyCas(cas, dest, true); // // AnalysisEngine scr = getScriptingEngine(tool, script); // // AnalysisEngine cm = createPrimitive(HTMLColorMarkerConsumer.class, // HTMLColorMarkerConsumer.PARAM_MARKED_TYPES, types, // HTMLColorMarkerConsumer.PARAM_MARKER_COLORS, colors, // HTMLColorMarkerConsumer.PARAM_OUTPUT_FILE, output_html.getAbsolutePath()); // // AnalysisEngine pc = createPrimitive(SimpleDumpWriter.class, // SimpleDumpWriter.PARAM_OUTPUT_FILE, output_dump.getAbsolutePath()); // // scr.process(dest); // // cm.process(dest); // cm.collectionProcessComplete(); // // pc.process(dest); // pc.collectionProcessComplete(); // } // catch (Throwable e) { // throw new RuntimeException(e); // } // } // // private TypeSystemDescription getTypeSystemDescription(String engine, String script) // throws ResourceInitializationException, MalformedURLException // { // List<TypeSystemDescription> descriptions = new ArrayList<TypeSystemDescription>(); // descriptions.add(createTypeSystemDescription("BasicTypeSystem")); // descriptions.add(createTypeSystemDescription("InternalTypeSystem")); // descriptions.add(createTypeSystemDescription()); // // if (engine.equals("textmarker")) { // TypeSystemDescription tmTsd = TextmarkerDescriptorCreator.createTypeSystem(PACKAGE, // TEXTMARKER_FILENAME, script); // descriptions.add(tmTsd); // } // return mergeTypeSystems(descriptions); // } // // /** // * Creates an AnalysisEngineDescription according to the given identifier // * string. // * // * @param tool // * @param tmScript // * @param tregexScript // * @return the appropriate AnalysisEngine // * @throws ResourceInitializationException // * @throws IOException // * @throws SAXException // * @throws ParserConfigurationException // */ // private AnalysisEngine getScriptingEngine(String tool, String script) // throws ResourceInitializationException, IOException, SAXException, // ParserConfigurationException // { // if (tool.equals("textmarker")) { // script = "PACKAGE " + PACKAGE + ";\n" + script; // return getTextmarkerEngine(script); // } // else if (tool.equals("tregex")) { // // TODO: make more than one transformation possible // List<Transformation> transformations = new ArrayList<Transformation>(); // Transformation t = new Transformation(script); // transformations.add(t); // createTRegexScript(transformations); // AnalysisEngineDescription scriptingEngine = createPrimitiveDescription( // TsurgeonTransformer.class, TsurgeonTransformer.PARAM_CASCADING_TRANSFORMATIONS, // true, TsurgeonTransformer.PARAM_TRANSFORMATION_FILE_NAME, TREGEX_BASE // + "/transformations/" + TREGEX_FILENAME, // TsurgeonTransformer.PARAM_SAVE_SOURCE_SENTENCES, false, // TsurgeonTransformer.PARAM_SAVE_UNCHANGED_TREES, true); // // TsurgeonTransformer.PARAM_ANNOTATE_APPLIED_TRANSFORMATIONS, // // true); // // // The part starting with a splitter and ending with a merger has to // // be in its own aggregate. // // The flow controller of the aggregate has to drop the intermediate CASes. // FlowControllerDescription fcd = createFlowControllerDescription( // FixedFlowController.class, // FixedFlowController.PARAM_ACTION_AFTER_CAS_MULTIPLIER, "drop"); // AnalysisEngineDescription embedded = createAggregateDescription(fcd, scriptingEngine); // // // The final merged CAS, however, needs to be returned from the aggregate // embedded.getAnalysisEngineMetaData().getOperationalProperties() // .setOutputsNewCASes(true); // // return createAggregate(embedded); // } // else { // throw new IllegalArgumentException("tool has to be textmarker or tregex"); // } // } // // /** // * Creates a TRegex transformation xml-file from given transformations. // * // * @param transformations // * a list of transformations from which to build a script // * @throws IOException // * @throws SAXException // */ // private void createTRegexScript(List<Transformation> transformations) // throws IOException, SAXException // { // SAXContentHandler handler = new SAXContentHandler(); // // handler.startDocument(); // handler.startElement("", "transformations", "", new AttributesImpl()); // for (Transformation transformation : transformations) { // transformation.toXML(handler); // } // handler.endElement("", "transformations", ""); // handler.endDocument(); // // File f = new File(TREGEX_BASE + "/transformations/" + TREGEX_FILENAME); // FileUtils.writeStringToFile(f, handler.getDocument().asXML(), "UTF-8"); // } // // /** // * Represents a TRegex Transformation, including name, tregex and multiple operations. // * // */ // private static class Transformation // { // private final String name; // private final String tregex; // private final List<String> operations; // // public Transformation(String inputScript) // { // String[] lines = StringUtils.split(inputScript, SystemUtils.LINE_SEPARATOR); // if (lines.length < 2) { // throw new IllegalArgumentException( // "The input script has to consist of at least two lines: tregex pattern, operation."); // } // name = "transformation_1"; // tregex = lines[TRANSFORMATION_TREGEX] == null ? "" : lines[TRANSFORMATION_TREGEX]; // operations = new ArrayList<String>(); // for (int i = 1; i < lines.length; i++) { // if (lines[i] == null) { // lines[i] = ""; // } // operations.add(lines[i]); // } // } // // /** // * Attaches this transformation to the given SAXContentHandler. // * // * @param handler // * the handler on which to call the events on // * @throws SAXException // */ // public void toXML(SAXContentHandler handler) // throws SAXException // { // handler.startElement("", "transformation", "", new AttributesImpl()); // // handler.startElement("", "name", "", new AttributesImpl()); // handler.characters(name.toCharArray(), 0, name.toCharArray().length); // handler.endElement("", "name", ""); // // handler.startElement("", "tregex", "", new AttributesImpl()); // handler.startCDATA(); // handler.characters(tregex.toCharArray(), 0, tregex.toCharArray().length); // handler.endCDATA(); // handler.endElement("", "tregex", ""); // // for (String operation : operations) { // handler.startElement("", "operation", "", new AttributesImpl()); // handler.startCDATA(); // handler.characters(operation.toCharArray(), 0, operation.toCharArray().length); // handler.endCDATA(); // handler.endElement("", "operation", ""); // } // // handler.endElement("", "transformation", ""); // } // } // // /** // * Creates the TextMarker Analysis Engine. // * // * @return a TextMarker Analysis Engine // * @throws ResourceInitializationException // * @throws FileNotFoundException // * @throws SAXException // * @throws IOException // * @throws ParserConfigurationException // */ // private AnalysisEngine getTextmarkerEngine(String aScript) // throws ResourceInitializationException, FileNotFoundException, SAXException, IOException, // ParserConfigurationException // { // // Extract types declared in script // TypeSystemDescription merged = getTypeSystemDescription("textmarker", aScript); // // // Set up type priorities // TypePriorities prios = createTypePriorities(new String[] { // "de.uniwue.tm.textmarker.kernel.type.TextMarkerFrame", "uima.tcas.Annotation", // "de.uniwue.tm.textmarker.kernel.type.TextMarkerBasic" }); // // // Store script to a temporary location // File tmFile = new File(TEXTMARKER_BASE + "/script/" + TEXTMARKER_FILENAME + ".tm"); // FileUtils.writeStringToFile(tmFile, aScript, "UTF-8"); // // // Finally create the engine // return createPrimitive(TextMarkerEngine.class, merged, prios, TextMarkerEngine.MAIN_SCRIPT, // TEXTMARKER_FILENAME, TextMarkerEngine.SCRIPT_PATHS, new String[] { TEXTMARKER_BASE // + "/script/" }, TextMarkerEngine.RESOURCE_PATHS, // new String[] { TEXTMARKER_BASE + "/resources/" }, // TextMarkerEngine.ADDITIONAL_SCRIPTS, new String[0], // TextMarkerEngine.ADDITIONAL_ENGINES, new String[0], // TextMarkerEngine.CREATE_STYLE_MAP, false, // // TextMarkerEngine.CREATE_DEBUG_INFO, true, // // TODO find out what this filter does; it seems to have no // // effect on the generated annotations? // TextMarkerEngine.DEFAULT_FILTERED_TYPES, new String[] { "de.uniwue.tm.type.SPACE", // "de.uniwue.tm.type.NBSP", "de.uniwue.tm.type.BREAK", // "de.uniwue.tm.type.MARKUP" }); // } // TODO: do this in-memory, i.e. don't create a html file dump public String getHTML() { String output = ""; try { output = FileUtils.readFileToString(output_html, "UTF-8"); } catch (IOException e) { output = "An error occurred while trying to read the output html file."; } return output; } // TODO this is only a quick hack public String getColorMap(String[] markedTypes, String[] markerColors) { StringBuffer sb = new StringBuffer(); int typeCount = Math.min(markerColors.length, markedTypes.length); // Map<String, String> colors = new HashMap<String, String>(); for (int i = 0; i < typeCount; i++) { sb.append(markedTypes[i] + " is colored <span style=\"color:#DDDDDD; background-color:" + markerColors[i] + ";\">" + markerColors[i] + "</span><br />"); } return sb.toString(); } public String getDump() { String output; try { output = FileUtils.readFileToString(output_dump, "UTF-8"); } catch (IOException e) { output = "An error occurred while trying to read the analysis output file."; } return output; } }