/* * Copyright 2011 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.imscwb; import static org.apache.commons.io.FileUtils.deleteQuietly; import static org.apache.commons.io.FileUtils.forceMkdir; import static org.apache.commons.io.FileUtils.listFiles; import static org.apache.commons.io.FilenameUtils.removeExtension; import static org.apache.commons.lang.StringUtils.join; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.Feature; import org.apache.uima.cas.Type; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.CasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; /** * This Consumer outputs the content of all CASes into the IMS workbench format. * * This writer produces a text file which needs to be converted to the binary IMS CWB index files * using the command line tools that come with the CWB. * * It is possible to set the parameter {@link #PARAM_CQP_HOME} to directly create output in the * native binary CQP format via the original CWB command line tools. * */ @MimeTypeCapability({MimeTypes.TEXT_X_IMSCWB}) @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) public class ImsCwbWriter extends JCasAnnotator_ImplBase { public static final String E_SENTENCE = "s"; public static final String E_TEXT = "text"; public static final String E_DOCUMENT = "document"; public static final String ATTR_BEGIN = "begin"; public static final String ATTR_END = "end"; public static final String ATTR_POS = "pos"; public static final String ATTR_CPOS = "cpos"; public static final String ATTR_LEMMA = "lemma"; public static final String ATTR_ID = "id"; public static final String ATTR_URI = "uri"; /** * Location to which the output is written. */ public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) private File outputFile; /** * Character encoding of the output data. */ public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = "UTF-8") private String encoding; /** * Write the document ID for each token. It is usually a better idea to generate a * {@link #PARAM_WRITE_DOCUMENT_TAG document tag} or a {@link #PARAM_WRITE_TEXT_TAG text tag} * which also contain the document ID that can be queried in CQP. */ public static final String PARAM_WRITE_DOC_ID = "writeDocId"; @ConfigurationParameter(name = PARAM_WRITE_DOC_ID, mandatory = true, defaultValue = "false") private boolean writeDocId; /** * Write part-of-speech tags. */ public static final String PARAM_WRITE_POS = "writePOS"; @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") private boolean writePOS; /** * Write coarse-grained part-of-speech tags. These are the simple names of the UIMA types used * to represent the part-of-speech tag. */ public static final String PARAM_WRITE_CPOS = "writeCPOS"; @ConfigurationParameter(name = PARAM_WRITE_CPOS, mandatory = true, defaultValue = "false") private boolean writeCPOS; /** * Write lemmata. */ public static final String PARAM_WRITE_LEMMA = "writeLemma"; @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") private boolean writeLemma; /** * Write a pseudo-XML tag with the name {@code document} to mark the start and end of a * document. */ public static final String PARAM_WRITE_DOCUMENT_TAG = "writeDocumentTag"; @ConfigurationParameter(name = PARAM_WRITE_DOCUMENT_TAG, mandatory = true, defaultValue = "false") private boolean writeDocumentTag; /** * Write a pseudo-XML tag with the name {@code text} to mark the start and end of a document. * This is used by CQPweb. */ public static final String PARAM_WRITE_TEXT_TAG = "writeTextTag"; @ConfigurationParameter(name = PARAM_WRITE_TEXT_TAG, mandatory = true, defaultValue = "true") private boolean writeTextTag; /** * Write the start and end position of each token. */ public static final String PARAM_WRITE_OFFSETS = "writeOffsets"; @ConfigurationParameter(name = PARAM_WRITE_OFFSETS, mandatory = true, defaultValue = "false") private boolean writeOffsets; /** * Write additional token-level annotation features. These have to be given as an array of fully * qualified feature paths (fully.qualified.classname/featureName). The names for these * annotations in CQP are their lowercase shortnames. */ public static final String PARAM_ADDITIONAL_FEATURES = "additionalFeatures"; @ConfigurationParameter(name = PARAM_ADDITIONAL_FEATURES, mandatory = false) private String[] additionalFeatures; /** * Make document IDs compatible with CQPweb. CQPweb demands an id consisting of only letters, * numbers and underscore. */ public static final String PARAM_CQPWEB_COMPATIBILITY = "cqpwebCompatibility"; @ConfigurationParameter(name = PARAM_CQPWEB_COMPATIBILITY, mandatory = true, defaultValue = "false") private boolean cqpwebCompatibility; /** * Set this parameter to the directory containing the cwb-encode and cwb-makeall commands if you * want the write to directly encode into the CQP binary format. */ public static final String PARAM_CQP_HOME = "cqpHome"; @ConfigurationParameter(name = PARAM_CQP_HOME, mandatory = false) private File cqpHome; /** * Set this parameter to compress the token streams and the indexes using cwb-huffcode and * cwb-compress-rdx. With modern hardware, this may actually slow down queries, so we turn it * off by default. If you have large data sets, you best try yourself what works best for you. * (default: false) */ public static final String PARAM_CQP_COMPRESS = "cqpCompress"; @ConfigurationParameter(name = PARAM_CQP_COMPRESS, mandatory = true, defaultValue = "false") private boolean cqpCompress; /** * The name of the generated corpus. */ public static final String PARAM_CORPUS_NAME = "corpusName"; @ConfigurationParameter(name = PARAM_CORPUS_NAME, mandatory = true, defaultValue = "corpus") private String corpusName; public static final String PARAM_SENTENCE_TAG = "sentenceTag"; @ConfigurationParameter(name = PARAM_SENTENCE_TAG, mandatory = true, defaultValue = E_SENTENCE) private String sentenceTag; private static final String LS = "\n"; private static final String TAB = "\t"; private Writer bw; private int currentId; private Process childProcess; private File dataDirectory; private File registryDirectory; @Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); try { File parentFile = outputFile.getParentFile(); if (parentFile != null) { forceMkdir(parentFile); } } catch (IOException e) { throw new ResourceInitializationException(e); } try { bw = getWriter(); } catch (IOException e) { throw new ResourceInitializationException(e); } currentId = 0; } @Override public void process(JCas jcas) throws AnalysisEngineProcessException { String documentId = DocumentMetaData.get(jcas).getDocumentId(); String documentUri = DocumentMetaData.get(jcas).getDocumentUri(); // CQPweb demands an id consisting of only letters, numbers and underscore if (cqpwebCompatibility) { // if the documentTag is written as well keep the id, else use the uri instead if (writeDocumentTag) { if (documentId == null || documentId.length() == 0) { documentId = Integer.toString(currentId); } documentId = documentId.replaceAll("[^\\d\\w_]", "_"); } else { if (documentUri == null || documentUri.length() == 0) { documentUri = Integer.toString(currentId); } documentId = documentUri.replaceAll("[^\\d\\w_]", "_"); } } try { if (writeTextTag) { startElement(E_TEXT, ATTR_ID, documentId); } if (writeDocumentTag) { startElement(E_DOCUMENT, ATTR_URI, documentUri); } for (Sentence sentence : select(jcas, Sentence.class)) { attendChildProceess(); startElement(sentenceTag); for (Token token : selectCovered(jcas, Token.class, sentence)) { // write token bw.write(escapeXml(token.getCoveredText())); // write pos tag if (writePOS) { field(token.getPos() != null ? token.getPos().getPosValue() : "-"); } // write coarse grained pos tag if (writeCPOS) { field(token.getPos() != null ? token.getPos().getType().getShortName() : "-"); } // write lemma if (writeLemma) { field(token.getLemma() != null ? token.getLemma().getValue() : "-"); } // write doc-id if (writeDocId) { field(documentId); } // write offsets if (writeOffsets) { field(String.valueOf(token.getBegin())); field(String.valueOf(token.getEnd())); } // write additional tags if (additionalFeatures != null) { // try { for (String featurePath : additionalFeatures) { String val = getCoveredAnnotationFeatureValue(featurePath, token); field(val); } // } } bw.write(LS); } endElement(sentenceTag); } if (writeDocumentTag) { endElement(E_DOCUMENT); } if (writeTextTag) { endElement(E_TEXT); } currentId++; } catch (IOException e) { throw new AnalysisEngineProcessException(e); } } private void startElement(String aElement, String... aAttributes) throws IOException { bw.write('<'); bw.write(aElement); if (aAttributes != null && aAttributes.length > 0) { bw.write(" "); for (int i = 0; i < aAttributes.length; i += 2) { bw.write(aAttributes[i]); bw.write("=\""); bw.write(escapeXml(aAttributes[i + 1])); bw.write('"'); } } bw.write('>'); bw.write(LS); } private void endElement(String aElement) throws IOException { bw.write("</"); bw.write(aElement); bw.write('>'); bw.write(LS); } private void field(String aValue) throws IOException { bw.write(TAB); bw.write(escapeXml(aValue)); } private Writer getWriter() throws IOException { if (cqpHome != null) { dataDirectory = new File(outputFile, "data"); registryDirectory = new File(outputFile, "registry"); forceMkdir(dataDirectory); forceMkdir(registryDirectory); List<String> cmd = new ArrayList<String>(); cmd.add(new File(cqpHome, "cwb-encode").getAbsolutePath()); cmd.add("-c"); cmd.add(getCwbCharset(encoding)); // -x XML-aware (replace XML entities and ignore <!.. and <?..) cmd.add("-x"); // -s skip empty lines in input data (recommended) cmd.add("-s"); // -B strip leading/trailing blanks from (input lines & token annotations) cmd.add("-B"); // -d <dir> directory for data files created by ./cwb-encode cmd.add("-d"); cmd.add(dataDirectory.getPath()); // -R <rf> create registry entry (named <rf>) listing all encoded attributes cmd.add("-R"); cmd.add(new File(registryDirectory, corpusName).getPath()); // -P <att> declare additional p-attribute <att> if (writePOS) { cmd.add("-P"); cmd.add(ATTR_POS); } if (writeCPOS) { cmd.add("-P"); cmd.add(ATTR_CPOS); } if (writeLemma) { cmd.add("-P"); cmd.add(ATTR_LEMMA); } if (writeDocId) { cmd.add("-P"); cmd.add(ATTR_URI); } if (writeOffsets) { cmd.add("-P"); cmd.add(ATTR_BEGIN); cmd.add("-P"); cmd.add(ATTR_END); } if (additionalFeatures != null) { for (String featurePath : additionalFeatures) { String[] segments = featurePath.split("/", 2); if (segments.length != 2) { throw new IllegalArgumentException("Given feature path is malformed: [" + featurePath + "] (exactly one \"/\" (slash) must exist)."); } String typeName = segments[0]; String featureName = segments.length > 1 ? segments[1] : ""; String name = (StringUtils.substringAfterLast(typeName, ".") + "_" + featureName) .toLowerCase(); cmd.add("-P"); cmd.add(name); } } if (writeDocumentTag) { cmd.add("-S"); cmd.add(E_DOCUMENT + ":0+" + ATTR_URI); } if (writeTextTag) { cmd.add("-S"); cmd.add(E_TEXT + ":0+" + ATTR_ID); } { cmd.add("-S"); cmd.add(sentenceTag + ":0"); } getLogger().info("Spawning cwb-encode: " + join(cmd, " ")); final ProcessBuilder pb = new ProcessBuilder(); pb.command(cmd); childProcess = pb.start(); return new OutputStreamWriter(childProcess.getOutputStream(), encoding); } else { return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile), encoding)); } } private void attendChildProceess() { if (childProcess != null) { try { InputStream stdout = childProcess.getInputStream(); if (stdout.available() > 0) { byte[] data = new byte[stdout.available()]; stdout.read(data); getLogger().info(new String(data, "UTF-8")); } InputStream stderr = childProcess.getErrorStream(); if (stderr.available() > 0) { byte[] data = new byte[stderr.available()]; stderr.read(data); getLogger().error(new String(data, "UTF-8")); } } catch (IOException e) { getLogger().error("Unable to communicate with child process"); } } } @Override public void collectionProcessComplete() throws AnalysisEngineProcessException { IOUtils.closeQuietly(bw); if (childProcess != null) { try { childProcess.waitFor(); attendChildProceess(); childProcess = null; } catch (InterruptedException e) { throw new AnalysisEngineProcessException(e); } runCwbCommand("cwb-makeall", "-r", registryDirectory.getPath(), "-V", corpusName.toUpperCase()); if (cqpCompress) { // Compress the token sequence of a positional attribute. Creates .huf, .hcd, // and .huf.syn files, which replace the corresponding .corpus files. After // running this tool successfully, the .corpus files can be deleted. runCwbCommand("cwb-huffcode", "-r", registryDirectory.getPath(), "-A", corpusName.toUpperCase()); for (File f : listFiles(dataDirectory, new String[] { "huf" }, false)) { deleteQuietly(new File(removeExtension(f.getPath()) + ".corpus")); } // Compress the index of a positional attribute. Creates .crc and .crx files // which replace the corresponding .corpus.rev and .corpus.rdx files. After // running this tool successfully, the latter files can be deleted. runCwbCommand("cwb-compress-rdx", "-r", registryDirectory.getPath(), "-A", corpusName.toUpperCase()); for (File f : listFiles(dataDirectory, new String[] { "crc" }, false)) { deleteQuietly(new File(removeExtension(f.getPath()) + ".corpus.rev")); deleteQuietly(new File(removeExtension(f.getPath()) + ".corpus.rdx")); } } } } private void runCwbCommand(String aCommand, String... aArguments) throws AnalysisEngineProcessException { try { List<String> args = new ArrayList<String>(aArguments.length + 1); args.add(new File(cqpHome, aCommand).getAbsolutePath()); for (String arg : aArguments) { args.add(arg); } ProcessBuilder pb = new ProcessBuilder(args); getLogger().info("Spawning " + aCommand + ": " + join(args, " ")); childProcess = pb.start(); childProcess.waitFor(); } catch (InterruptedException e) { throw new AnalysisEngineProcessException(e); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } finally { attendChildProceess(); childProcess = null; } } private static Map<String, String> CHARSET_MAPPING = new HashMap<String, String>(); static { CHARSET_MAPPING.put("ISO-8859-1", "latin1"); CHARSET_MAPPING.put("UTF-8", "utf8"); } private static String getCwbCharset(String aEncoding) { String enc = CHARSET_MAPPING.get(aEncoding); if (enc == null) { throw new IllegalArgumentException("Encoding [" + enc + "] not supported by CWB."); } return enc; } private static String escapeXml(String aString) { return aString.replaceAll("&", "&").replaceAll("<", "<").replaceAll(">", ">") .replaceAll("\"", """).replaceAll("'", "'"); } /** * Get the feature value of an annotation which is covered by another annotation. * * @param aFeaturePath * The fully qualified feature path of the feature in question: * your.package.and.annotation.class.name/featureName * @param aCoveringAnnotation * The annotation that covers the annotation for which the feature value should be * extracted. * @return the feature value if a feature name is given; coveredText if only the annotation is * given */ public String getCoveredAnnotationFeatureValue(String aFeaturePath, AnnotationFS aCoveringAnnotation) { String[] segments = aFeaturePath.split("/", 2); if (segments.length != 2) { throw new IllegalArgumentException("Given feature path is malformed: [" + aFeaturePath + "] (exactly one \"/\" (slash) must exist)."); } String typeName = segments[0]; String featureName = segments[1]; Type type = CasUtil.getAnnotationType(aCoveringAnnotation.getCAS(), typeName); Feature feature = type.getFeatureByBaseName(featureName); if (feature == null) { throw new IllegalArgumentException("Feature [" + featureName + "] is not defined for type [" + type + "] (check lower/uppercase spelling)."); } List<AnnotationFS> covered = CasUtil.selectCovered(type, aCoveringAnnotation); switch (covered.size()) { case 0: if (getLogger().isWarnEnabled()) { getLogger().warn( "There is no annotation of type [" + typeName + "] available which is covered by [" + aCoveringAnnotation + "], returning empty string."); } return ""; case 1: return covered.get(0).getFeatureValueAsString(feature); default: if (getLogger().isWarnEnabled()) { getLogger().warn( "There are multiple annotations of type [" + typeName + "] available which are covered by [" + aCoveringAnnotation + "], returning the first."); } return covered.get(0).getFeatureValueAsString(feature); } } }