/******************************************************************************* * Copyright 2013 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package de.tudarmstadt.ukp.csniper.webapp.search.cqp; import static de.tudarmstadt.ukp.csniper.webapp.search.cqp.ContextUnit.ATTR_BEGIN; import static de.tudarmstadt.ukp.csniper.webapp.search.cqp.ContextUnit.ATTR_END; import static de.tudarmstadt.ukp.csniper.webapp.search.cqp.ContextUnit.ATTR_ID; import static de.tudarmstadt.ukp.csniper.webapp.search.cqp.ContextUnit.E_SENTENCE; import static de.tudarmstadt.ukp.csniper.webapp.search.cqp.ContextUnit.E_TEXT; import static org.apache.commons.lang.StringUtils.join; import static org.apache.commons.lang.StringUtils.substringAfterLast; import static org.apache.commons.lang.StringUtils.substringBeforeLast; import java.io.BufferedReader; import java.io.Closeable; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.springframework.dao.DataAccessException; import org.springframework.dao.DataAccessResourceFailureException; import 
org.springframework.dao.InvalidDataAccessResourceUsageException;

import de.tudarmstadt.ukp.csniper.webapp.evaluation.model.EvaluationItem;
import de.tudarmstadt.ukp.csniper.webapp.search.PreparedQuery;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;

/**
 * This class provides a super-slimmed down API for CQP. It starts a CQP child process, sends
 * commands to it line by line and reads the answers back until an end-of-output marker is seen.
 * <p>
 * A {@code CqpQuery} owns one CQP process; call {@link #close()} when done with it.
 *
 * @author Erik-Lân Do Dinh
 *
 */
public class CqpQuery
    implements PreparedQuery, Closeable
{
    private static final String CQP_VERSION_PREFIX = "CQP version ";
    /** Line printed by CQP in response to {@code .EOL.}; marks the end of a command's output. */
    private static final String CQP_EOL = "-::-EOL-::-";
    /** Separator between a token and its appended positional attributes in CQP output. */
    private static final String SEP = "/";
    /** Name under which every query result is stored inside CQP. */
    private static final String STD_QUERY_NAME = "Q";

    private final Log log = LogFactory.getLog(getClass());

    private final CqpEngine engine;
    private final String type;
    private final String corpus;
    private String macrosLocation;

    private int leftContext = 5;
    private int rightContext = 5;
    private ContextUnit contextUnit = ContextUnit.WORD;

    private String leftDelim = "--%%%--";
    private String rightDelim = "--%%%--";

    /** Lines read from the CQP process' stderr during the most recent command. */
    private List<String> error;
    private String version;

    /** {@code true} only after the most recent {@link #runQuery(String)} completed normally. */
    private boolean querySuccess = false;

    private Process cqpProcess;

    private int maxResults = 1000;
    private int timeout = 10 * 1000;

    /**
     * Constructs a CqpQuery: starts the CQP process, activates the given corpus and applies the
     * session options this class relies on.
     *
     * @param aEngine
     *            engine providing the CQP executable, registry path, encoding and macro location
     * @param aType
     *            type which is stored in every {@link EvaluationItem} created by this query
     * @param aCorpus
     *            name of the corpus to activate; must not be {@code null}
     * @throws InvalidDataAccessResourceUsageException
     *             if {@code aCorpus} is {@code null} or CQP reports an error during setup
     * @throws DataAccessResourceFailureException
     *             if the CQP process cannot be started
     */
    public CqpQuery(CqpEngine aEngine, String aType, String aCorpus)
    {
        if (aCorpus == null) {
            throw new InvalidDataAccessResourceUsageException("Corpus cannot be null.");
        }

        engine = aEngine;
        type = aType;
        corpus = aCorpus;
        error = new ArrayList<String>();

        cqpProcess = getCQPProcess();
        try {
            initializeSession();
        }
        catch (RuntimeException e) {
            // Nobody can call close() on a half-constructed object, so the child process has to
            // be terminated here or it would be leaked.
            cqpProcess.destroy();
            throw e;
        }
    }

    /**
     * Sends the obligatory start-up commands to the freshly started CQP process.
     */
    private void initializeSession()
    {
        // -- set obligatory options --
        // corpus
        List<String> output = exec(corpus);
        if (output.size() > 0) {
            version = StringUtils.substringAfter(output.get(0), CQP_VERSION_PREFIX);
        }

        // add macro definitions
        if (engine.getMacrosLocation() != null) {
            setMacrosLocation(engine.getMacrosLocation());
        }

        // set default delimiters (can be changed)
        setLeftDelim(leftDelim);
        setRightDelim(rightDelim);

        // show positional attributes
        send("show +" + ATTR_BEGIN);
        send("show +" + ATTR_END);
        send("set PrintStructures \"" + E_TEXT + "_" + ATTR_ID + "\"");

        // activate progressbar (essential, because we stop reading at EOL, which occurs after
        // the progress messages)
        send("set ProgressBar on");
    }

    /**
     * Sets the timeout which is handed to the reader used for reading CQP output.
     *
     * @param aTimeout
     *            the timeout value (the default is {@code 10 * 1000})
     */
    public void setTimeout(int aTimeout)
    {
        timeout = aTimeout;
    }

    public int getTimeout()
    {
        return timeout;
    }

    /**
     * Sends a query to cqp. The query is restricted to matches within one sentence and its
     * result is stored under a fixed name, replacing the result of any previous query.
     *
     * @param aQuery
     *            query which shall be sent to cqp
     * @throws DataAccessException
     *             if reading the answer fails or CQP reports an error
     */
    public void runQuery(String aQuery)
        throws DataAccessException
    {
        // Reset first so that a failing query does not leave the flag set from an earlier one.
        querySuccess = false;
        exec(STD_QUERY_NAME + " = " + aQuery + " within " + E_SENTENCE);
        querySuccess = true;
    }

    /**
     * Sends a size command to cqp.
     *
     * @return size of the last query sent to cqp via runQuery(); 0 (with a warning in the log)
     *         if no query has been run successfully
     * @throws DataAccessException
     *             if CQP does not answer with exactly one line or reports an error
     */
    @Override
    public int size()
        throws DataAccessException
    {
        if (!querySuccess) {
            log.warn("A query has to be run via runQuery() before size() can be called.");
            return 0;
        }

        List<String> output = exec("size " + STD_QUERY_NAME);
        if (output.size() != 1) {
            throw new InvalidDataAccessResourceUsageException(
                    "'size' did not output the expected amount of lines [1]; was ["
                            + output.size() + "].");
        }
        return Integer.parseInt(output.get(0));
    }

    /**
     * Sends a cat command to cqp.
     *
     * @param aSize
     *            maximum of result lines cat should deliver
     * @return result of the last query sent to cqp via runQuery()
     * @throws DataAccessException
     *             if reading the answer fails or CQP reports an error
     */
    public List<EvaluationItem> cat(int aSize)
        throws DataAccessException
    {
        // cat takes an inclusive range of result indices, hence aSize - 1 as upper bound
        List<String> output = exec("cat " + STD_QUERY_NAME + " 0 " + (aSize - 1));
        return parseOutput(output);
    }

    /**
     * Searches for a sentence (represented by the given EvaluationItem) in cqp, and returns it with
     * context of a given size.
     * <p>
     * The session's context, delimiter and display settings are changed for the lookup and are
     * restored afterwards, also when the lookup fails. The lookup replaces the stored result of
     * the last {@link #runQuery(String)}.
     *
     * @param aItem
     *            containing the sentence and its position in the corpus to search for
     * @param aContextSize
     *            size of the context window to return (in sentences)
     * @return a list of sentences
     */
    public List<String> getContextAround(EvaluationItem aItem, int aContextSize)
    {
        int oldLeftContext = leftContext;
        int oldRightContext = rightContext;
        ContextUnit oldContextUnit = contextUnit;
        String oldLeftDelim = leftDelim;
        String oldRightDelim = rightDelim;

        // context has to have +1 to account for the item/sentence itself
        setContext(aContextSize + 1, aContextSize + 1, ContextUnit.SENTENCE);
        setLeftDelim("");
        setRightDelim("");
        send("show +" + E_SENTENCE);
        send("show -" + ATTR_BEGIN);
        send("show -" + ATTR_END);

        try {
            // get the match for the first token of the item in the containing text and expand
            // it to cover its whole sentence
            runQuery("[begin=\"" + aItem.getBeginOffset() + "\"] :: match.text_id=\""
                    + aItem.getDocumentId() + "\" expand to 1 sentence");
            return exec("cat " + STD_QUERY_NAME);
        }
        finally {
            // reset params, even if the query failed; otherwise later cat() calls would produce
            // output which parseOutput() cannot handle
            setContext(oldLeftContext, oldRightContext, oldContextUnit);
            setLeftDelim(oldLeftDelim);
            setRightDelim(oldRightDelim);
            send("show -" + E_SENTENCE);
            send("show +" + ATTR_BEGIN);
            send("show +" + ATTR_END);
        }
    }

    /**
     * Sends an exit command to cqp; also destroys the cqp process.<br>
     * After exiting, this CqpQuery cannot be used anymore.
     */
    @Override
    public void close()
    {
        if (log.isDebugEnabled()) {
            log.debug("Killing CQP backend process");
        }
        send("exit");
        cqpProcess.destroy();
    }

    /**
     * Executes a cqp command and collects everything cqp prints to stdout until the end-of-output
     * marker is reached. Afterwards stderr is checked for error messages.
     *
     * @param aCmd
     *            command you want to send to cqp
     * @return output of cqp triggered by the command
     * @throws DataAccessException
     *             if reading from cqp fails or cqp wrote something to stderr
     */
    private List<String> exec(String aCmd)
        throws DataAccessException
    {
        List<String> output = new ArrayList<String>();
        try {
            // the .EOL. is essential for checking whether we are finished reading
            send(aCmd + ";.EOL.");

            TimeoutReader reader = new TimeoutReader(new InputStreamReader(
                    cqpProcess.getInputStream(), engine.getEncoding(corpus)));
            reader.setTimeout(timeout);

            String line;
            while ((line = reader.readLine()) != null) {
                if (line.equals(CQP_EOL)) {
                    if (log.isTraceEnabled()) {
                        log.trace(CQP_EOL);
                    }
                    break;
                }
                if (log.isTraceEnabled()) {
                    log.trace("<< " + line);
                }
                output.add(line);
            }
        }
        catch (IOException e) {
            // keep the cause so that the original stack trace is not lost
            throw new InvalidDataAccessResourceUsageException(e.getMessage(), e);
        }

        checkError();
        return output;
    }

    /**
     * Checks the stderr for errors thrown by cqp. Only lines which are available right now are
     * read; they are logged, stored (see {@link #getError()}) and reported as exception.
     *
     * @throws InvalidDataAccessResourceUsageException
     *             if reading stderr fails or at least one line was read from it
     */
    private void checkError()
        throws InvalidDataAccessResourceUsageException
    {
        // Start with a fresh list: errors of an earlier command must not make every following
        // command fail. A new list (instead of clear()) leaves lists handed out earlier intact.
        error = new ArrayList<String>();

        try {
            BufferedReader stderr = new BufferedReader(new InputStreamReader(
                    cqpProcess.getErrorStream(), engine.getEncoding(corpus)));
            while (stderr.ready()) {
                String line = stderr.readLine();
                if (log.isErrorEnabled()) {
                    log.error(line);
                }
                error.add(line);
            }
        }
        catch (IOException e) {
            throw new InvalidDataAccessResourceUsageException(e.getMessage(), e);
        }

        if (!error.isEmpty()) {
            throw new InvalidDataAccessResourceUsageException(join(error, "\n"));
        }
    }

    /**
     * Writes one command line to the stdin of the cqp process, appending the terminating
     * {@code ;} if it is missing. No answer is read.
     *
     * @param aLine
     *            the command to send
     */
    private void send(String aLine)
    {
        // NOTE(review): this writer uses the platform default charset while all readers use
        // engine.getEncoding(corpus) - presumably both should agree; confirm and align.
        PrintWriter pw = new PrintWriter(cqpProcess.getOutputStream());
        pw.println(aLine + (!aLine.endsWith(";") ? ";" : ""));
        pw.flush();
        if (log.isTraceEnabled()) {
            log.trace(">> " + aLine);
        }
    }

    /**
     * Converts the lines printed by a cat command into evaluation items. Lines which do not have
     * the expected form are skipped, as are matches whose text (including context) does not fit
     * into {@link EvaluationItem#MAX_COLUMN_LENGTH}.
     *
     * @param aOutput
     *            lines as returned by cqp
     * @return one item per usable line, in the order of the input
     */
    private List<EvaluationItem> parseOutput(List<String> aOutput)
    {
        List<EvaluationItem> items = new ArrayList<EvaluationItem>();

        // groups: 1 = leading number, 2 = document id, 3 = left context, 4 = match,
        // 5 = right context; the delimiters are quoted as they may contain regex meta characters
        String regexp = "\\s*(\\d+):\\s*<" + E_TEXT + "_" + ATTR_ID + "\\s(.+)>:\\s*(.*?)"
                + Pattern.quote(leftDelim) + "(.*?)" + Pattern.quote(rightDelim) + "(.*?)";
        Matcher m = Pattern.compile(regexp).matcher("");

        // parse results and create EvaluationItems
        for (String line : aOutput) {
            m.reset(line);
            if (!m.matches()) {
                if (log.isDebugEnabled()) {
                    log.debug("Regexp [" + regexp + "] did not match on [" + line + "]");
                }
                continue;
            }

            String documentId = m.group(2).trim();
            String lc = m.group(3).trim();
            String match = m.group(4).trim();
            String rc = m.group(5).trim();

            // the offsets have to be extracted before the attributes are stripped from the
            // tokens below
            int begin = getBegin(lc, match);
            int originalMatchBegin = getBegin("", match);
            int end = getEnd(rc, match);
            int originalMatchEnd = getEnd("", match);

            if (!lc.isEmpty()) {
                lc = getText(lc).trim() + " ";
            }
            match = getText(match);
            if (!rc.isEmpty()) {
                rc = " " + getText(rc).trim();
            }

            String coveredText = lc + match + rc;
            if (coveredText.length() < EvaluationItem.MAX_COLUMN_LENGTH) {
                EvaluationItem item = new EvaluationItem(corpus, documentId, type, begin, end,
                        coveredText);
                item.setMatchOnItemText(lc.length(), lc.length() + match.length());
                item.setMatchOnOriginalText(originalMatchBegin, originalMatchEnd);
                items.add(item);
            }
            else {
                log.warn("Ignored oversized match in collection [" + corpus + "] document ["
                        + documentId + "] at [" + begin + "-" + end + "]");
            }
        }

        return items;
    }

    /**
     * Strips the last two {@code /}-separated fields from every space-separated token.
     *
     * @param aText
     *            tokens with appended attributes, separated by single spaces
     * @return the tokens without the two trailing fields, joined by single spaces
     */
    private String getText(String aText)
    {
        String[] tokens = aText.split(" ");
        for (int i = 0; i < tokens.length; i++) {
            // take the string before the penultimate "/"
            tokens[i] = substringBeforeLast(substringBeforeLast(tokens[i], SEP), SEP);
        }
        return StringUtils.join(tokens, " ");
    }

    /**
     * Reads the number between the two last {@code /} of the first token.
     *
     * @param lc
     *            left context; if empty, the match is used instead
     * @param match
     *            the match
     * @return the parsed number
     * @throws NumberFormatException
     *             if the token does not carry a number at that position
     */
    private int getBegin(String lc, String match)
    {
        // if lc is empty, use match; use the first token
        String first = lc.length() > 0 ? lc.split(" ")[0] : match.split(" ")[0];
        // take the digits between the two last "/"
        return Integer.parseInt(substringAfterLast(substringBeforeLast(first, SEP), SEP));
    }

    /**
     * Reads the number after the last {@code /} of the given text, i.e. of its last token.
     *
     * @param rc
     *            right context; if empty, the match is used instead
     * @param match
     *            the match
     * @return the parsed number
     * @throws NumberFormatException
     *             if the text does not end with a number after the last {@code /}
     */
    private int getEnd(String rc, String match)
    {
        // if rc is empty, use match; just take the digits after the last "/"
        String digits = substringAfterLast(rc.length() > 0 ? rc : match, SEP);
        return Integer.parseInt(digits);
    }

    /**
     * Starts the cqp executable configured in the engine with the engine's registry path.
     *
     * @return the started process
     * @throws DataAccessResourceFailureException
     *             if the process cannot be started
     */
    private Process getCQPProcess()
        throws DataAccessResourceFailureException
    {
        try {
            List<String> cmd = new ArrayList<String>();
            cmd.add(engine.getCqpExecutable().getAbsolutePath());
            cmd.add("-r");
            cmd.add(engine.getRegistryPath().getAbsolutePath());
            // run cqp as child process (-c)
            cmd.add("-c");

            if (log.isTraceEnabled()) {
                log.trace("Invoking [" + StringUtils.join(cmd, " ") + "]");
            }

            final ProcessBuilder pb = new ProcessBuilder(cmd);
            return pb.start();
        }
        catch (IOException e1) {
            throw new DataAccessResourceFailureException("Unable to start CQP process", e1);
        }
    }

    /**
     * @return the lines cqp wrote to stderr during the most recently executed command
     */
    public List<String> getError()
    {
        return error;
    }

    public int getLeftContext()
    {
        return leftContext;
    }

    private void setLeftContext(int aLeftContext)
    {
        leftContext = aLeftContext;
    }

    public int getRightContext()
    {
        return rightContext;
    }

    private void setRightContext(int aRightContext)
    {
        rightContext = aRightContext;
    }

    public ContextUnit getContextUnit()
    {
        return contextUnit;
    }

    private void setContextUnit(ContextUnit aContextUnit)
    {
        contextUnit = aContextUnit;
    }

    public String getLeftDelim()
    {
        return leftDelim;
    }

    /**
     * Stores the delimiter printed before a match and sends it to cqp.
     *
     * @param aLeftDelim
     *            the new left delimiter
     */
    public void setLeftDelim(String aLeftDelim)
    {
        leftDelim = aLeftDelim;
        send("set LeftKWICDelim '" + leftDelim + "'");
    }

    public String getRightDelim()
    {
        return rightDelim;
    }

    /**
     * Stores the delimiter printed after a match and sends it to cqp.
     *
     * @param aRightDelim
     *            the new right delimiter
     */
    public void setRightDelim(String aRightDelim)
    {
        rightDelim = aRightDelim;
        send("set RightKWICDelim '" + rightDelim + "'");
    }

    public CqpEngine getEngine()
    {
        return engine;
    }

    public String getCorpus()
    {
        return corpus;
    }

    public String getVersion()
    {
        return version;
    }

    public String getMacrosLocation()
    {
        return macrosLocation;
    }

    /**
     * Stores the macro location and tells cqp to load macro definitions from it. If the location
     * cannot be resolved to a file, a warning is logged and nothing is sent to cqp.
     *
     * @param aMacrosLocation
     *            location of the macro definition file
     */
    public void setMacrosLocation(String aMacrosLocation)
    {
        macrosLocation = aMacrosLocation;
        try {
            send("define macro < '"
                    + ResourceUtils.getUrlAsFile(
                            ResourceUtils.resolveLocation(macrosLocation, this, null), true)
                            .getAbsolutePath() + "'");
        }
        catch (IOException e) {
            log.warn("Macro file could not be found: " + e);
        }
    }

    /**
     * Sets the context window of cqp. Negative sizes are stored, but not sent to cqp.
     *
     * @param aLeft
     *            size of left context window
     * @param aRight
     *            size of right context window
     * @param aContextUnit
     *            unit of context window
     */
    public void setContext(int aLeft, int aRight, ContextUnit aContextUnit)
    {
        setLeftContext(aLeft);
        setRightContext(aRight);
        setContextUnit(aContextUnit);

        if (leftContext >= 0) {
            send("set LeftContext " + leftContext + " " + contextUnit);
        }
        if (rightContext >= 0) {
            send("set RightContext " + rightContext + " " + contextUnit);
        }
    }

    public int getMaxResults()
    {
        return maxResults;
    }

    @Override
    public void setMaxResults(int aMaxResults)
    {
        maxResults = aMaxResults;
    }

    /**
     * Returns the first {@link #getMaxResults()} results of the last query sent via
     * {@link #runQuery(String)}.
     */
    @Override
    public List<EvaluationItem> execute()
    {
        return cat(maxResults);
    }
}