/*******************************************************************************
* Copyright 2013
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.csniper.webapp.search.tgrep;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.dao.DataAccessResourceFailureException;
import de.tudarmstadt.ukp.csniper.webapp.evaluation.model.EvaluationItem;
import de.tudarmstadt.ukp.csniper.webapp.search.PreparedQuery;
/**
* This class provides a super-slimmed down API for Tgrep2.
*
* @author Erik-Lân Do Dinh
*
*/
public class TgrepQuery
    implements PreparedQuery
{
    private final Log log = LogFactory.getLog(getClass());

    // Field indices within the per-match comment line, which is split on
    // TgrepEngine.COMMENT_SEPARATOR into (documentId, beginOffset, endOffset).
    private final static int META_DOCUMENT_ID = 0;
    private final static int META_BEGIN_OFFSET = 1;
    private final static int META_END_OFFSET = 2;

    // The "-m" format string passed to tgrep2 below prints exactly four lines per match:
    // comment, full sentence, match begin token index, match end token index.
    private final static int LINES_PER_MATCH = 4;

    // Penn-Treebank-style bracket placeholders occurring in the corpus sentence text.
    private static final String LEFT_BRACKET = "-LRB-";
    private static final String RIGHT_BRACKET = "-RRB-";

    private final TgrepEngine engine;
    private final String type;
    private final String corpus;
    private final String query;

    // Handle on the external tgrep2 process so close() can terminate it.
    private Process tgrep;
    // Number of matches found by the last execute(); -1 until executed.
    private int size = -1;
    // Cap on returned results; negative means unlimited.
    private int maxResults = -1;

    public TgrepQuery(TgrepEngine aEngine, String aType, String aCorpus, String aQuery)
    {
        engine = aEngine;
        type = aType;
        corpus = aCorpus;
        query = aQuery;
    }

    /**
     * @return the total number of matches found by the last {@link #execute()}, independent of
     *         {@link #setMaxResults(int)}, or -1 if the query has not been executed yet.
     */
    @Override
    public int size()
    {
        return size;
    }

    /**
     * Terminates the external tgrep2 process, if one was started.
     */
    @Override
    public void close()
    {
        if (log.isDebugEnabled()) {
            log.debug("Killing Tgrep2 process.");
        }
        if (tgrep != null) {
            tgrep.destroy();
        }
    }

    @Override
    public void setMaxResults(int aMaxResults)
    {
        maxResults = aMaxResults;
    }

    /**
     * Runs the tgrep2 executable against the configured corpus and parses its output into
     * {@link EvaluationItem}s. At most {@link #maxResults} items are returned if a non-negative
     * limit was set; {@link #size()} always reflects the full match count.
     *
     * @return the matches as evaluation items.
     * @throws DataAccessResourceFailureException
     *             if the process cannot be started, reports errors, or produces malformed output.
     */
    @Override
    public List<EvaluationItem> execute()
    {
        List<String> output = new ArrayList<String>();
        final List<String> error = new ArrayList<String>();

        BufferedReader brInput = null;
        BufferedReader brError = null;
        try {
            List<String> cmd = new ArrayList<String>();
            File exe = engine.getTgrepExecutable();
            if (!exe.canExecute() && !exe.setExecutable(true)) {
                // Fail fast with a clear message instead of letting ProcessBuilder.start()
                // produce a less helpful one. setExecutable() returns false on failure.
                throw new IOException("Tgrep2 binary [" + exe.getAbsolutePath()
                        + "] is not executable and could not be made executable.");
            }
            cmd.add(exe.getAbsolutePath());
            // specify corpus
            cmd.add("-c");
            cmd.add(engine.getCorpusPath(corpus));
            // only one match per sentence
            cmd.add("-f");
            // print options: comment, full sentence, match begin token index, match end token
            // index -- one line each, i.e. LINES_PER_MATCH lines per match
            cmd.add("-m");
            cmd.add("%c\\n%tw\\n%ym\\n%zm\\n");
            // pattern to search for
            cmd.add(query);

            if (log.isTraceEnabled()) {
                log.trace("Invoking [" + StringUtils.join(cmd, " ") + "]");
            }

            final ProcessBuilder pb = new ProcessBuilder(cmd);
            tgrep = pb.start();
            brInput = new BufferedReader(new InputStreamReader(tgrep.getInputStream(), "UTF-8"));
            brError = new BufferedReader(new InputStreamReader(tgrep.getErrorStream(), "UTF-8"));

            // Drain stderr on a separate thread. Reading stdout to EOF before touching stderr
            // (as a single thread would) can deadlock: if tgrep2 fills the stderr pipe buffer
            // while we are still blocked on stdout, both processes stall forever.
            final BufferedReader err = brError;
            Thread stderrDrainer = new Thread(new Runnable()
            {
                @Override
                public void run()
                {
                    try {
                        String errLine;
                        while ((errLine = err.readLine()) != null) {
                            if (log.isErrorEnabled()) {
                                log.error(errLine);
                            }
                            error.add(errLine);
                        }
                    }
                    catch (IOException e) {
                        // Stream closed or process died; the main thread reports the failure.
                        log.debug("Error draining Tgrep2 stderr.", e);
                    }
                }
            });
            stderrDrainer.start();

            String line;
            while ((line = brInput.readLine()) != null) {
                if (log.isTraceEnabled()) {
                    log.trace("<< " + line);
                }
                output.add(line);
            }

            // join() establishes a happens-before edge, so reading "error" below is safe
            // without further synchronization.
            stderrDrainer.join();

            if (!error.isEmpty()) {
                throw new IOException(StringUtils.join(error, " "));
            }
        }
        catch (InterruptedException e) {
            // Restore the interrupt flag for callers higher up the stack.
            Thread.currentThread().interrupt();
            throw new DataAccessResourceFailureException(
                    "Interrupted while reading Tgrep2 output.", e);
        }
        catch (IOException e) {
            throw new DataAccessResourceFailureException("Unable to start Tgrep process.", e);
        }
        finally {
            IOUtils.closeQuietly(brInput);
            IOUtils.closeQuietly(brError);
        }

        size = output.size() / LINES_PER_MATCH;

        if (maxResults >= 0 && size > maxResults) {
            return parseOutput(output.subList(0, LINES_PER_MATCH * maxResults));
        }
        else {
            return parseOutput(output);
        }
    }

    /**
     * Converts the raw tgrep2 output (four lines per match, see {@link #LINES_PER_MATCH}) into
     * {@link EvaluationItem}s.
     *
     * @param aOutput the raw output lines; its size must be a multiple of LINES_PER_MATCH.
     * @return one item per match.
     * @throws DataAccessResourceFailureException if the output is structurally malformed.
     */
    private List<EvaluationItem> parseOutput(List<String> aOutput)
    {
        List<EvaluationItem> items = new ArrayList<EvaluationItem>();

        if (aOutput.size() % LINES_PER_MATCH > 0) {
            throw new DataAccessResourceFailureException("Tgrep2 produced [" + aOutput.size()
                    + "] output lines, but should have produced a multiple of [" + LINES_PER_MATCH
                    + "].");
        }
        else {
            String[] comment;
            String text;
            int tokenBeginIndex;
            int tokenEndIndex;
            for (Iterator<String> it = aOutput.iterator(); it.hasNext();) {
                // comment - strip the leading "# " marker, then split into documentId,
                // beginOffset, endOffset
                comment = it.next().substring(2).split(TgrepEngine.COMMENT_SEPARATOR);
                if (comment.length < 3) {
                    throw new DataAccessResourceFailureException(
                            "The corpus contains a malformed comment line ["
                                    + StringUtils.join(comment, " ,") + "].");
                }
                String documentId = comment[META_DOCUMENT_ID];
                int beginOffset = parseOffset(comment[META_BEGIN_OFFSET], "begin offset");
                int endOffset = parseOffset(comment[META_END_OFFSET], "end offset");

                // text string - trim and replace bracket placeholders
                text = it.next().trim();
                text = StringUtils.replace(text, LEFT_BRACKET, "(");
                text = StringUtils.replace(text, RIGHT_BRACKET, ")");

                // token index of first token in match (tgrep indices are 1-based, make them
                // 0-based)
                tokenBeginIndex = parseOffset(it.next(), "match begin token index") - 1;
                // token index of last token in match (tgrep indices are 1-based, make them 0-based)
                tokenEndIndex = parseOffset(it.next(), "match end token index") - 1;

                // set corpus position to -1; this is cqp specific and we don't use it atm
                EvaluationItem item = new EvaluationItem(corpus, documentId, type, beginOffset,
                        endOffset, text);

                // text-based (i.e. sentence-based) offsets (+1 to skip the whitespace itself).
                // NB: for tokenBeginIndex == 0, ordinalIndexOf returns -1 and +1 yields 0, which
                // is exactly the start of the first token - intentional, do not "fix".
                int matchBegin = StringUtils.ordinalIndexOf(text, " ", tokenBeginIndex) + 1;
                int matchEnd = StringUtils.ordinalIndexOf(text, " ", tokenEndIndex + 1);
                item.setMatchOnItemText(matchBegin, matchEnd);
                item.setMatchOnOriginalTextViaTokenIndicesAndLookGoodWhileDoingSo(tokenBeginIndex,
                        tokenEndIndex);

                items.add(item);
            }
        }
        return items;
    }

    /**
     * Parses a numeric field from the tgrep2 output, wrapping parse failures in the same
     * exception type used for other malformed-output conditions (instead of leaking a bare
     * {@link NumberFormatException} without context).
     */
    private static int parseOffset(String aValue, String aWhat)
    {
        try {
            return Integer.parseInt(aValue.trim());
        }
        catch (NumberFormatException e) {
            throw new DataAccessResourceFailureException("Tgrep2 produced a malformed " + aWhat
                    + " [" + aValue + "].", e);
        }
    }
}