/*******************************************************************************
* Copyright 2013
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.csniper.webapp.search.xmi;
import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.fit.factory.TypeSystemDescriptionFactory.createTypeSystemDescription;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.uima.UIMAException;
import org.apache.uima.UIMAFramework;
import org.apache.uima.cas.impl.CASImpl;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.internal.util.Timer;
import org.apache.uima.jcas.JCas;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.fit.util.JCasUtil;
import org.xml.sax.SAXException;
import de.tudarmstadt.ukp.csniper.webapp.evaluation.model.EvaluationItem;
import de.tudarmstadt.ukp.csniper.webapp.evaluation.model.ItemContext;
import de.tudarmstadt.ukp.csniper.webapp.search.ContextProvider;
import de.tudarmstadt.ukp.csniper.webapp.search.CorpusService;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
/**
 * {@link ContextProvider} that builds the textual context of an {@link EvaluationItem} from a
 * gzip-compressed XMI file located at
 * {@code <repositoryPath>/<COLLECTION_ID>/xmi/<documentId>.xmi.gz}.
 * <p>
 * Each thread keeps one {@link JCas} together with the collection/document id that was last
 * loaded into it, so consecutive requests for the same document on the same thread skip the
 * deserialization step.
 */
public class XmiContextProvider
    implements ContextProvider
{
    /** Per-thread CAS plus the identity of the document currently loaded into it. */
    private static final ThreadLocal<JCasState> jcasThreadLocal = new ThreadLocal<JCasState>()
    {
        @Override
        protected JCasState initialValue()
        {
            try {
                JCasState state = new JCasState();
                // Try to get a bit more initial heap to improve performance.
                // See: https://issues.apache.org/jira/browse/UIMA-2385
                Properties props = new Properties();
                props.setProperty(UIMAFramework.CAS_INITIAL_HEAP_SIZE,
                        String.valueOf(CASImpl.DEFAULT_INITIAL_HEAP_SIZE * 4));
                state.jcas = CasCreationUtils.createCas(createTypeSystemDescription(), null,
                        null, props).getJCas();
                return state;
            }
            catch (UIMAException e) {
                throw new IllegalStateException(e);
            }
        }
    };

    /**
     * Holder for a reusable CAS. {@code collectionId} and {@code documentId} act as the cache
     * key; both are {@code null} while the CAS does not hold a successfully loaded document.
     */
    private static class JCasState
    {
        private String collectionId;
        private String documentId;
        private JCas jcas;
    }

    private static final Log log = LogFactory.getLog(XmiContextProvider.class);

    /** Name of the sub-directory of a collection folder that contains the XMI files. */
    private static final String XMI = "xmi";

    /** Whether {@code /POS} suffixes are appended to the tokens of the unit. */
    private boolean outputPos = true;

    private CorpusService corpusService;

    @Override
    public void setCorpusService(CorpusService aCorpusService)
    {
        corpusService = aCorpusService;
    }

    @Override
    public void setOutputPos(boolean showPos)
    {
        outputPos = showPos;
    }

    /**
     * Extracts the item's unit together with up to {@code aLeftSize} characters before and up
     * to {@code aRightSize} characters after it from the document text. If POS output is
     * enabled, {@code "/" + posValue} is inserted after every token inside the unit that has a
     * POS value, and the unit end and match offsets are shifted accordingly.
     *
     * @param aItem
     *            the item whose context is requested; its collection id and document id select
     *            the XMI file to read.
     * @param aLeftSize
     *            number of characters of context to include before the unit.
     * @param aRightSize
     *            number of characters of context to include after the unit.
     * @return the context window, with unit and (if set on the item) match offsets relative to
     *         the start of the window.
     * @throws IOException
     *             if the XMI file cannot be read or parsed.
     */
    @Override
    public ItemContext getContext(EvaluationItem aItem, int aLeftSize, int aRightSize)
        throws IOException
    {
        Timer timer = new Timer();

        String collectionId = aItem.getCollectionId();
        String docId = aItem.getDocumentId();

        // Locale.ROOT keeps the directory name independent of the JVM's default locale
        // (e.g. a Turkish default locale would otherwise map "i" to a dotted capital I).
        File base = new File(new File(corpusService.getRepositoryPath(),
                collectionId.toUpperCase(Locale.ROOT)), XMI);

        JCasState state = jcasThreadLocal.get();

        // FIXME sometimes cas is not being reused (because of state.documentId==null - is this
        // only available for a limited time?)
        // NOTE(review): the state is thread-local, so a request handled by a thread that has
        // not loaded anything yet always starts with documentId == null - presumably this is
        // what the FIXME observes; confirm against the web container's threading.
        boolean alreadyLoaded = (state.documentId != null) && (state.collectionId != null)
                && StringUtils.equals(state.documentId, docId)
                && StringUtils.equals(state.collectionId, collectionId);

        if (alreadyLoaded) {
            log.debug("Reusing CAS");
        }
        else {
            timer.start();
            loadDocument(state, new File(base, docId + ".xmi.gz"), collectionId, docId);
            timer.stop();
            log.debug("Reading the XMI took " + timer.getTime() + "ms");
        }

        timer.reset();
        timer.start();

        // text offset based
        String text = state.jcas.getDocumentText();
        int itemBegin = (int) aItem.getBeginOffset();
        int itemEnd = (int) aItem.getEndOffset();

        // Absolute offsets
        int windowBegin = Math.max(0, itemBegin - aLeftSize);
        int windowEnd = Math.min(text.length(), itemEnd + aRightSize);

        // Relative offsets
        int unitBegin = itemBegin - windowBegin;
        int unitEnd = itemEnd - windowBegin;

        StringBuilder windowText = new StringBuilder(text.substring(windowBegin, windowEnd));

        List<Token> tokens = JCasUtil.selectCovered(state.jcas, Token.class, itemBegin, itemEnd);

        int unitEndDisplacement = 0;
        int matchEndDisplacement = 0;
        int matchBeginDisplacement = 0;

        // Default to the original-text match offsets; they are only used if a match is set.
        boolean anyMatchSet = false;
        int matchBeginOffset = aItem.getOriginalTextMatchBegin();
        int matchEndOffset = aItem.getOriginalTextMatchEnd();
        if (aItem.isOriginalMatchSet()) {
            anyMatchSet = true;
        }
        else if (aItem.isTokenMatchSet()) {
            // Token match positions are indexes into the tokens covered by the unit.
            matchBeginOffset = tokens.get(aItem.getTokenMatchBegin()).getBegin();
            matchEndOffset = tokens.get(aItem.getTokenMatchEnd()).getEnd();
            anyMatchSet = true;
        }

        // Insert back to front so that earlier insert positions stay valid while the text grows.
        Collections.reverse(tokens);

        // default: output pos with tokens
        if (outputPos) {
            for (Token t : tokens) {
                if (t.getPos() == null || t.getPos().getPosValue() == null) {
                    continue;
                }

                String postfix = "/" + t.getPos().getPosValue();
                windowText.insert(t.getEnd() - windowBegin, postfix);
                unitEndDisplacement += postfix.length();

                if (anyMatchSet) {
                    // Suffixes of tokens inside the match extend the match end ...
                    if ((t.getEnd() <= matchEndOffset) && (t.getBegin() >= matchBeginOffset)) {
                        matchEndDisplacement += postfix.length();
                    }
                    // ... suffixes of tokens before the match shift the whole match.
                    if (t.getEnd() <= matchBeginOffset) {
                        matchBeginDisplacement += postfix.length();
                    }
                }
            }
        }

        ItemContext ctx = new ItemContext(windowText.toString(), windowBegin, windowEnd,
                unitBegin, unitEnd + unitEndDisplacement);

        if (anyMatchSet) {
            int matchBegin = matchBeginOffset - windowBegin + matchBeginDisplacement;
            int matchEnd = matchEndOffset - windowBegin + matchBeginDisplacement
                    + matchEndDisplacement;
            ctx.setMatch(matchBegin, matchEnd);
        }

        ctx.setTextLength(text.length());

        timer.stop();
        log.debug("Extracting the context took " + timer.getTime() + "ms");

        return ctx;
    }

    /**
     * Deserializes the given gzip-compressed XMI file into the CAS of {@code aState} and
     * records which document the CAS now holds.
     * <p>
     * The cache key is cleared before loading and only set again after a successful load, so a
     * failed load can never leave the state claiming that the CAS still contains the previously
     * loaded document.
     *
     * @throws IOException
     *             if the file cannot be opened or read, or if deserialization fails.
     */
    private static void loadDocument(JCasState aState, File aXmiFile, String aCollectionId,
            String aDocumentId)
        throws IOException
    {
        aState.documentId = null;
        aState.collectionId = null;

        InputStream is = null;
        try {
            // Open the file stream first so that it is closed in the finally block even if
            // the GZIPInputStream constructor fails while reading the gzip header.
            is = new FileInputStream(aXmiFile);
            is = new GZIPInputStream(is);

            // No need to reset the CAS - the XmiCasDeserializer does that
            XmiCasDeserializer.deserialize(is, aState.jcas.getCas());

            aState.documentId = aDocumentId;
            aState.collectionId = aCollectionId;
        }
        catch (IllegalStateException e) {
            throw new IOException(e);
        }
        catch (SAXException e) {
            throw new IOException(e);
        }
        finally {
            closeQuietly(is);
        }
    }
}