/*******************************************************************************
* Copyright 2013
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.csniper.webapp.search.xmi;
import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.cas.impl.Serialization.deserializeCASComplete;
import static org.apache.uima.fit.factory.TypeSystemDescriptionFactory.createTypeSystemDescription;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.Collections;
import java.util.List;
import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.uima.UIMAException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.impl.CASCompleteSerializer;
import org.apache.uima.cas.impl.CASImpl;
import org.apache.uima.internal.util.Timer;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.fit.util.JCasUtil;
import de.tudarmstadt.ukp.csniper.webapp.evaluation.model.EvaluationItem;
import de.tudarmstadt.ukp.csniper.webapp.evaluation.model.ItemContext;
import de.tudarmstadt.ukp.csniper.webapp.search.ContextProvider;
import de.tudarmstadt.ukp.csniper.webapp.search.CorpusService;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
/**
 * {@link ContextProvider} that reads documents from XZ-compressed, Java-serialized CAS files
 * ({@code <repository>/<COLLECTION>/bin/<documentId>.ser.xz}) and cuts a text window around an
 * {@link EvaluationItem} out of the document text.
 * <p>
 * Each thread keeps one CAS in a {@link ThreadLocal} together with the ids of the document that
 * was last loaded into it, so consecutive requests for the same document on the same thread do
 * not have to read the file again.
 */
public class SerializedCasContextProvider
    implements ContextProvider
{
    /** Per-thread CAS plus the identity of the document currently loaded into it. */
    private static final ThreadLocal<JCasState> jcasThreadLocal = new ThreadLocal<JCasState>()
    {
        @Override
        protected JCasState initialValue()
        {
            try {
                JCasState state = new JCasState();
                state.cas = CasCreationUtils.createCas(createTypeSystemDescription(), null, null,
                        null);
                return state;
            }
            catch (UIMAException e) {
                throw new IllegalStateException(e);
            }
        }
    };

    /**
     * Holder for the cached CAS. {@code collectionId} and {@code documentId} are {@code null}
     * whenever the CAS content cannot be attributed to a document.
     */
    private static class JCasState
    {
        private String collectionId;
        private String documentId;
        private CAS cas;
    }

    private static final Log log = LogFactory.getLog(SerializedCasContextProvider.class);

    /** Name of the sub-directory of a collection that holds the serialized CAS files. */
    private static final String BIN = "bin";

    /** Extension of the serialized CAS files. */
    private static final String SERIALIZED_CAS_SUFFIX = ".ser.xz";

    private boolean outputPos = true;
    private CorpusService corpusService;

    @Override
    public void setCorpusService(CorpusService aCorpusService)
    {
        corpusService = aCorpusService;
    }

    @Override
    public void setOutputPos(boolean showPos)
    {
        outputPos = showPos;
    }

    /**
     * Builds the display context for an item: the unit text plus up to {@code aLeftSize}
     * characters before and {@code aRightSize} characters after it, clipped to the document
     * boundaries. If POS output is enabled, {@code "/" + posValue} is appended to every token of
     * the unit that carries a POS value, and the unit end and match offsets are shifted
     * accordingly.
     *
     * @param aItem
     *            the item to fetch the context for; its collection and document ids select the
     *            file, its begin/end offsets select the unit.
     * @param aLeftSize
     *            number of characters of context to include before the unit.
     * @param aRightSize
     *            number of characters of context to include after the unit.
     * @return the context; all offsets passed to it are relative to the start of the window
     *         except the window begin/end themselves, which are document offsets.
     * @throws IOException
     *             if the serialized CAS cannot be read or deserialized, or if the JCas cannot be
     *             obtained.
     */
    @Override
    public ItemContext getContext(EvaluationItem aItem, int aLeftSize, int aRightSize)
        throws IOException
    {
        Timer timer = new Timer();

        // NOTE(review): toUpperCase() uses the default locale; confirm collection directory
        // names are safe under locales with special casing rules (e.g. Turkish dotted I).
        File base = new File(new File(corpusService.getRepositoryPath(), aItem.getCollectionId()
                .toUpperCase()), BIN);
        String docId = aItem.getDocumentId();

        JCasState state = jcasThreadLocal.get();
        if ((state.documentId == null) || (state.collectionId == null)
                || !StringUtils.equals(state.documentId, docId)
                || !StringUtils.equals(state.collectionId, aItem.getCollectionId())) {
            timer.start();
            loadCas(state, new File(base, docId + SERIALIZED_CAS_SUFFIX), aItem);
            timer.stop();
            log.debug("Reading the CAS took " + timer.getTime() + "ms");
        }
        else {
            log.debug("Reusing CAS");
        }

        timer.reset();
        timer.start();

        // text offset based
        String text = state.cas.getDocumentText();

        // Absolute offsets
        int windowBegin = Math.max(0, (int) aItem.getBeginOffset() - aLeftSize);
        int windowEnd = Math.min(text.length(), (int) aItem.getEndOffset() + aRightSize);

        // Relative offsets
        int unitBegin = (int) aItem.getBeginOffset() - windowBegin;
        int unitEnd = (int) aItem.getEndOffset() - windowBegin;

        StringBuilder windowText = new StringBuilder(text.substring(windowBegin, windowEnd));

        List<Token> tokens;
        try {
            tokens = JCasUtil.selectCovered(state.cas.getJCas(), Token.class,
                    (int) aItem.getBeginOffset(), (int) aItem.getEndOffset());
        }
        catch (CASException e) {
            throw new IOException(e);
        }

        int unitEndDisplacement = 0;
        int matchEndDisplacement = 0;
        int matchBeginDisplacement = 0;

        // The original-text offsets are the default; they are only used if a match is set.
        boolean anyMatchSet = false;
        int matchBeginOffset = aItem.getOriginalTextMatchBegin();
        int matchEndOffset = aItem.getOriginalTextMatchEnd();

        if (aItem.isOriginalMatchSet()) {
            anyMatchSet = true;
        }
        else if (aItem.isTokenMatchSet()) {
            // compute actual offsets if token based offsets are set
            matchBeginOffset = tokens.get(aItem.getTokenMatchBegin()).getBegin();
            matchEndOffset = tokens.get(aItem.getTokenMatchEnd()).getEnd();
            anyMatchSet = true;
        }

        // Insert from the last token to the first so that an insertion never shifts the
        // position of a token that still has to be processed.
        Collections.reverse(tokens);

        if (outputPos) {
            for (Token t : tokens) {
                if (t.getPos() != null && t.getPos().getPosValue() != null) {
                    String postfix = "/" + t.getPos().getPosValue();
                    windowText.insert(t.getEnd() - windowBegin, postfix);
                    unitEndDisplacement += postfix.length();
                    if (anyMatchSet) {
                        // Token lies completely inside the match: the match grows.
                        if ((t.getEnd() <= matchEndOffset) && (t.getBegin() >= matchBeginOffset)) {
                            matchEndDisplacement += postfix.length();
                        }
                        // Token ends before the match starts: the whole match moves right.
                        if (t.getEnd() <= matchBeginOffset) {
                            matchBeginDisplacement += postfix.length();
                        }
                        // NOTE(review): a token that starts before the match and ends inside
                        // it contributes to neither displacement - confirm this is intended.
                    }
                }
            }
        }

        ItemContext ctx = new ItemContext(windowText.toString(), windowBegin, windowEnd, unitBegin,
                unitEnd + unitEndDisplacement);

        if (anyMatchSet) {
            ctx.setMatch(matchBeginOffset - windowBegin + matchBeginDisplacement, matchEndOffset
                    - windowBegin + matchBeginDisplacement + matchEndDisplacement);
        }

        ctx.setTextLength(text.length());

        timer.stop();
        log.debug("Extracting the context took " + timer.getTime() + "ms");

        return ctx;
    }

    /**
     * Replaces the content of the cached CAS with the document stored in {@code aCasFile} and
     * records the document identity in {@code aState}.
     * <p>
     * The cached ids are cleared right before the CAS is modified, so that a failure during
     * deserialization cannot leave the cache claiming the CAS still holds the previous document.
     *
     * @param aState
     *            the per-thread cache entry to update.
     * @param aCasFile
     *            the XZ-compressed serialized CAS to read.
     * @param aItem
     *            the item whose document and collection id are recorded on success.
     * @throws IOException
     *             if the file cannot be read or does not contain a deserializable CAS.
     */
    private static void loadCas(JCasState aState, File aCasFile, EvaluationItem aItem)
        throws IOException
    {
        // Kept in its own variable so it is closed even if one of the wrapping stream
        // constructors throws before "is" has been assigned.
        FileInputStream fis = null;
        ObjectInputStream is = null;
        try {
            fis = new FileInputStream(aCasFile);
            is = new ObjectInputStream(new XZCompressorInputStream(fis));
            CASCompleteSerializer serializer = (CASCompleteSerializer) is.readObject();

            // From here on the CAS content is being replaced: forget what it used to hold.
            aState.documentId = null;
            aState.collectionId = null;

            // No need to reset the CAS
            deserializeCASComplete(serializer, (CASImpl) aState.cas);

            aState.documentId = aItem.getDocumentId();
            aState.collectionId = aItem.getCollectionId();
        }
        catch (IllegalStateException e) {
            throw new IOException(e);
        }
        catch (ClassNotFoundException e) {
            throw new IOException(e);
        }
        finally {
            closeQuietly(is);
            // Closing the underlying stream a second time is harmless.
            closeQuietly(fis);
        }
    }
}