package eu.project.ttc.readers;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.types.SourceDocumentInformation;
import eu.project.ttc.utils.IstexUtils;
/**
*
* Reads collections from a ISTEX
*
* @author Damien Cram
*
*/
public class IstexCollectionReader extends CollectionReader_ImplBase {
private static final Logger logger = LoggerFactory.getLogger(IstexCollectionReader.class);
public static final String PARAM_LANGUAGE = "CorpusLanguage";
@ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true)
private Lang lang;
public static final String PARAM_API_URL = "ApiURL";
@ConfigurationParameter(name = PARAM_API_URL, mandatory = true)
private String urlStr;
private URI apiURI;
public static final String PARAM_IGNORE_LANGUAGE_ERRORS = "ignoreLanguageErrors";
@ConfigurationParameter(name = PARAM_IGNORE_LANGUAGE_ERRORS, mandatory = false, defaultValue = "false")
private boolean ignoreLanguageErrors;
public static final String PARAM_ID_LIST = "idList";
@ConfigurationParameter(name = PARAM_API_URL, mandatory = true)
private String idListStr;
private List<String> idList;
private List<URL> urlList;
private int currentIndex = 0;
private int cumulatedSize = 0;
private long lastTop;
private CompletionService<String> pool;
private ExecutorService threadPool;
@Override
public void initialize() throws ResourceInitializationException {
logger.debug("Initializing Istex collection reader");
this.lang = Lang.forName((String) getConfigParameterValue(PARAM_LANGUAGE));
this.urlStr = (String) getConfigParameterValue(PARAM_API_URL);
this.ignoreLanguageErrors = (Boolean) getConfigParameterValue(PARAM_IGNORE_LANGUAGE_ERRORS);
this.idListStr = (String) getConfigParameterValue(PARAM_ID_LIST);
this.idList = Splitter.on(",").splitToList(this.idListStr);
try {
this.apiURI = new URI(urlStr);
} catch (URISyntaxException e) {
logger.error("Could not parse Istex API's url: {}", this.urlStr);
throw new ResourceInitializationException(e);
}
threadPool = Executors.newFixedThreadPool(4);
pool = new ExecutorCompletionService<String>(threadPool);
urlList = Lists.newArrayList();
for (String documentId:idList) {
String documentPath = String.format("/document/%s/", documentId);
URL documentURL;
try {
documentURL = new URL(apiURI.toURL(), documentPath);
urlList.add(documentURL);
} catch (MalformedURLException e) {
throw new ResourceInitializationException(e);
}
}
new Thread() {
@Override
public void run() {
for(URL documentURL:urlList) {
if(logger.isTraceEnabled())
logger.trace("Submitting retrieve task for {}", documentURL);
pool.submit(new GetDocumentTask(documentURL));
}
}
}.start();
logger.debug("End of Istex collection reader initialization");
}
private final class GetDocumentTask implements Callable<String> {
private URL documentURL;
public GetDocumentTask(URL documentURL) {
super();
this.documentURL = documentURL;
}
public String call() {
InputStream openStream;
try {
if(logger.isTraceEnabled())
logger.trace("Executing retrieve task for {}", documentURL);
openStream = documentURL.openStream();
Scanner scanner = new Scanner(openStream);
Scanner s = scanner.useDelimiter("\\A");
String string = s.hasNext() ? s.next() : "";
scanner.close();
return string;
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
@Override
public void getNext(CAS cas) throws IOException, CollectionException {
if (currentIndex == 0) {
logger.debug("Processing the first document of istex collection");
lastTop = System.currentTimeMillis();
}
URL documentUrl = urlList.get(currentIndex++);
long top = System.currentTimeMillis();
if (top - lastTop > 5000l) {
lastTop = top;
logger.info("{}% - Processing Istex document {} on {}",
String.format("%3d", (currentIndex * 100) / idList.size()), currentIndex, idList.size());
}
SourceDocumentInformation sdi;
try {
sdi = new SourceDocumentInformation(cas.getJCas());
sdi.setUri(documentUrl.toString());
String text = toDocumentText(documentUrl);
cas.setDocumentText(text);
cas.setDocumentLanguage(lang.getCode());
sdi.setDocumentSize(text.length());
this.cumulatedSize += text.length();
sdi.setCumulatedDocumentSize(this.cumulatedSize);
sdi.setCorpusSize(-1);
sdi.setBegin(0);
sdi.setEnd(text.length());
sdi.setOffsetInSource(0);
sdi.setDocumentIndex(currentIndex);
sdi.setNbDocuments(this.idList.size());
boolean lastSegment = currentIndex == idList.size() - 1;
sdi.setLastSegment(lastSegment);
sdi.addToIndexes();
if(lastSegment)
threadPool.shutdown();
} catch (CASException e) {
throw new CollectionException(e);
}
}
@SuppressWarnings("unchecked")
private String toDocumentText(URL documentURL) throws IOException {
String json;
try {
json = pool.take().get();
ObjectMapper mapper = new ObjectMapper(); // can reuse, share globally
Map<String, Object> map = mapper.readValue(json, Map.class);
IstexDocument document = new IstexDocument();
document.setLanguage(((List<String>) map.get("language")).get(0));
document.setTitle((String) map.get("title"));
document.setAbstract((String) map.get("abstract"));
Lang docLang = IstexUtils.toTermSuiteLang(document.getLanguage());
if (docLang != this.lang) {
String msg = String.format("Bad language for document %s. Expected: %s, actual: %s", documentURL.toString(),
this.lang.getCode(), docLang.getCode());
if (ignoreLanguageErrors) {
logger.warn(msg);
return "";
} else
throw new IllegalArgumentException(msg);
}
return String.format("%s . %s", document.getTitle(), document.getAbstract());
} catch (Exception e) {
logger.error("An error occurrence with the Istex client pool");
throw new IllegalStateException(e);
}
}
@Override
public boolean hasNext() throws IOException, CollectionException {
return currentIndex < this.idList.size();
}
@Override
public Progress[] getProgress() {
// TODO Auto-generated method stub
return null;
}
@Override
public void close() throws IOException {
}
}