/*
* Copyright 2012 Fundació Barcelona Media
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.barcelonamedia.uima.reader.solr;
import java.io.IOException;
import java.util.List;
import java.util.logging.Logger;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
//import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.uimafit.component.CasCollectionReader_ImplBase;
import org.uimafit.descriptor.ConfigurationParameter;
public class SolrCollectionReader extends CasCollectionReader_ImplBase{
/** The logger object. */
private static final Logger logger = Logger.getLogger(SolrCollectionReader.class.toString());
@ConfigurationParameter(name="SolrUrl", mandatory=true, defaultValue="http://localhost:8983/solr/",
description="URL of Solr service")
private String server;
@ConfigurationParameter(name="IdField", mandatory=true, defaultValue="id",
description="Solr field that contains the document ID")
private String idField;
@ConfigurationParameter(name="TextField", mandatory=true, defaultValue="text",
description="Solr field that contains the document text")
private String textField;
@ConfigurationParameter(name="Language", mandatory=false,
description="the language of the document")
private String language;
@ConfigurationParameter(name="SolrQuery", mandatory=true, defaultValue="*:*",
description="the query to select documents")
private String queryString;
// current document
private int documentCounter;
// list of document ids to retrieve
// TODO: the whole list of ids is kept in memory, move to fully streaming solution
private List<String> documentList;
// total number of documents, should be equivalent to documentList.size()
private long documentNumber;
private SolrDao solrDao;
/**
* Initialize the component. Retrieve the parameters and process them,
* parsing the field descriptions and preparing the structures needed to
* process the documents.
*
* @param aContext
* The UIMA context.
*
* @throws ResourceInitializationException
* If an error occurs with some resource.
*
*/
public void initialize(UimaContext context) throws ResourceInitializationException {
System.out.println("SolrCollectionReader: initialize()...");
logger.info("initialize()...");
try {
this.solrDao = new SolrDao(server,idField,textField);
} catch (IOException e) {
throw new ResourceInitializationException(e);
}
this.documentCounter = 0;
try {
this.documentNumber = this.solrDao.getDocNum(queryString);
this.documentList = this.solrDao.getDocIds(queryString);
} catch (SolrServerException e) {
throw new ResourceInitializationException(e);
}
logger.info("initialize() - Done.");
}
public void getNext(CAS aCAS) throws IOException, CollectionException {
JCas jcas;
try{
jcas = aCAS.getJCas();
}
catch(CASException e){
throw new CollectionException(e);
}
String documentId = this.documentList.get(this.documentCounter);
String documentText;
try {
documentText = this.solrDao.getDocText(documentId);
} catch (SolrServerException e) {
throw new CollectionException(e);
}
jcas.setDocumentText(documentText);
// set language if it was explicitly specified as a configuration parameter
if(this.language != null){
jcas.setDocumentLanguage(this.language);
}
SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
srcDocInfo.setUri("file:///" + documentId); // needs to be a valid URI with a known protocol
srcDocInfo.setOffsetInSource(0);
srcDocInfo.setDocumentSize((int) documentText.length());
this.documentCounter += 1;
srcDocInfo.setLastSegment(true);
srcDocInfo.addToIndexes();
}
public boolean hasNext() throws IOException, CollectionException {
return (this.documentCounter<this.documentNumber)? true: false;
}
public Progress[] getProgress(){
return new Progress[] { new ProgressImpl(this.documentCounter, (int) this.documentNumber, Progress.ENTITIES) };
}
public void close() throws IOException {
}
}