package org.opensextant.service.processing; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Map.Entry; import java.util.Properties; import java.util.Set; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import org.opensextant.placedata.PlaceCandidate; import org.opensextant.service.OpenSextantExtractorResource; import org.opensextant.tagger.Document; import org.opensextant.tagger.Match; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import gate.Annotation; import gate.AnnotationSet; import gate.CorpusController; import gate.Factory; import gate.FeatureMap; import gate.Gate; import gate.Utils; import gate.creole.ResourceInstantiationException; import gate.persist.PersistenceException; import gate.util.GateException; import gate.util.persistence.PersistenceManager; public class DocumentProcessorPool { private Map<String, BlockingQueue<DocumentProcessor>> poolMap = new HashMap<String, BlockingQueue<DocumentProcessor>>(); private long docsProcessedCount; private long docsFailedCount; /** Log object. */ private static final Logger LOGGER = LoggerFactory.getLogger(DocumentProcessorPool.class); public DocumentProcessorPool(Properties prop) { String gateHomeString = prop.getProperty("os.service.gate.home"); File gateHome = new File(gateHomeString); String gappHomeString = prop.getProperty("os.service.gapp.home"); File gappHome = new File(gappHomeString); String[] apps = prop.getProperty("os.service.appnames").split(","); Gate.setGateHome(gateHome); Gate.setUserConfigFile(new File(gateHome, "user-gate.xml")); try { Gate.init(); } catch (GateException e) { LOGGER.error("Couldn't init GATE", e); } for (String app : apps) { String gapp = prop.getProperty("os.service.app." + app + ".gapp"); int poolSize = Integer.parseInt(prop.getProperty("os.service.app." + app + ".poolsize")); File gappFile = new File(gappHome, gapp); addProcess(app, gappFile, poolSize); } } private void addProcess(String processName, File gappFile, int poolSize) { CorpusController template = null; try { template = (CorpusController) PersistenceManager.loadObjectFromFile(gappFile); } catch (PersistenceException e) { LOGGER.error("Couldn't load GAPP file" + gappFile.getName(), e); } catch (ResourceInstantiationException e) { LOGGER.error("Couldn't load GAPP file" + gappFile.getName(), e); } catch (IOException e) { LOGGER.error("Couldn't load GAPP file" + gappFile.getName(), e); } if (poolSize > 0) { ArrayBlockingQueue<DocumentProcessor> pool = new ArrayBlockingQueue<DocumentProcessor>(poolSize); DocumentProcessor dp = new DocumentProcessor(template); pool.add(dp); for (int i = 0; i < poolSize - 1; i++) { try { CorpusController tmp = (CorpusController) Factory.duplicate(template); DocumentProcessor dpTmp = new DocumentProcessor(tmp); pool.add(dpTmp); } catch (ResourceInstantiationException e) { LOGGER.error("Couldn't create controller for " + gappFile.getName(), e); } } poolMap.put(processName, pool); } } private gate.Document process(String name, gate.Document doc) { DocumentProcessor processor = null; try { processor = poolMap.get(name).take(); } catch (InterruptedException e) { LOGGER.error("Couldn't get a processor from the pool", e); } if (processor == null) { LOGGER.error("Couldn't get a processor from the pool"); return doc; } try { processor.process(doc); } catch (Exception e) { docsFailedCount++; LOGGER.error("Document failed to process" + doc.getName(), e); } finally { docsProcessedCount++; poolMap.get(name).add(processor); } return doc; } public void cleanup() { for (String name : poolMap.keySet()) { for (DocumentProcessor dp : poolMap.get(name)) { dp.cleanup(); } } } public Set<String> getProcessNames() { return poolMap.keySet(); } public Set<String> getResultFormats() { return OpenSextantExtractorResource.getFormats(); } public int available(String name) { return poolMap.get(name).size(); } public Map<String, Integer> available() { Map<String, Integer> avail = new HashMap<String, Integer>(); for (String name : poolMap.keySet()) { avail.put(name, available(name)); } return avail; } @Override public String toString() { StringBuilder buff = new StringBuilder(); buff.append("Extractor\tNumber in pool\n"); for (String name : poolMap.keySet()) { buff.append(name).append("\t"); buff.append(poolMap.get(name).size()); buff.append("\n"); } return buff.toString(); } public Document process(String extractType, String content) { gate.Document gateDoc = null; try { gateDoc = Factory.newDocument(content); } catch (ResourceInstantiationException e) { LOGGER.error("Couldn't create new document from given string", e); } return gateDocToDocument(process(extractType, gateDoc)); } public Document process(String extractType, File content) { gate.Document gateDoc = null; try { gateDoc = Factory.newDocument(content.toURI().toURL()); } catch (ResourceInstantiationException e) { LOGGER.error("Couldn't create new document from " + content.getName(), e); } catch (MalformedURLException e) { LOGGER.error("Couldn't create new document from " + content.getName(), e); } return gateDocToDocument(process(extractType, gateDoc)); } public Document process(String extractType, URL content) { gate.Document gateDoc = null; try { gateDoc = Factory.newDocument(content); } catch (ResourceInstantiationException e) { LOGGER.error("Couldnt create content from given URL", e); return null; } return gateDocToDocument(process(extractType, gateDoc)); } private Document gateDocToDocument(gate.Document doc) { Set<String> featureNameSet = new HashSet<String>(); featureNameSet.add("isEntity"); AnnotationSet entitySet = doc.getAnnotations().get(null, featureNameSet); Document db = new Document(); db.setContent(doc.getContent().toString()); for (Annotation a : entitySet) { String type = a.getType(); if ("ENTITY".equalsIgnoreCase(type)) { continue; } Match tmpAnno = new Match(); tmpAnno.setStart(a.getStartNode().getOffset()); tmpAnno.setEnd(a.getEndNode().getOffset()); tmpAnno.setType(type); tmpAnno.setMatchText(Utils.cleanStringFor(doc, a)); FeatureMap fm = a.getFeatures(); /* special handling for PLACEs */ if ("PLACE".equalsIgnoreCase(type)) { tmpAnno.getFeatures().put("place", fm.get("bestPlace")); tmpAnno.getFeatures().put("hierarchy", fm.get("hierarchy")); PlaceCandidate pc = (PlaceCandidate) fm.get("placeCandidate"); tmpAnno.getFeatures().put("candidates", pc.getPlaces()); } else { for (Entry<Object, Object> e : fm.entrySet()) { String k = (String) e.getKey(); Object v = e.getValue(); tmpAnno.getFeatures().put(k, v); } } db.addAnno(tmpAnno); } // cleanup resources Factory.deleteResource(doc); return db; } /** * @return the docsProcessedCount */ public long getDocsProcessedCount() { return docsProcessedCount; } /** * @return the docsFailedCount */ public long getDocsFailedCount() { return docsFailedCount; } }