/* * Copyright (c) 2011, SOCIETIES Consortium (WATERFORD INSTITUTE OF TECHNOLOGY (TSSG), HERIOT-WATT UNIVERSITY (HWU), SOLUTA.NET * (SN), GERMAN AEROSPACE CENTRE (Deutsches Zentrum fuer Luft- und Raumfahrt e.V.) (DLR), Zavod za varnostne tehnologije * informacijske držbe in elektronsko poslovanje (SETCCE), INSTITUTE OF COMMUNICATION AND COMPUTER SYSTEMS (ICCS), LAKE * COMMUNICATIONS (LAKE), INTEL PERFORMANCE LEARNING SOLUTIONS LTD (INTEL), PORTUGAL TELECOM INOAÇÃO, SA (PTIN), IBM Corp., * INSTITUT TELECOM (ITSUD), AMITEC DIACHYTI EFYIA PLIROFORIKI KAI EPIKINONIES ETERIA PERIORISMENIS EFTHINIS (AMITEC), TELECOM * ITALIA S.p.a.(TI), TRIALOG (TRIALOG), Stiftelsen SINTEF (SINTEF), NEC EUROPE LTD (NEC)) * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following * conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package org.societies.orchestration.cpa.test; /** * Created with IntelliJ IDEA. * User: Bjørn Magnus Mathisen * Date: 10.10.12 * Time: 19:11 */ import gate.*; import gate.corpora.RepositioningInfo; import gate.creole.ANNIEConstants; import gate.creole.SerialAnalyserController; import gate.util.GateException; import gate.util.Out; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.net.URL; import java.util.HashSet; import java.util.Iterator; import java.util.Set; import java.util.Vector; /** * This class illustrates how to use ANNIE as a sausage machine * in another application - put ingredients in one end (URLs pointing * to documents) and get sausages (e.g. Named Entities) out the * other end. * <P><B>NOTE:</B><BR> * For simplicity's sake, we don't do any exception handling. */ public class StandAloneAnnie { /** The Corpus Pipeline application to contain ANNIE */ private SerialAnalyserController annieController; /** * Initialise the ANNIE system. This creates a "corpus pipeline" * application that can be used to run sets of documents through * the extraction system. */ public void initAnnie() throws GateException { Out.prln("Initialising ANNIE..."); // create a serial analyser controller to run ANNIE with annieController = (SerialAnalyserController) Factory.createResource( "gate.creole.SerialAnalyserController", Factory.newFeatureMap(), Factory.newFeatureMap(), "ANNIE_" + Gate.genSym() ); // load each PR as defined in ANNIEConstants for(int i = 0; i < ANNIEConstants.PR_NAMES.length; i++) { FeatureMap params = Factory.newFeatureMap(); // use default parameters ProcessingResource pr = (ProcessingResource) Factory.createResource(ANNIEConstants.PR_NAMES[i], params); // add the PR to the pipeline controller annieController.add(pr); } // for each ANNIE PR Out.prln("...ANNIE loaded"); } // initAnnie() /** Tell ANNIE's controller about the corpus you want to run on */ public void setCorpus(Corpus corpus) { annieController.setCorpus(corpus); } // setCorpus /** Run ANNIE */ public void execute() throws GateException { Out.prln("Running ANNIE..."); annieController.execute(); Out.prln("...ANNIE complete"); } // execute() /** * Run from the command-line, with a list of URLs as argument. * <P><B>NOTE:</B><BR> * This code will run with all the documents in memory - if you * want to unload each from memory after use, add code to store * the corpus in a DataStore. */ public static void main(String args[]) throws GateException, IOException { // initialise the GATE library Out.prln("Initialising GATE..."); //Gate.setSiteConfigFile(new File("./src/test/resources/gate.xml")); Gate.setPluginsHome(new File("./src/test/")); Gate.init(); // Load ANNIE plugin File gateHome = Gate.getGateHome(); File pluginsHome = new File("./src/test/resources/", "plugins"); Gate.getCreoleRegister().registerDirectories(new File(pluginsHome, "/ANNIE").toURL()); Out.prln("...GATE initialised"); // initialise ANNIE (this may take several minutes) StandAloneAnnie annie = new StandAloneAnnie(); annie.initAnnie(); // create a GATE corpus and add a document for each command-line // argument Corpus corpus = (Corpus) Factory.createResource("gate.corpora.CorpusImpl"); for(int i = 0; i < args.length; i++) { URL u = new URL(args[i]); FeatureMap params = Factory.newFeatureMap(); params.put("sourceUrl", u); params.put("preserveOriginalContent", new Boolean(true)); params.put("collectRepositioningInfo", new Boolean(true)); Out.prln("Creating doc for " + u); Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params); corpus.add(doc); } // for each of args // tell the pipeline about the corpus and run it annie.setCorpus(corpus); annie.execute(); // for each document, get an XML document with the // person and location names added Iterator iter = corpus.iterator(); int count = 0; String startTagPart_1 = "<span GateID=\""; String startTagPart_2 = "\" title=\""; String startTagPart_3 = "\" style=\"background:Red;\">"; String endTag = "</span>"; while(iter.hasNext()) { Document doc = (Document) iter.next(); AnnotationSet defaultAnnotSet = doc.getAnnotations(); Set annotTypesRequired = new HashSet(); annotTypesRequired.add("Person"); annotTypesRequired.add("Location"); Set<Annotation> peopleAndPlaces = new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired)); FeatureMap features = doc.getFeatures(); String originalContent = (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME); RepositioningInfo info = (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME); ++count; File file = new File("StANNIE_" + count + ".HTML"); Out.prln("File name: '"+file.getAbsolutePath()+"'"); if(originalContent != null && info != null) { Out.prln("OrigContent and reposInfo existing. Generate file..."); Iterator it = peopleAndPlaces.iterator(); Annotation currAnnot; SortedAnnotationList sortedAnnotations = new SortedAnnotationList(); while(it.hasNext()) { currAnnot = (Annotation) it.next(); sortedAnnotations.addSortedExclusive(currAnnot); } // while StringBuffer editableContent = new StringBuffer(originalContent); long insertPositionEnd; long insertPositionStart; // insert anotation tags backward Out.prln("Unsorted annotations count: "+peopleAndPlaces.size()); Out.prln("Sorted annotations count: "+sortedAnnotations.size()); for(int i=sortedAnnotations.size()-1; i>=0; --i) { currAnnot = (Annotation) sortedAnnotations.get(i); insertPositionStart = currAnnot.getStartNode().getOffset().longValue(); insertPositionStart = info.getOriginalPos(insertPositionStart); insertPositionEnd = currAnnot.getEndNode().getOffset().longValue(); insertPositionEnd = info.getOriginalPos(insertPositionEnd, true); if(insertPositionEnd != -1 && insertPositionStart != -1) { editableContent.insert((int)insertPositionEnd, endTag); editableContent.insert((int)insertPositionStart, startTagPart_3); editableContent.insert((int)insertPositionStart, currAnnot.getType()); editableContent.insert((int)insertPositionStart, startTagPart_2); editableContent.insert((int)insertPositionStart, currAnnot.getId().toString()); editableContent.insert((int)insertPositionStart, startTagPart_1); } // if } // for FileWriter writer = new FileWriter(file); writer.write(editableContent.toString()); writer.close(); } // if - should generate else if (originalContent != null) { Out.prln("OrigContent existing. Generate file..."); Iterator it = peopleAndPlaces.iterator(); Annotation currAnnot; SortedAnnotationList sortedAnnotations = new SortedAnnotationList(); while(it.hasNext()) { currAnnot = (Annotation) it.next(); sortedAnnotations.addSortedExclusive(currAnnot); } // while StringBuffer editableContent = new StringBuffer(originalContent); long insertPositionEnd; long insertPositionStart; // insert anotation tags backward Out.prln("Unsorted annotations count: "+peopleAndPlaces.size()); Out.prln("Sorted annotations count: "+sortedAnnotations.size()); for(int i=sortedAnnotations.size()-1; i>=0; --i) { currAnnot = (Annotation) sortedAnnotations.get(i); insertPositionStart = currAnnot.getStartNode().getOffset().longValue(); insertPositionEnd = currAnnot.getEndNode().getOffset().longValue(); if(insertPositionEnd != -1 && insertPositionStart != -1) { editableContent.insert((int)insertPositionEnd, endTag); editableContent.insert((int)insertPositionStart, startTagPart_3); editableContent.insert((int)insertPositionStart, currAnnot.getType()); editableContent.insert((int)insertPositionStart, startTagPart_2); editableContent.insert((int)insertPositionStart, currAnnot.getId().toString()); editableContent.insert((int)insertPositionStart, startTagPart_1); } // if } // for FileWriter writer = new FileWriter(file); writer.write(editableContent.toString()); writer.close(); } else { Out.prln("Content : "+originalContent); Out.prln("Repositioning: "+info); } String xmlDocument = doc.toXml(peopleAndPlaces, false); String fileName = new String("StANNIE_toXML_" + count + ".HTML"); FileWriter writer = new FileWriter(fileName); writer.write(xmlDocument); writer.close(); // do something usefull with the XML here! // Out.prln("'"+xmlDocument+"'"); } // for each doc } // main /** * */ public static class SortedAnnotationList extends Vector { public SortedAnnotationList() { super(); } // SortedAnnotationList public boolean addSortedExclusive(Annotation annot) { Annotation currAnot = null; // overlapping check for (int i=0; i<size(); ++i) { currAnot = (Annotation) get(i); if(annot.overlaps(currAnot)) { return false; } // if } // for long annotStart = annot.getStartNode().getOffset().longValue(); long currStart; // insert for (int i=0; i < size(); ++i) { currAnot = (Annotation) get(i); currStart = currAnot.getStartNode().getOffset().longValue(); if(annotStart < currStart) { insertElementAt(annot, i); /* Out.prln("Insert start: "+annotStart+" at position: "+i+" size="+size()); Out.prln("Current start: "+currStart); */ return true; } // if } // for int size = size(); insertElementAt(annot, size); //Out.prln("Insert start: "+annotStart+" at size position: "+size); return true; } // addSorted } // SortedAnnotationList } // class StandAloneAnnie