package uk.bl.wa.nlp.analysers; ///** // * A nice idea, but seems to rely on many local resources from the local filesystem, which is awkward. // // i.e. the following is not enough: // // <dependency> // <groupId>uk.ac.gate</groupId> // <artifactId>gate-core</artifactId> // <version>7.1</version> // </dependency> // // */ //package uk.bl.wa.analyser.text; /* * #%L * warc-indexer * %% * Copyright (C) 2013 - 2014 The UK Web Archive * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as * published by the Free Software Foundation, either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program. If not, see * <http://www.gnu.org/licenses/gpl-2.0.html>. * #L% */ // // // ///* // * StandAloneAnnie.java // * // * // * Copyright (c) 2000-2001, The University of Sheffield. // * // * This file is part of GATE (see http://gate.ac.uk/), and is free // * software, licenced under the GNU Library General Public License, // * Version 2, June1991. // * // * A copy of this licence is included in the distribution in the file // * licence.html, and is also available at http://gate.ac.uk/gate/licence.html. 
// * // * hamish, 29/1/2002 // * // * $Id: StandAloneAnnie.java,v 1.6 2006/01/09 16:43:22 ian Exp $ // */ // //import gate.Annotation; //import gate.AnnotationSet; //import gate.Corpus; //import gate.CorpusController; //import gate.Document; //import gate.Factory; //import gate.FeatureMap; //import gate.Gate; //import gate.GateConstants; //import gate.corpora.RepositioningInfo; //import gate.util.GateException; //import gate.util.Out; //import gate.util.persistence.PersistenceManager; // //import java.io.File; //import java.io.FileWriter; //import java.io.IOException; //import java.net.URL; //import java.util.HashSet; //import java.util.Iterator; //import java.util.Set; //import java.util.Vector; // ///** // * This class illustrates how to use ANNIE as a sausage machine // * in another application - put ingredients in one end (URLs pointing // * to documents) and get sausages (e.g. Named Entities) out the // * other end. // * <P><B>NOTE:</B><BR> // * For simplicity's sake, we don't do any exception handling. // */ //public class GateTextAnalyser { // // /** The Corpus Pipeline application to contain ANNIE */ // private CorpusController annieController; // // /** // * Initialise the ANNIE system. This creates a "corpus pipeline" // * application that can be used to run sets of documents through // * the extraction system. 
// */ // public void initAnnie() throws GateException, IOException { // Out.prln("Initialising ANNIE..."); // // // load the ANNIE application from the saved state in plugins/ANNIE // File pluginsHome = Gate.getPluginsHome(); // File anniePlugin = new File(pluginsHome, "ANNIE"); // File annieGapp = new File(anniePlugin, "ANNIE_with_defaults.gapp"); // annieController = // (CorpusController) PersistenceManager.loadObjectFromFile(annieGapp); // // Out.prln("...ANNIE loaded"); // } // initAnnie() // // /** Tell ANNIE's controller about the corpus you want to run on */ // public void setCorpus(Corpus corpus) { // annieController.setCorpus(corpus); // } // setCorpus // // /** Run ANNIE */ // public void execute() throws GateException { // Out.prln("Running ANNIE..."); // annieController.execute(); // Out.prln("...ANNIE complete"); // } // execute() // // /** // * Run from the command-line, with a list of URLs as argument. // * <P><B>NOTE:</B><BR> // * This code will run with all the documents in memory - if you // * want to unload each from memory after use, add code to store // * the corpus in a DataStore. 
// */ // public static void main(String args[]) throws GateException, IOException { // // initialise the GATE library // Out.prln("Initialising GATE..."); // Gate.init(); // Out.prln("...GATE initialised"); // // // initialise ANNIE (this may take several minutes) // GateTextAnalyser annie = new GateTextAnalyser(); // annie.initAnnie(); // // // create a GATE corpus and add a document for each command-line // // argument // Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus"); // for(int i = 0; i < args.length; i++) { // URL u = new URL(args[i]); // FeatureMap params = Factory.newFeatureMap(); // params.put("sourceUrl", u); // params.put("preserveOriginalContent", new Boolean(true)); // params.put("collectRepositioningInfo", new Boolean(true)); // Out.prln("Creating doc for " + u); // Document doc = (Document) // Factory.createResource("gate.corpora.DocumentImpl", params); // corpus.add(doc); // } // for each of args // // // tell the pipeline about the corpus and run it // annie.setCorpus(corpus); // annie.execute(); // // // for each document, get an XML document with the // // person and location names added // Iterator iter = corpus.iterator(); // int count = 0; // String startTagPart_1 = "<span GateID=\""; // String startTagPart_2 = "\" title=\""; // String startTagPart_3 = "\" style=\"background:Red;\">"; // String endTag = "</span>"; // // while(iter.hasNext()) { // Document doc = (Document) iter.next(); // AnnotationSet defaultAnnotSet = doc.getAnnotations(); // Set annotTypesRequired = new HashSet(); // annotTypesRequired.add("Person"); // annotTypesRequired.add("Location"); // Set<Annotation> peopleAndPlaces = // new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired)); // // FeatureMap features = doc.getFeatures(); // String originalContent = (String) // features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME); // RepositioningInfo info = (RepositioningInfo) // features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME); 
// // ++count; // File file = new File("StANNIE_" + count + ".HTML"); // Out.prln("File name: '"+file.getAbsolutePath()+"'"); // if(originalContent != null && info != null) { // Out.prln("OrigContent and reposInfo existing. Generate file..."); // // Iterator it = peopleAndPlaces.iterator(); // Annotation currAnnot; // SortedAnnotationList sortedAnnotations = new SortedAnnotationList(); // // while(it.hasNext()) { // currAnnot = (Annotation) it.next(); // sortedAnnotations.addSortedExclusive(currAnnot); // } // while // // StringBuffer editableContent = new StringBuffer(originalContent); // long insertPositionEnd; // long insertPositionStart; // // insert annotation tags backward // Out.prln("Unsorted annotations count: "+peopleAndPlaces.size()); // Out.prln("Sorted annotations count: "+sortedAnnotations.size()); // for(int i=sortedAnnotations.size()-1; i>=0; --i) { // currAnnot = (Annotation) sortedAnnotations.get(i); // insertPositionStart = // currAnnot.getStartNode().getOffset().longValue(); // insertPositionStart = info.getOriginalPos(insertPositionStart); // insertPositionEnd = currAnnot.getEndNode().getOffset().longValue(); // insertPositionEnd = info.getOriginalPos(insertPositionEnd, true); // if(insertPositionEnd != -1 && insertPositionStart != -1) { // editableContent.insert((int)insertPositionEnd, endTag); // editableContent.insert((int)insertPositionStart, startTagPart_3); // editableContent.insert((int)insertPositionStart, // currAnnot.getType()); // editableContent.insert((int)insertPositionStart, startTagPart_2); // editableContent.insert((int)insertPositionStart, // currAnnot.getId().toString()); // editableContent.insert((int)insertPositionStart, startTagPart_1); // } // if // } // for // // FileWriter writer = new FileWriter(file); // writer.write(editableContent.toString()); // writer.close(); // } // if - should generate // else if (originalContent != null) { // Out.prln("OrigContent existing. 
Generate file..."); // // Iterator it = peopleAndPlaces.iterator(); // Annotation currAnnot; // SortedAnnotationList sortedAnnotations = new SortedAnnotationList(); // // while(it.hasNext()) { // currAnnot = (Annotation) it.next(); // sortedAnnotations.addSortedExclusive(currAnnot); // } // while // // StringBuffer editableContent = new StringBuffer(originalContent); // long insertPositionEnd; // long insertPositionStart; // // insert annotation tags backward // Out.prln("Unsorted annotations count: "+peopleAndPlaces.size()); // Out.prln("Sorted annotations count: "+sortedAnnotations.size()); // for(int i=sortedAnnotations.size()-1; i>=0; --i) { // currAnnot = (Annotation) sortedAnnotations.get(i); // insertPositionStart = // currAnnot.getStartNode().getOffset().longValue(); // insertPositionEnd = currAnnot.getEndNode().getOffset().longValue(); // if(insertPositionEnd != -1 && insertPositionStart != -1) { // editableContent.insert((int)insertPositionEnd, endTag); // editableContent.insert((int)insertPositionStart, startTagPart_3); // editableContent.insert((int)insertPositionStart, // currAnnot.getType()); // editableContent.insert((int)insertPositionStart, startTagPart_2); // editableContent.insert((int)insertPositionStart, // currAnnot.getId().toString()); // editableContent.insert((int)insertPositionStart, startTagPart_1); // } // if // } // for // // FileWriter writer = new FileWriter(file); // writer.write(editableContent.toString()); // writer.close(); // } // else { // Out.prln("Content : "+originalContent); // Out.prln("Repositioning: "+info); // } // // String xmlDocument = doc.toXml(peopleAndPlaces, false); // String fileName = new String("StANNIE_toXML_" + count + ".HTML"); // FileWriter writer = new FileWriter(fileName); // writer.write(xmlDocument); // writer.close(); // // } // for each doc // } // main // // /** // * // */ // public static class SortedAnnotationList extends Vector { // public SortedAnnotationList() { // super(); // } // 
SortedAnnotationList // // public boolean addSortedExclusive(Annotation annot) { // Annotation currAnot = null; // // // overlapping check // for (int i=0; i<size(); ++i) { // currAnot = (Annotation) get(i); // if(annot.overlaps(currAnot)) { // return false; // } // if // } // for // // long annotStart = annot.getStartNode().getOffset().longValue(); // long currStart; // // insert // for (int i=0; i < size(); ++i) { // currAnot = (Annotation) get(i); // currStart = currAnot.getStartNode().getOffset().longValue(); // if(annotStart < currStart) { // insertElementAt(annot, i); // /* // Out.prln("Insert start: "+annotStart+" at position: "+i+" size="+size()); // Out.prln("Current start: "+currStart); // */ // return true; // } // if // } // for // // int size = size(); // insertElementAt(annot, size); ////Out.prln("Insert start: "+annotStart+" at size position: "+size); // return true; // } // addSortedExclusive // } // SortedAnnotationList // } // class GateTextAnalyser