/* * StandAloneAnnie.java */ package wikidf; import java.util.*; import java.io.*; import java.net.*; import gate.*; import gate.creole.*; import gate.util.*; import gate.corpora.RepositioningInfo; /** * This class illustrates how to use ANNIE as a sausage machine * in another application - put ingredients in one end (URLs pointing * to documents) and get sausages (e.g. Named Entities) out the * other end. * <P><B>NOTE:</B><BR> * For simplicity's sake, we don't do any exception handling. */ public class StandAloneAnnie { /** The Corpus Pipeline application to contain ANNIE */ private SerialAnalyserController annieController; /** * Initialise the ANNIE system. This creates a "corpus pipeline" * application that can be used to run sets of documents through * the extraction system. */ public void initAnnie() throws GateException { Out.prln("Initialising ANNIE..."); // create a serial analyser controller to run ANNIE with annieController = (SerialAnalyserController) Factory.createResource( "gate.creole.SerialAnalyserController", Factory.newFeatureMap(), Factory.newFeatureMap(), "ANNIE_" + Gate.genSym() ); // load each PR as defined in ANNIEConstants for(int i = 0; i < ANNIEConstants.PR_NAMES.length; i++) { FeatureMap params = Factory.newFeatureMap(); // use default parameters String stmp = ANNIEConstants.PR_NAMES[i]; ProcessingResource pr = (ProcessingResource) Factory.createResource(ANNIEConstants.PR_NAMES[i], params); // add the PR to the pipeline controller annieController.add(pr); } // for each ANNIE PR Out.prln("...ANNIE loaded"); } // initAnnie() /** Tell ANNIE's controller about the corpus you want to run on */ public void setCorpus(Corpus corpus) { annieController.setCorpus(corpus); } // setCorpus /** Run ANNIE */ public void execute() throws GateException { Out.prln("Running ANNIE..."); annieController.execute(); Out.prln("...ANNIE complete"); } // execute() /** * Run from the command-line, with a list of URLs as argument. * <P><B>NOTE:</B><BR> * This code will run with all the documents in memory - if you * want to unload each from memory after use, add code to store * the corpus in a DataStore. */ public static void main(String args[]) throws GateException, IOException { String[] texts = new String[2]; // long //texts[0] = "file:/mnt/win_e/all/projects/java/aot/gate/russian/embedRPOST/data/en/signatures_en.txt"; //texts[1] = "file:/mnt/win_e/all/projects/java/aot/gate/russian/embedRPOST/data/en/Common_Sense_Problem_Page.txt"; // short texts[0] = "file:/mnt/win_e/projects/java/aot/rupostagger/data/en/signatures_en_short.txt"; texts[1] = "file:/mnt/win_e/projects/java/aot/rupostagger/data/ru/ABS_zmldks_short.txt"; args = texts; // initialise the GATE library Out.prln("Initialising GATE..."); Gate.init(); // Load ANNIE plugin File gateHome = Gate.getGateHome(); File pluginsHome = new File(gateHome, "plugins"); Gate.getCreoleRegister().registerDirectories(new File(pluginsHome, "ANNIE").toURI().toURL()); Out.prln("...GATE initialised"); // initialise ANNIE (this may take several minutes) StandAloneAnnie annie = new StandAloneAnnie(); annie.initAnnie(); // create a GATE corpus and add a document for each command-line // argument Corpus corpus = (Corpus) Factory.createResource("gate.corpora.CorpusImpl"); for(int i = 0; i < args.length; i++) { URL u = new URL(args[i]); FeatureMap params = Factory.newFeatureMap(); params.put("sourceUrl", u); params.put("preserveOriginalContent", new Boolean(true)); params.put("collectRepositioningInfo", new Boolean(true)); Out.prln("Creating doc for " + u); Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params); corpus.add(doc); } // for each of args // tell the pipeline about the corpus and run it annie.setCorpus(corpus); annie.execute(); // for each document, get an XML document with the // person and location names added Iterator iter = corpus.iterator(); int count = 0; String startTagPart_1 = "<span GateID=\""; String startTagPart_2 = "\" title=\""; String startTagPart_3 = "\" style=\"background:Red;\">"; String endTag = "</span>"; while(iter.hasNext()) { Document doc = (Document) iter.next(); AnnotationSet defaultAnnotSet = doc.getAnnotations(); Set annotTypesRequired = new HashSet(); annotTypesRequired.add("Person"); annotTypesRequired.add("Location"); AnnotationSet peopleAndPlaces = defaultAnnotSet.get(annotTypesRequired); FeatureMap features = doc.getFeatures(); String originalContent = (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME); RepositioningInfo info = (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME); ++count; File file = new File("StANNIE_" + count + ".HTML"); Out.prln("File name: '"+file.getAbsolutePath()+"'"); if(originalContent != null && info != null) { Out.prln("OrigContent and reposInfo existing. Generate file..."); Iterator it = peopleAndPlaces.iterator(); Annotation currAnnot; SortedAnnotationList sortedAnnotations = new SortedAnnotationList(); while(it.hasNext()) { currAnnot = (Annotation) it.next(); sortedAnnotations.addSortedExclusive(currAnnot); } // while StringBuffer editableContent = new StringBuffer(originalContent); long insertPositionEnd; long insertPositionStart; // insert anotation tags backward Out.prln("Unsorted annotations count: "+peopleAndPlaces.size()); Out.prln("Sorted annotations count: "+sortedAnnotations.size()); for(int i=sortedAnnotations.size()-1; i>=0; --i) { currAnnot = (Annotation) sortedAnnotations.get(i); insertPositionStart = currAnnot.getStartNode().getOffset().longValue(); insertPositionStart = info.getOriginalPos(insertPositionStart); insertPositionEnd = currAnnot.getEndNode().getOffset().longValue(); insertPositionEnd = info.getOriginalPos(insertPositionEnd, true); if(insertPositionEnd != -1 && insertPositionStart != -1) { editableContent.insert((int)insertPositionEnd, endTag); editableContent.insert((int)insertPositionStart, startTagPart_3); editableContent.insert((int)insertPositionStart, currAnnot.getType()); editableContent.insert((int)insertPositionStart, startTagPart_2); editableContent.insert((int)insertPositionStart, currAnnot.getId().toString()); editableContent.insert((int)insertPositionStart, startTagPart_1); } // if } // for FileWriter writer = new FileWriter(file); writer.write(editableContent.toString()); writer.close(); } // if - should generate else if (originalContent != null) { Out.prln("OrigContent existing. Generate file..."); Iterator it = peopleAndPlaces.iterator(); Annotation currAnnot; SortedAnnotationList sortedAnnotations = new SortedAnnotationList(); while(it.hasNext()) { currAnnot = (Annotation) it.next(); sortedAnnotations.addSortedExclusive(currAnnot); } // while StringBuffer editableContent = new StringBuffer(originalContent); long insertPositionEnd; long insertPositionStart; // insert anotation tags backward Out.prln("Unsorted annotations count: "+peopleAndPlaces.size()); Out.prln("Sorted annotations count: "+sortedAnnotations.size()); for(int i=sortedAnnotations.size()-1; i>=0; --i) { currAnnot = (Annotation) sortedAnnotations.get(i); insertPositionStart = currAnnot.getStartNode().getOffset().longValue(); insertPositionEnd = currAnnot.getEndNode().getOffset().longValue(); if(insertPositionEnd != -1 && insertPositionStart != -1) { editableContent.insert((int)insertPositionEnd, endTag); editableContent.insert((int)insertPositionStart, startTagPart_3); editableContent.insert((int)insertPositionStart, currAnnot.getType()); editableContent.insert((int)insertPositionStart, startTagPart_2); editableContent.insert((int)insertPositionStart, currAnnot.getId().toString()); editableContent.insert((int)insertPositionStart, startTagPart_1); } // if } // for FileWriter writer = new FileWriter(file); writer.write(editableContent.toString()); writer.close(); } else { Out.prln("Content : "+originalContent); Out.prln("Repositioning: "+info); } String xmlDocument = doc.toXml(peopleAndPlaces, false); String fileName = new String("StANNIE_toXML_" + count + ".HTML"); FileWriter writer = new FileWriter(fileName); writer.write(xmlDocument); writer.close(); // do something usefull with the XML here! Out.prln("'"+xmlDocument+"'"); } // for each doc } // main public static class SortedAnnotationList extends Vector { public SortedAnnotationList() { super(); } // SortedAnnotationList public boolean addSortedExclusive(Annotation annot) { Annotation currAnot = null; // overlapping check for (int i=0; i<size(); ++i) { currAnot = (Annotation) get(i); if(annot.overlaps(currAnot)) { return false; } // if } // for long annotStart = annot.getStartNode().getOffset().longValue(); long currStart; // insert for (int i=0; i < size(); ++i) { currAnot = (Annotation) get(i); currStart = currAnot.getStartNode().getOffset().longValue(); if(annotStart < currStart) { insertElementAt(annot, i); /* Out.prln("Insert start: "+annotStart+" at position: "+i+" size="+size()); Out.prln("Current start: "+currStart); */ return true; } // if } // for int size = size(); insertElementAt(annot, size); //Out.prln("Insert start: "+annotStart+" at size position: "+size); return true; } // addSorted } // SortedAnnotationList } // class StandAloneAnnie