package com.cse10.gate; import gate.*; import gate.annotation.AnnotationImpl; import gate.creole.ExecutionException; import gate.creole.ResourceInstantiationException; import gate.util.GateException; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.util.*; import java.util.logging.Level; import java.util.logging.Logger; /** * use to filter the content the filter * Created by Chamath on 12/20/2014. */ public class DocumentContentFilter { private Corpus corpus; private CorpusPipeLine corpusPipeLine; private List annotationTypesRequired; private boolean isGateHomeConfigured; private boolean isCorpusPipeLineConfigured; public DocumentContentFilter() { isGateHomeConfigured = false; isCorpusPipeLineConfigured = false; } /** * configure gate home */ private void configureGateHome() { if (!isGateHomeConfigured) { String homePath = "\\home"; File gateHome; if (Gate.getGateHome() == null) { homePath = System.getenv("GATE_HOME"); if (homePath == null) { //if environment variable is null, then prompt user to enter the path System.out.print("Enter GATE Home path : "); BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); try { homePath = br.readLine(); } catch (IOException e) { e.printStackTrace(); } } System.out.println("Gate Home Path= " + homePath); //check whether the provided path is correct File pathCheck = new File(homePath + "\\gate.xml"); if (pathCheck.exists()) { gateHome = new File(homePath); Gate.setGateHome(gateHome); System.out.println("GATE Home has been Configured : " + Gate.getGateHome()); } else { System.out.println("GATE Home Path is Incorrect"); System.exit(0); } } isGateHomeConfigured = true; } } /** * configure corpus pipe line */ private void configureCorpusPipeLine() { if (!isCorpusPipeLineConfigured) { try { configureGateHome(); Gate.init(); } catch (GateException ex) { Logger.getLogger(DocumentContentFilter.class.getName()).log(Level.SEVERE, null, ex); } annotationTypesRequired = new ArrayList(); ArrayList<String> annotationTypes = new ArrayList(); annotationTypes.add("Token"); ListIterator iter = annotationTypes.listIterator(); while (iter.hasNext()) { String annotation = (String) iter.next(); annotationTypesRequired.add(annotation); } try { // create corpus corpus = Factory.newCorpus("StandAloneAnnie corpus"); } catch (ResourceInstantiationException ex) { Logger.getLogger(DocumentContentFilter.class.getName()).log(Level.SEVERE, null, ex); } corpusPipeLine = new CorpusPipeLine(); corpusPipeLine.configure(true); isCorpusPipeLineConfigured = true; } } /** * get filtered content of the given document content * * @param content * @return */ public String getFilterdContent(String content) { Document doc; String filteredContent = ""; configureCorpusPipeLine(); try { doc = Factory.newDocument(content); // create new gate document corpus.add(doc); corpusPipeLine.setCorpus(corpus); try { corpusPipeLine.execute(); } catch (ExecutionException ex) { Logger.getLogger(DocumentContentFilter.class.getName()).log(Level.SEVERE, null, ex); } corpus.clear(); //temporary collection to add annotations of required type Set annotationsRequired = new HashSet(); /** * extract the required annotations into a Set */ if (this.annotationTypesRequired != null) { AnnotationSet defaultAnnotations = doc.getAnnotations(); Iterator annotationsTypesRequiredIterator = this.annotationTypesRequired.iterator(); while (annotationsTypesRequiredIterator.hasNext()) { /** * extract all the annotations of each required type and * add them to collection */ AnnotationSet annotationsOfThisType = defaultAnnotations.get((String) annotationsTypesRequiredIterator.next()); if (annotationsOfThisType != null) { annotationsRequired.addAll(annotationsOfThisType); } } } // Release the document, as it is no longer needed Factory.deleteResource(doc); Iterator annotationsRequiredIterator = annotationsRequired.iterator(); while (annotationsRequiredIterator.hasNext()) { AnnotationImpl currentAnnotation = (AnnotationImpl) annotationsRequiredIterator.next(); if (currentAnnotation.getType().equalsIgnoreCase("Token") && (currentAnnotation.getFeatures().get("category") == "NN" || currentAnnotation.getFeatures().get("category") == "NNS")) { filteredContent = filteredContent.concat(currentAnnotation.getFeatures().get("string").toString()); filteredContent = filteredContent.concat(" "); } else if (currentAnnotation.getType().equalsIgnoreCase("Token") && (currentAnnotation.getFeatures().get("category") == "JJ" || currentAnnotation.getFeatures().get("category") == "JJR" || currentAnnotation.getFeatures().get("category") == "JJS")) { filteredContent = filteredContent.concat(currentAnnotation.getFeatures().get("string").toString()); filteredContent = filteredContent.concat(" "); } else if (currentAnnotation.getType().equalsIgnoreCase("Token") && (currentAnnotation.getFeatures().get("category") == "RB" || currentAnnotation.getFeatures().get("category") == "RBR" || currentAnnotation.getFeatures().get("category") == "RBS")) { filteredContent = filteredContent.concat(currentAnnotation.getFeatures().get("string").toString()); filteredContent = filteredContent.concat(" "); } else if (currentAnnotation.getType().equalsIgnoreCase("Token") && (currentAnnotation.getFeatures().get("category").equals("VBD") || currentAnnotation.getFeatures().get("category") == "VBG" || currentAnnotation.getFeatures().get("category") == "VBN" || currentAnnotation.getFeatures().get("category") == "VBP" || currentAnnotation.getFeatures().get("category") == "VB" || currentAnnotation.getFeatures().get("category") == "VBZ")) { filteredContent = filteredContent.concat(currentAnnotation.getFeatures().get("string").toString()); filteredContent = filteredContent.concat(" "); } } } catch (ResourceInstantiationException ex) { } return filteredContent; } }