package com.cse10.extractor.gate; /** * Created with IntelliJ IDEA. * User: Isuru Jayaweera * Date: 01/26/15 * Extract entities from crime news articles and add those entities to a separate table. */ import com.cse10.article.Article; import com.cse10.article.CrimeArticle; import com.cse10.database.DatabaseHandler; import com.cse10.entities.CrimeEntityGroup; import com.cse10.entities.LocationDistrictMapper; import gate.*; import gate.annotation.AnnotationImpl; import gate.util.GateException; import gate.util.persistence.PersistenceManager; import org.apache.log4j.Logger; import org.hibernate.ObjectNotFoundException; import org.hibernate.exception.DataException; import java.io.*; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.*; public class EntityExtractor extends Observable { // path to the saved application file. private File gappFile; // path to the configuration file containing ID of the last extracted entity. private File configFile; // list of annotation types to write out. If null, write everything as GateXML. private List annotTypesToWrite; // fetch district name from google map api response private DistrictExtractor de; // stores entities temporary till they are inserted to the table private ArrayList<CrimeEntityGroup> entityGroupsList; // ID of the last entity extracted article private int endID; // declare logger private Logger logger; // constructor EntityExtractor(){ gappFile = new File("Extractor/src/main/resources/Complete_v1.gapp"); configFile = new File("Extractor/src/main/resources/Configuration.txt"); annotTypesToWrite = new ArrayList<>(Arrays.asList("CrimeLocation", "ArticleType", "Police", "Court", "CrimeDate", "CrimePerson")); logger = Logger.getLogger(this.getClass()); de = new DistrictExtractor(); entityGroupsList = new ArrayList<>(); } public synchronized boolean startExtraction() throws InterruptedException, IOException, GateException, ParseException { // to check whether execution was successful or not boolean isSuccessful = true; //get ID of the article to start entity extraction. int startID = getLastID(); //set ID of the last entity extracted article to starting article. endID = startID; // setting gate.home variable File gateHome = getGATEHome(); // initialise GATE Gate.init(); // load the saved application CorpusController application; application = (CorpusController) PersistenceManager.loadObjectFromFile(gappFile); Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus"); application.setCorpus(corpus); // fetches news articles from database // List<Article> articles = DatabaseHandler.fetchArticles(CrimeArticle.class); List<Article> articles = DatabaseHandler.fetchArticlesByIdStarting(CrimeArticle.class,startID+1); ArrayList<CrimeEntityGroup> resultsList; resultsList = executeProcessPipeline(articles, corpus, application); if(resultsList != null && !resultsList.isEmpty()){ isSuccessful = true; } DatabaseHandler.closeDatabase(); logger.info("All done"); return isSuccessful; } // method to fetch district for the location using google map api, unless it is in the location - district // mapping table public void resolveLocation(String location, CrimeEntityGroup entityGroupOfArticle, int articleID) { LocationDistrictMapper locationDistrict; String district = "NULL"; try { // try to retrieve district of the location from the location - district mapping table locationDistrict = DatabaseHandler.fetchLocation(location); district = locationDistrict.getDistrict(); entityGroupOfArticle.setCrimeArticleId(articleID); entityGroupOfArticle.setLocation(location); entityGroupOfArticle.setLocationDistrict(locationDistrict); } catch (ObjectNotFoundException e) { // unless district is present in the location - district mapping table // fetch district for the location using google map api and relevant location - district mapping data into // the location - district mapping table for future reference district = de.getDistrict(location); if (!district.equalsIgnoreCase("NULL")) { locationDistrict = new LocationDistrictMapper(location, district); try { DatabaseHandler.insertLocationDistrict(locationDistrict); entityGroupOfArticle.setCrimeArticleId(articleID); entityGroupOfArticle.setLocation(location); entityGroupOfArticle.setLocationDistrict(locationDistrict); }catch (DataException dataE){ logger.info("Long district name : "+district+" for location : "+location); district = null; } } } } // get ID of the crime article which was processed with entity extraction process private int getLastID() throws InterruptedException{ int theID = 0; BufferedReader br = null; try { String sCurrentLine; br = new BufferedReader(new FileReader(configFile)); if ((sCurrentLine = br.readLine()) != null) { theID = Integer.parseInt(sCurrentLine); } } catch (IOException e) { logger.info("Configuration File Not Found : ",e); DatabaseHandler.closeDatabase(); throw new InterruptedException("Thread interruption forced."); } finally { try { if (br != null)br.close(); } catch (IOException ex) { ex.printStackTrace(); } } return theID; } // write the last entity extracted article ID to the configuration file private void writeLastID(){ FileWriter fooWriter = null; try { // false to overwrite. fooWriter = new FileWriter(configFile, false); fooWriter.write(String.valueOf(endID)); } catch (IOException e) { e.printStackTrace(); }finally { try { fooWriter.close(); } catch (IOException ex) { ex.printStackTrace(); } } } // get and set GATEHOME property to access GATE FrameWork private File getGATEHome() throws InterruptedException{ String homePath = null; File gateHome; if (Gate.getGateHome() == null) { // get GATEHOME from system variables homePath = System.getenv("GATE_HOME"); if (homePath == null) { System.out.print("Enter GATE Home path : "); BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); try { homePath = br.readLine(); } catch (IOException e) { logger.info("Incorrect Path"); DatabaseHandler.closeDatabase(); throw new InterruptedException("Thread interruption forced."); } } }else{ homePath = Gate.getGateHome().getPath(); } // check whether GATEHOME is correct or not File pathCheck = new File(homePath + "\\gate.xml"); if (pathCheck.isFile() && Gate.getGateHome() == null) { gateHome = new File(homePath); Gate.setGateHome(gateHome); logger.info("GATE Home Configured : " + Gate.getGateHome()); } else if(!pathCheck.isFile() && Gate.getGateHome() == null) { logger.info("GATE Home Path Incorrect : "+homePath); DatabaseHandler.closeDatabase(); throw new InterruptedException("Thread interruption forced."); } else { gateHome = new File(homePath); } return gateHome; } protected ArrayList<CrimeEntityGroup> executeProcessPipeline(List<Article> articles, Corpus corpus, CorpusController application) throws InterruptedException, GateException, ParseException { // number of articles has to be entity extracted to progress the progress bar by 1 step. int uiStepSize = articles.size()/100; // progress of the entity extraction process int currentProgress = 1; // list of extracted entity groups ArrayList<CrimeEntityGroup> crimeEntityGroupList = new ArrayList<CrimeEntityGroup>(); // process the files one by one for (int i = 0; i < articles.size(); i++) { Article currentArticle = articles.get(i); String articleLabel; try { articleLabel = currentArticle.getLabel(); } catch (NullPointerException e) { continue; } if (articleLabel != null && articleLabel.equalsIgnoreCase("crime")) { String articleContent; Date articleDate; DateFormat format = new SimpleDateFormat("yyyy-MM-dd", Locale.ENGLISH); try { articleContent = currentArticle.getContent(); articleDate = currentArticle.getCreatedDate(); } catch (NullPointerException e) { continue; } // append created date to the processing document articleContent = articleContent + ".::" + articleDate + "::."; int articleLength = articleContent.length(); logger.info("New Article size : " + articleLength); if (articleLength > 2500) { articleContent = currentArticle.getTitle(); } // load the document Document doc = Factory.newDocument(articleContent); // put the document in the corpus corpus.add(doc); // run the application application.execute(); // remove the document from the corpus again corpus.clear(); Set annotationsToWrite = new HashSet(); // extracting the annotations into a Set if (annotTypesToWrite != null) { // extracting annotations from the default AnnotationSet AnnotationSet defaultAnnots = doc.getAnnotations(); Iterator annotTypesIt = annotTypesToWrite.iterator(); while (annotTypesIt.hasNext()) { // extracting all the annotations of each requested type and add them to // the temporary set AnnotationSet annotsOfThisType = defaultAnnots.get((String) annotTypesIt.next()); if (annotsOfThisType != null) { annotationsToWrite.addAll(annotsOfThisType); } } } // Release the document Factory.deleteResource(doc); logger.info("Article : " + i + " -Begins Here-"); // crime entity details String district = "NULL"; String location = "NULL"; String police = "NULL"; String court = "NULL"; String crimeType = "other"; String crimePeople = "NULL"; HashSet<String> crimePeopleSet = new HashSet<>(); int articleID = currentArticle.getId(); Date crimeDate = articleDate; LocationDistrictMapper locationDistrict; CrimeEntityGroup entityGroupOfArticle = new CrimeEntityGroup(); // iterate through each annotation Iterator annotIt = annotationsToWrite.iterator(); while (annotIt.hasNext()) { // extract all the annotations of each requested type and add them to // the temporary set AnnotationImpl CurrentAnnot = (AnnotationImpl) annotIt.next(); String antText = Utils.stringFor(doc, CurrentAnnot); // check for crime location annotation if (CurrentAnnot.getType().equalsIgnoreCase("CrimeLocation")) { location = antText; // fetch district for the location using google map api, unless it is in the location - district // mapping table resolveLocation(location, entityGroupOfArticle, articleID); } // check for crime type annotation and set crime type on crime entity details if (CurrentAnnot.getType().equalsIgnoreCase("ArticleType")) { try { crimeType = CurrentAnnot.getFeatures().get("article_type").toString(); entityGroupOfArticle.setCrimeType(crimeType); } catch (NullPointerException e) { logger.info("****** Not normalized : " + currentArticle.getTitle() + " **********"); } } // check for crime person annotation and add into a HashSet of crime people if (CurrentAnnot.getType().equalsIgnoreCase("CrimePerson")) { if (!crimePeopleSet.contains(antText)) { crimePeopleSet.add(antText); } } // check for police annotation and set police location on crime entity details if (CurrentAnnot.getType().equalsIgnoreCase("Police")) { police = antText; entityGroupOfArticle.setPolice(police); } // check for court annotation and set court location on crime entity details if (CurrentAnnot.getType().equalsIgnoreCase("Court")) { court = antText; entityGroupOfArticle.setCourt(court); } // check for crime date annotation, get normalized date and parse it to the required date format if (CurrentAnnot.getType().equalsIgnoreCase("CrimeDate")) { try { crimeDate = format.parse(CurrentAnnot.getFeatures().get("normalized").toString()); } catch (NullPointerException e) { logger.info("****** Not normalized : " + antText + " **********"); } } } if (entityGroupOfArticle.getLocationDistrict() == null && entityGroupOfArticle.getPolice() != null) { police = entityGroupOfArticle.getPolice(); // fetch district for the location using google map api, unless it is in the location - district // mapping table resolveLocation(police, entityGroupOfArticle, articleID); } if (entityGroupOfArticle.getLocationDistrict() == null && entityGroupOfArticle.getCourt() != null) { court = entityGroupOfArticle.getCourt(); // fetch district for the location using google map api, unless it is in the location - district // mapping table resolveLocation(court, entityGroupOfArticle, articleID); } if (entityGroupOfArticle.getLocationDistrict() != null) { // set crime date on crime entity details entityGroupOfArticle.setCrimeDate(crimeDate); // add to local list of crime entity sets crimeEntityGroupList.add(entityGroupOfArticle); // insert people involved in the crime to crime etity details and add crime entity and people // involved it into the DB DatabaseHandler.insertCrimeDetails(entityGroupOfArticle, crimePeopleSet); } endID = articleID; // check all crime details are properly entered logger.info("CrimeType : " + crimeType); logger.info("Crime Date : " + format.format(crimeDate)); logger.info("Article Title : " + currentArticle.getTitle()); logger.info("Crime Location : " + location); if (entityGroupOfArticle.getLocationDistrict() != null) { logger.info("District : " + entityGroupOfArticle.getLocationDistrict().getDistrict()); } logger.info("Crime People : " + crimePeople); logger.info("Police Location : " + police); logger.info("Court Location : " + court); logger.info("Article : " + i + " -Ends Here-"); logger.info(""); // check whether this thread is interrupted from out side if(Thread.interrupted()) { logger.info("Interruption Identified."); DatabaseHandler.closeDatabase(); throw new InterruptedException("Thread interruption forced."); } } // updating the progress of the entity extraction process if(uiStepSize != 0) { if (i % uiStepSize == 0) { logger.info("Progress updating."); currentProgress = i / uiStepSize; if(currentProgress < 100 && currentProgress >= 0) { setChanged(); notifyObservers(currentProgress); } } } }// for each article currentProgress = 100; setChanged(); notifyObservers(currentProgress); return crimeEntityGroupList; } // execute on interrupt or end of the process public synchronized boolean stopExtraction(){ writeLastID(); return true; } }