package com.cse10.extractor.stanfordcorenlp; import com.cse10.article.Article; import com.cse10.article.CrimeArticle; import com.cse10.database.DatabaseHandler; import com.cse10.entities.CrimeEntityGroup; import com.cse10.extractor.stanfordcorenlp.detector.*; import edu.stanford.nlp.dcoref.CorefChain; import edu.stanford.nlp.dcoref.CorefCoreAnnotations; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.util.CoreMap; import java.util.*; /** * Created by TharinduWijewardane on 2015-01-01. */ public class Controller { private static StanfordCoreNLP pipeline; public static void main(String[] args) { List<Article> crimeArticles = loadCrimeArticles(); // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution Properties props = new Properties(); props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); pipeline = new StanfordCoreNLP(props); for (Article crimeArticle : crimeArticles) { System.out.println("---------------------- NEW ARTICLE ----------------------"); extractEntityGroup((CrimeArticle) crimeArticle); } } private static void extractEntityGroup(CrimeArticle crimeArticle) { String text = crimeArticle.getContent(); text = ContentFilter.manuelFilter(text); System.out.println(text); System.out.println("---"); // create an empty Annotation just with the given text Annotation document = new Annotation(text); // run all Annotators on this text pipeline.annotate(document); // these are all the sentences in this document // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class); CrimeEntityGroup crimeEntityGroup = buildCrimeEntityGroup(sentences); // This is the coreference link graph // Each chain stores a set of mentions that link to each other, // along with a method for getting the most representative mention // Both sentence and token offsets start at 1! Map<Integer, CorefChain> graph = document.get(CorefCoreAnnotations.CorefChainAnnotation.class); System.out.println("graph: "); System.out.println(graph); saveCrimeEntityGroup(crimeEntityGroup); } private static CrimeEntityGroup buildCrimeEntityGroup(List<CoreMap> sentences) { //to manage multiple entities extracted from different sentences ArrayList<String> crimeTypes = new ArrayList<String>(); ArrayList<String> criminals = new ArrayList<String>(); ArrayList<String> victims = new ArrayList<String>(); ArrayList<String> locations = new ArrayList<String>(); ArrayList<String> policeStations = new ArrayList<String>(); ArrayList<String> courts = new ArrayList<String>(); ArrayList<String> possessions = new ArrayList<String>(); ArrayList<String> prisons = new ArrayList<String>(); for (CoreMap sentence : sentences) { if (CrimeSentenceDetector.detectCrimeSentence(sentence)) { //if a crime sentence List<CoreLabel> tokens = new ArrayList<CoreLabel>(); // traversing the words in the current sentence // a CoreLabel is a CoreMap with additional token-specific methods for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) { // this is the text of the token String word = token.get(CoreAnnotations.TextAnnotation.class); System.out.println("\nword: " + word); // this is the POS tag of the token String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class); System.out.println("pos: " + pos); // this is the NER label of the token String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class); System.out.println("ne: " + ne); String lem = token.get(CoreAnnotations.LemmaAnnotation.class); System.out.println("lem: " + lem); tokens.add(token); } System.out.println(sentence); // to filter out empty strings checkAndAdd(crimeTypes, CrimeTypeDetector.findCrimeType(tokens)); checkAndAdd(criminals, CriminalDetector.findCriminal(tokens)); checkAndAdd(victims, VictimDetector.findVictim(tokens)); checkAndAdd(locations, LocationDetector.findLocation(tokens)); checkAndAdd(policeStations, PoliceStationDetector.findPolice(tokens)); checkAndAdd(courts, CourtDetector.findCourt(tokens)); checkAndAdd(possessions, PossessionDetector.findPossession(tokens)); checkAndAdd(prisons, PrisonDetector.findPrison(tokens)); } // // this is the parse tree of the current sentence // Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class); // tree.pennPrint(); // // // this is the Stanford dependency graph of the current sentence // SemanticGraph dependencies = sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class); // System.out.println("SemanticGraph: "); // dependencies.prettyPrint(); } CrimeEntityGroup crimeEntityGroup = new CrimeEntityGroup(); // to get most common value (obtained from different sentences) crimeEntityGroup.setCrimeType(getMostCommonElement(crimeTypes)); crimeEntityGroup.setCriminal(getMostCommonElement(criminals)); crimeEntityGroup.setVictim(getMostCommonElement(victims)); crimeEntityGroup.setLocation(getMostCommonElement(locations)); crimeEntityGroup.setPolice(getMostCommonElement(policeStations)); crimeEntityGroup.setCourt(getMostCommonElement(courts)); crimeEntityGroup.setPossession(getMostCommonElement(policeStations)); // prison? return crimeEntityGroup; } private static void saveCrimeEntityGroup(CrimeEntityGroup crimeEntityGroup) { DatabaseHandler.insertCrimeEntities(crimeEntityGroup); } private static List<Article> loadCrimeArticles() { // List<Article> crimeArticles = DatabaseHandler.fetchArticles(CrimeArticle.class); //TODO List<Article> crimeArticles = DatabaseHandler.fetchArticlesByIdRange(CrimeArticle.class, 151, 160); // for now return crimeArticles; } private static void checkAndAdd(ArrayList<String> list, String keyword) { // for filtering out null and empty values if (keyword != null && keyword.length() > 0) { list.add(keyword); } } private static String getMostCommonElement(ArrayList<String> list) { Collections.sort(list); String mostCommon = null; String last = null; int mostCount = 0; int lastCount = 0; for (String x : list) { System.out.println("in list: " + x); if (x.equalsIgnoreCase(last)) { lastCount++; } else if (lastCount > mostCount) { mostCount = lastCount; mostCommon = last; } last = x; } if (mostCommon == null) { mostCommon = last; } System.out.println("most common: " + mostCommon); return mostCommon; } }