package context.core.task.entitydetection;

import context.core.entity.CorpusData;
import context.core.entity.FileData;
import context.core.entity.TabularData;
import context.core.util.CorpusAggregator;
import context.core.util.ForAggregation;
import context.core.util.JavaIO;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Runs the Stanford NER classifiers over a corpus, merges the entities they
 * detect, counts each entity's occurrences, and writes the result to CSV.
 *
 * @author Aale
 */
public class EntityDetectionBody {

    private EntityDetectionTaskInstance instance;
    private CorpusData input;
    private List<TabularData> tabularOutput;
    private AbstractSequenceClassifier<?> classifier3;
    private AbstractSequenceClassifier<?> classifier4;
    private AbstractSequenceClassifier<?> classifier7;
    // private File stopFile;
    List<String[]> entitiesWithCount;

    /**
     * @param instance the task instance supplying the corpus and classifiers
     */
    public EntityDetectionBody(EntityDetectionTaskInstance instance) {
        this.instance = instance;
        init();
    }

    private void init() {
        this.input = (CorpusData) instance.getInput();
        this.tabularOutput = instance.getTabularOutput();
        this.classifier3 = instance.get3Classifier();
        this.classifier4 = instance.get4Classifier();
        this.classifier7 = instance.get7Classifier();
        System.out.println("init done successfully");
    }

    /**
     * Detects entities in every file of the corpus and aggregates them with
     * their occurrence counts.
     *
     * @return true on success, false if any file could not be processed
     */
    public boolean detectEntities() {
        System.out.println("start detectEntities...");
        List<FileData> files = input.getFiles();
        List<List<String[]>> toAggregate = new ArrayList<List<String[]>>();
        try {
            for (FileData ff : files) {
                File file = ff.getFile();
                String text;
                try {
                    text = JavaIO.readFile(file);
                    // Replace control characters, then strip everything except
                    // basic Latin letters, digits, and common punctuation.
                    text = text.replaceAll("\\p{Cc}", " ");
                    text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"-]", " ");

                    List<String[]> longEntities = new ArrayList<String[]>();
                    List<ForAggregation> longEntities3 = new ArrayList<ForAggregation>();
                    List<ForAggregation> longEntities4 = new ArrayList<ForAggregation>();
                    List<ForAggregation> longEntities7 = new ArrayList<ForAggregation>();
                    MultiWordEntities mwe3 = multiWordEntityRecognition(classifier3, text);
                    MultiWordEntities mwe4 = multiWordEntityRecognition(classifier4, text);
                    MultiWordEntities mwe7 = multiWordEntityRecognition(classifier7, text);
                    longEntities3.addAll(mwe3.forAgg);
                    //longEntities4.addAll(mwe4.forAgg);
                    longEntities7.addAll(mwe7.forAgg);

                    // Maps each entity string to the start offsets at which it
                    // has been recorded so far.
                    HashMap<String, Integer[]> entities = new HashMap<String, Integer[]>();
                    for (int entityIndex = 0; entityIndex < longEntities3.size(); entityIndex++) {
                        Integer[] offsetArray = {mwe3.startInd.get(entityIndex)};
                        entities.put(longEntities3.get(entityIndex).toAggregate[0], offsetArray);
                        longEntities.add(longEntities3.get(entityIndex).toAggregate);
                    }

                    // The 4-class classifier was meant to be merged in as well, but
                    // it caused problems for large individual documents, so
                    // "longEntities4.addAll(mwe4.forAgg);" is commented out above
                    // for now. In the future, a check on file size could decide
                    // whether to use it; a sketch of such a guard follows.
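                    // Minimal sketch of that file-size guard, left commented out:
                    // the byte threshold is a placeholder assumption, not a value
                    // from the original code.
                    // final long maxBytesFor4Class = 1_000_000L; // hypothetical limit
                    // if (file.length() < maxBytesFor4Class) {
                    //     longEntities4.addAll(mwe4.forAgg);
                    // }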
                    // Merge the 4-class and 7-class results, skipping any
                    // (entity, offset) pair that has already been recorded.
                    mergeEntities(longEntities4, mwe4, entities, longEntities);
                    mergeEntities(longEntities7, mwe7, entities, longEntities);

                    toAggregate.add(longEntities);
                } catch (IOException e) {
                    e.printStackTrace();
                    return false;
                }
            }

            List<String[]> entitiesWithCount = new CorpusAggregator().CorpusAggregate(toAggregate);
            // A disabled prototype here removed single-token entities found in a
            // stop-word list; it depended on the stopFile field that is
            // commented out above.

            // Recount each aggregated entity by scanning the cleaned file texts.
            for (int i1 = 0; i1 < entitiesWithCount.size(); i1++) {
                String findStr = entitiesWithCount.get(i1)[0];
                int count = 0;
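                // Note: the indexOf scan below also counts matches inside longer
                // tokens (e.g. "art" inside "party"). A stricter, hedged
                // alternative (Java 9+; a suggestion, not part of the original
                // code) would count whole-word matches in each cleaned file text:
                // count += (int) Pattern.compile("\\b" + Pattern.quote(findStr) + "\\b")
                //         .matcher(text).results().count();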
                for (FileData ff : files) {
                    File file = ff.getFile();
                    String text = JavaIO.readFile(file);
                    text = text.replaceAll("\\p{Cc}", " ");
                    text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"-]", " ");
                    int lastIndex = 0;
                    while (lastIndex != -1) {
                        lastIndex = text.indexOf(findStr, lastIndex);
                        if (lastIndex != -1) {
                            count++;
                            lastIndex += findStr.length();
                        }
                    }
                }
                entitiesWithCount.get(i1)[2] = count + "";
                System.out.println("word: " + entitiesWithCount.get(i1)[0]
                        + " frequency: " + entitiesWithCount.get(i1)[2]
                        + " our count: " + count);
            }
            this.entitiesWithCount = entitiesWithCount;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * Merges one classifier's entities into the shared collections: an
     * (entity, offset) pair that was already recorded is skipped, a known
     * entity at a new offset gets that offset appended, and a new entity is
     * inserted with its first offset.
     */
    private static void mergeEntities(List<ForAggregation> candidates, MultiWordEntities mwe,
            HashMap<String, Integer[]> entities, List<String[]> longEntities) {
        for (int entityIndex = 0; entityIndex < candidates.size(); entityIndex++) {
            String key = candidates.get(entityIndex).toAggregate[0];
            Integer offset = mwe.startInd.get(entityIndex);
            Integer[] offsets = entities.get(key);
            if (offsets != null && Arrays.asList(offsets).contains(offset)) {
                continue;
            }
            if (offsets != null) {
                Integer[] grown = Arrays.copyOf(offsets, offsets.length + 1);
                grown[grown.length - 1] = offset;
                entities.put(key, grown);
            } else {
                entities.put(key, new Integer[]{offset});
            }
            longEntities.add(candidates.get(entityIndex).toAggregate);
        }
    }

    /**
     * Runs one classifier over the text and extracts each tagged entity
     * together with the start index of that occurrence in the input.
     */
    private MultiWordEntities multiWordEntityRecognition(AbstractSequenceClassifier<?> classifier, String inText) {
        List<ForAggregation> namedEntities = new ArrayList<ForAggregation>();
        String htmlString = classifier.classifyToString(inText, "inlineXML", true);
        Pattern tags = Pattern.compile("<.+?>.+?</.+?>");
        Matcher matcher = tags.matcher(htmlString);
        List<Integer> startIndices = new ArrayList<Integer>();
        HashMap<String, Integer> hashedNumOcc = new HashMap<String, Integer>();
        while (matcher.find()) {
            // The entity text is the tagged span with the XML tags stripped;
            // the entity type is the name of the opening tag.
            // (A disabled prototype here skipped single-word entities.)
            String name = matcher.group().replaceAll("<.+?>", "");
            String[] namedEntityArray = {name.trim().replaceAll(" +", " "),
                    matcher.group().replaceAll("<", "").replaceAll(">.+", "")};
            if (hashedNumOcc.containsKey(name)) {
                hashedNumOcc.put(name, hashedNumOcc.get(name) + 1);
            } else {
                hashedNumOcc.put(name, 1);
            }
            startIndices.add(findNthIndexOf(inText, name, hashedNumOcc.get(name)));
            namedEntities.add(new ForAggregation(namedEntityArray));
        }
        return new MultiWordEntities(namedEntities, startIndices);
    }

    /**
     * Returns the start index of the nth occurrence of {@code needle} in
     * {@code str}, or 0 if there are fewer than n occurrences.
     */
    private int findNthIndexOf(String str, String needle, int occurrence) {
        // Quote the needle so entity text containing regex metacharacters
        // (e.g. "." or "?") is matched literally.
        Matcher m = Pattern.compile(Pattern.quote(needle)).matcher(str);
        while (m.find()) {
            if (--occurrence == 0) {
                return m.start();
            }
        }
        return 0;
    }

    /**
     * Writes the detected entities to a CSV file.
     *
     * @param filepath the destination path
     */
    public void writeOutput(String filepath) {
        writeCsv(entitiesWithCount, filepath);
    }

    /**
     * Writes one "Term, Entity, Frequency" row per detected entity, replacing
     * any existing file at the given path.
     *
     * @param entitiesWithCount rows of [term, entity type, frequency]
     * @param filePath the destination path
     */
    public static void writeCsv(List<String[]> entitiesWithCount, String filePath) {
        System.out.println("size of entitiesWithCount=" + entitiesWithCount.size());
        StringBuilder sb = new StringBuilder();
        sb.append("Term, Entity, Frequency\n");
        for (int i1 = 0; i1 < entitiesWithCount.size(); i1++) {
            // Replace characters that would break the CSV term column.
            sb.append(entitiesWithCount.get(i1)[0].replaceAll("[^A-Za-z0-9\\. ]", "_"))
                    .append(",").append(entitiesWithCount.get(i1)[1])
                    .append(",").append(entitiesWithCount.get(i1)[2]).append("\n");
        }
        // Delete any existing file before writing (added 2016.03).
        File toDelete = new File(filePath);
        if (toDelete.exists()) {
            toDelete.delete();
        }
        FileData.writeDataIntoFile(sb.toString(), filePath);
    }
}
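
/*
 * Usage sketch (an assumption, not code from this project): a caller holding a
 * configured EntityDetectionTaskInstance, using only the accessors referenced
 * in init() above, might drive this class as follows.
 *
 *   EntityDetectionBody body = new EntityDetectionBody(instance);
 *   if (body.detectEntities()) {
 *       body.writeOutput("entities.csv"); // hypothetical output path
 *   }
 */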