package edu.stanford.nlp.patterns; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.stats.ClassicCounter; import edu.stanford.nlp.stats.Counter; import edu.stanford.nlp.util.ArgumentParser; import edu.stanford.nlp.util.Pair; import edu.stanford.nlp.util.StringUtils; import edu.stanford.nlp.util.TypesafeMap; import edu.stanford.nlp.patterns.surface.*; import javax.json.*; import java.io.File; import java.io.IOException; import java.io.StringReader; import java.lang.reflect.InvocationTargetException; import java.sql.SQLException; import java.util.*; import java.util.concurrent.ExecutionException; import java.util.logging.Logger; /** * Created by sonalg on 3/10/15. */ public class TextAnnotationPatterns { Map<String, Class<? extends TypesafeMap.Key<String>>> humanLabelClasses = new HashMap<>(); Map<String, Class<? extends TypesafeMap.Key<String>>> machineAnswerClasses = new HashMap<>(); Properties props; String outputFile; Counter<String> matchedSeedWords; Map<String, Set<CandidatePhrase>> seedWords = new HashMap<>(); private String backgroundSymbol ="O"; //Properties testProps = new Properties(); Logger logger = Logger.getAnonymousLogger(); public TextAnnotationPatterns() throws IOException { // if(testPropertiesFile!= null && new File(testPropertiesFile).exists()){ // logger.info("Loading test properties from " + testPropertiesFile); // testProps.load(new FileReader(testPropertiesFile)); // } } public String getAllAnnotations() { JsonObjectBuilder obj = Json.createObjectBuilder(); for(Map.Entry<String, DataInstance> sent: Data.sents.entrySet()){ boolean sentHasLabel = false; JsonObjectBuilder objsent = Json.createObjectBuilder(); int tokenid = 0; for(CoreLabel l : sent.getValue().getTokens()){ boolean haslabel = false; JsonArrayBuilder labelArr = Json.createArrayBuilder(); for(Map.Entry<String, Class<? extends TypesafeMap.Key<String>>> en: this.humanLabelClasses.entrySet()){ if(!l.get(en.getValue()).equals(backgroundSymbol)){ haslabel = true; sentHasLabel = true; labelArr.add(en.getKey()); } } if(haslabel) objsent.add(String.valueOf(tokenid), labelArr); tokenid++; } if(sentHasLabel) obj.add(sent.getKey(), objsent); } return obj.build().toString(); } public String getAllAnnotations(String input) { JsonObjectBuilder objsent = Json.createObjectBuilder(); int tokenid = 0; for(CoreLabel l : Data.sents.get(input).getTokens()){ boolean haslabel = false; JsonArrayBuilder labelArr = Json.createArrayBuilder(); for(Map.Entry<String, Class<? extends TypesafeMap.Key<String>>> en: this.humanLabelClasses.entrySet()){ if(!l.get(en.getValue()).equals(backgroundSymbol)){ haslabel = true; labelArr.add(en.getKey()); } } if(haslabel) objsent.add(String.valueOf(tokenid), labelArr); tokenid++; } return objsent.build().toString(); } public String suggestPhrases() throws IOException, ClassNotFoundException, IllegalAccessException, InterruptedException, ExecutionException, InstantiationException, NoSuchMethodException, InvocationTargetException { resetPatternLabelsInSents(Data.sents); GetPatternsFromDataMultiClass<SurfacePattern> model = new GetPatternsFromDataMultiClass<>(props, Data.sents, seedWords, false, humanLabelClasses); //model.constVars.numIterationsForPatterns = 2; model.iterateExtractApply(); return model.constVars.getLearnedWordsAsJson(); } public String suggestPhrasesTest(Properties testProps, String modelPropertiesFile, String stopWordsFile) throws IllegalAccessException, InterruptedException, ExecutionException, IOException, InstantiationException, NoSuchMethodException, InvocationTargetException, ClassNotFoundException, SQLException { logger.info("Suggesting phrases in test"); logger.info("test properties are " + testProps); Properties runProps = StringUtils.argsToPropertiesWithResolve(new String[]{"-props",modelPropertiesFile}); String[] removeProperties = new String[]{"allPatternsDir","storePatsForEachToken","invertedIndexClass","savePatternsWordsDir","batchProcessSents","outDir","saveInvertedIndex","removeOverLappingLabels","numThreads"}; for(String s: removeProperties) if(runProps.containsKey(s)) runProps.remove(s); runProps.setProperty("stopWordsPatternFiles",stopWordsFile); runProps.setProperty("englishWordsFiles", stopWordsFile); runProps.setProperty("commonWordsPatternFiles", stopWordsFile); runProps.putAll(props); runProps.putAll(testProps); props.putAll(runProps); processText(false); GetPatternsFromDataMultiClass<SurfacePattern> model = new GetPatternsFromDataMultiClass<>(runProps, Data.sents, seedWords, true, humanLabelClasses); ArgumentParser.fillOptions(model, runProps); GetPatternsFromDataMultiClass.loadFromSavedPatternsWordsDir(model , runProps); Map<String, Integer> alreadyLearnedIters = new HashMap<>(); for(String label: model.constVars.getLabels()) alreadyLearnedIters.put(label, model.constVars.getLearnedWordsEachIter().get(label).lastEntry().getKey()); if (model.constVars.learn) { // Map<String, E> p0 = new HashMap<String, SurfacePattern>(); // Map<String, Counter<CandidatePhrase>> p0Set = new HashMap<String, Counter<CandidatePhrase>>(); // Map<String, Set<E>> ignorePatterns = new HashMap<String, Set<E>>(); model.iterateExtractApply(null, null, null); } Map<String, Counter<CandidatePhrase>> allExtractions = new HashMap<>(); //Only for one label right now! String label = model.constVars.getLabels().iterator().next(); allExtractions.put(label, new ClassicCounter<>()); for(Map.Entry<String, DataInstance> sent: Data.sents.entrySet()){ StringBuffer str = new StringBuffer(); for(CoreLabel l : sent.getValue().getTokens()){ if(l.get(PatternsAnnotations.MatchedPatterns.class) != null && !l.get(PatternsAnnotations.MatchedPatterns.class).isEmpty()){ str.append(" " + l.word()); }else{ allExtractions.get(label).incrementCount(CandidatePhrase.createOrGet(str.toString().trim())); str.setLength(0); } } } allExtractions.putAll(model.matchedSeedWords); return model.constVars.getSetWordsAsJson(allExtractions); } //label the sents with the labels provided by humans private void resetPatternLabelsInSents(Map<String, DataInstance> sents) { for(Map.Entry<String, DataInstance> sent: sents.entrySet()){ for(CoreLabel l : sent.getValue().getTokens()){ for(Map.Entry<String, Class<? extends TypesafeMap.Key<String>>> cl: humanLabelClasses.entrySet()){ l.set(machineAnswerClasses.get(cl.getKey()), l.get(cl.getValue())); } } } } public String getMatchedTokensByAllPhrases(){ return GetPatternsFromDataMultiClass.matchedTokensByPhraseJsonString(); } public String getMatchedTokensByPhrase(String input){ return GetPatternsFromDataMultiClass.matchedTokensByPhraseJsonString(input); } private void setProperties(Properties props){ if(!props.containsKey("fileFormat")) props.setProperty("fileFormat","txt"); if(!props.containsKey("learn")) props.setProperty("learn","false"); if(!props.containsKey("patternType")) props.setProperty("patternType","SURFACE"); props.setProperty("preserveSentenceSequence", "true"); if(!props.containsKey("debug")) props.setProperty("debug","3"); if(!props.containsKey("thresholdWordExtract")) props.setProperty("thresholdWordExtract","0.00000000000000001"); if(!props.containsKey("thresholdNumPatternsApplied")) props.setProperty("thresholdNumPatternsApplied", "1"); if(!props.containsKey("writeMatchedTokensIdsForEachPhrase")) props.setProperty("writeMatchedTokensIdsForEachPhrase","true"); } void setUpProperties(String line, boolean readFile, boolean writeOutputToFile, String additionalSeedWordsFiles) throws IOException, ClassNotFoundException { JsonReader jsonReader = Json.createReader(new StringReader(line)); JsonObject objarr = jsonReader.readObject(); jsonReader.close(); Properties props = new Properties(); for (String o : objarr.keySet()){ if(o.equals("seedWords")){ JsonObject obj = objarr.getJsonObject(o); for (String st : obj.keySet()){ seedWords.put(st, new HashSet<>()); JsonArray arr = obj.getJsonArray(st); for(int i = 0; i < arr.size(); i++){ String val = arr.getString(i); seedWords.get(st).add(CandidatePhrase.createOrGet(val)); System.out.println("adding " + val + " for label " + st); } } }else props.setProperty(o, objarr.getString(o)); } System.out.println("seedwords are " + seedWords); if(additionalSeedWordsFiles != null && !additionalSeedWordsFiles.isEmpty()) { Map<String, Set<CandidatePhrase>> additionalSeedWords = GetPatternsFromDataMultiClass.readSeedWords(additionalSeedWordsFiles); logger.info("additional seed words are " + additionalSeedWords); for (String label : seedWords.keySet()) { if(additionalSeedWords.containsKey(label)) seedWords.get(label).addAll(additionalSeedWords.get(label)); } } outputFile = null; if(readFile) { System.out.println("input value is " + objarr.getString("input")); outputFile = props.getProperty("input") + "_processed"; props.setProperty("file",objarr.getString("input")); if (writeOutputToFile && !props.containsKey("columnOutputFile")) props.setProperty("columnOutputFile", outputFile); } else{ String systemdir = System.getProperty("java.io.tmpdir"); File tempFile= File.createTempFile("sents", ".tmp", new File(systemdir)); tempFile.deleteOnExit(); IOUtils.writeStringToFile(props.getProperty("input"),tempFile.getPath(), "utf8"); props.setProperty("file", tempFile.getAbsolutePath()); } setProperties(props); this.props = props; int i = 1; for (String label : seedWords.keySet()) { String ansclstr = "edu.stanford.nlp.patterns.PatternsAnnotations$PatternLabel" + i; Class<? extends TypesafeMap.Key<String>> mcCl = (Class<? extends TypesafeMap.Key<String>>) Class.forName(ansclstr); machineAnswerClasses.put(label, mcCl); String humanansclstr = "edu.stanford.nlp.patterns.PatternsAnnotations$PatternHumanLabel" + i; humanLabelClasses.put(label, (Class<? extends TypesafeMap.Key<String>>) Class.forName(humanansclstr)); i++; } } //the format of the line input is json string of maps. required keys are "input" and "seedWords". "input" can be a string or file (in which case readFile should be true.) // For example: {"input":"presidents.txt","seedWords":{"name":["Obama"],"place":["Chicago"]}} public String processText(boolean writeOutputToFile) throws IOException, InstantiationException, InvocationTargetException, ExecutionException, SQLException, InterruptedException, IllegalAccessException, ClassNotFoundException, NoSuchMethodException { logger.info("Starting to process text"); logger.info("all seed words are " + seedWords); Pair<Map<String, DataInstance>, Map<String, DataInstance>> sentsPair = GetPatternsFromDataMultiClass.processSents(props, seedWords.keySet()); Data.sents = sentsPair.first(); ConstantsAndVariables constVars = new ConstantsAndVariables(props, seedWords.keySet(), machineAnswerClasses); for (String label : seedWords.keySet()) { GetPatternsFromDataMultiClass.runLabelSeedWords(Data.sents, humanLabelClasses.get(label), label, seedWords.get(label), constVars, true); } if(writeOutputToFile){ GetPatternsFromDataMultiClass.writeColumnOutput(outputFile, false, humanLabelClasses); System.out.println("written the output to " + outputFile); } logger.info("Finished processing text"); return "SUCCESS"; } public String doRemovePhrases(String line){ return ("not yet implemented"); } public String doRemoveAnnotations(String line) { int tokensNum = changeAnnotation(line, true); return "SUCCESS . Labeled " + tokensNum + " tokens "; } //input is a json string, example:{“name”:[“sent1”:”1,2,4,6”,”sent2”:”11,13,15”], “birthplace”:[“sent1”:”3,5”]} public String doNewAnnotations(String line) { int tokensNum = changeAnnotation(line, false); return "SUCCESS . Labeled " + tokensNum + " tokens "; } private int changeAnnotation(String line, boolean remove){ int tokensNum = 0; JsonReader jsonReader = Json.createReader(new StringReader(line)); JsonObject objarr = jsonReader.readObject(); for(String label: objarr.keySet()) { JsonObject obj4label = objarr.getJsonObject(label); for(String sentid: obj4label.keySet()){ JsonArray tokenArry = obj4label.getJsonArray(sentid); for(JsonValue tokenid: tokenArry){ tokensNum ++; Data.sents.get(sentid).getTokens().get(Integer.valueOf(tokenid.toString())).set(humanLabelClasses.get(label), remove ? backgroundSymbol: label); } } } return tokensNum; } public String currentSummary(){ return "Phrases hand labeled : "+seedWords.toString(); } //line is a jsonstring of map of label to array of strings; ex: {"name":["Bush","Carter","Obama"]} public String doNewPhrases(String line) throws Exception { System.out.println("adding new phrases"); ConstantsAndVariables constVars = new ConstantsAndVariables(props, humanLabelClasses.keySet(), humanLabelClasses); JsonReader jsonReader = Json.createReader(new StringReader(line)); JsonObject objarr = jsonReader.readObject(); for(Map.Entry<String, JsonValue> o: objarr.entrySet()){ String label = o.getKey(); Set<CandidatePhrase> seed = new HashSet<>(); JsonArray arr = objarr.getJsonArray(o.getKey()); for(int i = 0; i < arr.size(); i++){ String seedw = arr.getString(i); System.out.println("adding " + seedw + " to seed "); seed.add(CandidatePhrase.createOrGet(seedw)); } seedWords.get(label).addAll(seed); constVars.addSeedWords(label, seed); GetPatternsFromDataMultiClass.runLabelSeedWords(Data.sents, humanLabelClasses.get(label), label, seed, constVars, false); //model.labelWords(label, labelclass, Data.sents, seed); } return "SUCCESS added new phrases"; } }