package edu.stanford.nlp.patterns;

import java.io.*;
import java.lang.reflect.Constructor;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.sql.SQLException;
import java.text.DecimalFormat;
import java.time.Duration;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import javax.json.*;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RegExFileFilter;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.GoldAnswerAnnotation;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.tokensregex.Env;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.patterns.dep.DataInstanceDep;
import edu.stanford.nlp.patterns.surface.*;
import edu.stanford.nlp.patterns.ConstantsAndVariables.ScorePhraseMeasures;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.sequences.IOBUtils;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.PriorityQueue;
import edu.stanford.nlp.util.TypesafeMap.Key;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * Given text and a seed list, this class gives more words like the seed words
 * by learning surface word or dependency patterns.
 * <p>
 *
 * The class is multi-threaded ({@code numThreads} parameter for the number of
 * threads) and takes as input a text file (or a serialized map of sentences; see
 * {@code fileFormat}) along with one seed word list per label.
 *
 * To use the default options, run
 * <p>
 * {@code java -mx1000m edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass -file text_file -seedWordsFiles label1,seedwordlist1;label2,seedwordlist2;... -outDir output_directory (optional)}
 * <p>
 *
 * {@code fileFormat}: (Optional) Default is text. Valid values are text
 * (or txt) and ser, where the serialized file is of the type {@code Map<String,List<CoreLabel>>}.
 * <p>
 * {@code file}: (Required) Input file(s) (default assumed text). Can be
 * one or more of (concatenated by comma or semi-colon): file, directory, files
 * with regex in the filename (for example: "mydir/health-.*-processed.txt")
 * <p>
 * {@code seedWordsFiles}: (Required)
 * label1,file_seed_words1;label2,file_seed_words2;... where the file_seed_words are
 * files with a list of seed words, one per line
 * <p>
 * {@code outDir}: (Optional) output directory where visualization/output
 * files are stored
 * <p>
 * For other flags, see the individual comments for each flag.
 *
 * <p>
 * To use a properties file, see
 * projects/core/data/edu/stanford/nlp/patterns/surface/example.properties or patterns/example.properties (depending on which codebase you are using)
 * as an example of the flags and their brief descriptions.
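 * <p>
 * As a minimal illustrative sketch, a properties file using only the flags documented
 * above might look as follows (the file names, label names, and values are placeholders,
 * not recommendations):
 * <pre>{@code
 * patternType = SURFACE
 * file = mydir/presidents.txt
 * fileFormat = text
 * seedWordsFiles = NAME,names.txt;PLACE,places.txt
 * outDir = output
 * }</pre>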
 * Run the code as:
 * {@code java -mx1000m -cp classpath edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass -props dir-as-above/example.properties}
 *
 * <p>
 * IMPORTANT: Many flags are described in the classes
 * {@link ConstantsAndVariables}, {@link edu.stanford.nlp.patterns.surface.CreatePatterns}, and
 * {@link PhraseScorer}.
 *
 * @author Sonal Gupta (sonal@cs.stanford.edu)
 */
public class GetPatternsFromDataMultiClass<E extends Pattern> implements Serializable {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(GetPatternsFromDataMultiClass.class);

  private static final long serialVersionUID = 1L;

  //public Map<String, Map<Integer, Set<E>>> patternsForEachToken = null;
  private PatternsForEachToken<E> patsForEachToken = null;

  public Map<String, Set<String>> wordsForOtherClass = null;

  // String channelNameLogger = "patterns";

  /**
   * RlogF is from Riloff 1996, where R's denominator is (pos+neg+unlabeled)
   * <p>
   * RlogFPosNeg is when R's denominator is just the (pos+negative) examples
   * <p>
   * PosNegOdds is the ratio of the number of positive words to the number of
   * negative words
   * <p>
   * PosNegUnlabOdds is the ratio of the number of positive words to the number
   * of (negative + unlabeled) words
   * <p>
   * RatioAll is pos/(neg+pos+unlabeled)
   * <p>
   * YanGarber02 is the modified version presented in
   * "Unsupervised Learning of Generalized Names"
   * <p>
   * LOGREG learns a logistic regression classifier to combine weights to
   * score a phrase (same as PhEvalInPat, except the score of an unlabeled phrase is
   * computed using a logistic regression classifier)
   * <p>
   * LOGREGlogP learns a logistic regression classifier to combine weights
   * to score a phrase (same as PhEvalInPatLogP, except the score of an unlabeled
   * phrase is computed using a logistic regression classifier)
   * <p>
   * SqrtAllRatio is the pattern scoring used in the Gupta et al. JAMIA 2014 paper
   * <p>
   * F1SeedPattern and BPB below are based on the paper
   * "Unsupervised Method for Automatic Construction of a disease dictionary..."
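 * <p>
 * As a concrete sketch of the frequency-based scores above (the actual implementations
 * live in {@link ScorePatternsFreqBased}), RlogF-style scores have the shape
 * {@code R * log(F)}. Illustrative only, with pos/neg/unlab denoting the counts of
 * positive, negative, and unlabeled phrases a pattern extracts:
 * <pre>{@code
 * // RlogF: R's denominator is (pos + neg + unlab); RlogFPosNeg would use (pos + neg).
 * double rlogF(double pos, double neg, double unlab) {
 *   return (pos / (pos + neg + unlab)) * (Math.log(pos) / Math.log(2));
 * }
 * }</pre>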
 * <p>
 * Precision, Recall, and FMeasure (controlled by the fbeta flag) rank the patterns by
 * their precision, recall, and F_beta measure
 */
  public enum PatternScoring {
    F1SeedPattern, RlogF, RlogFPosNeg, RlogFUnlabNeg, RlogFNeg, PhEvalInPat, PhEvalInPatLogP, PosNegOdds,
    YanGarber02, PosNegUnlabOdds, RatioAll, LOGREG, LOGREGlogP, SqrtAllRatio, LinICML03, kNN
  }

  enum WordScoring {
    BPB, WEIGHTEDNORM
  }

  private Map<String, Boolean> writtenPatInJustification = new HashMap<>();

  private Map<String, Counter<E>> learnedPatterns = new HashMap<>();

  //Same as learnedPatterns but with iteration information
  private Map<String, Map<Integer, Counter<E>>> learnedPatternsEachIter = new HashMap<>();

  Map<String, Counter<CandidatePhrase>> matchedSeedWords = new HashMap<>();

  public Map<String, TwoDimensionalCounter<CandidatePhrase, E>> wordsPatExtracted = new HashMap<>();

  Properties props;
  public ScorePhrases scorePhrases;
  public ConstantsAndVariables constVars;
  public CreatePatterns createPats;

  private final DecimalFormat df = new DecimalFormat("#.##");

  private boolean notComputedAllPatternsYet = true;

  /**
   * Constructor for the case when there is only one label.
   */
  public GetPatternsFromDataMultiClass(Properties props, Map<String, DataInstance> sents, Set<CandidatePhrase> seedSet,
      boolean labelUsingSeedSets, String answerLabel) throws IOException, InstantiationException, IllegalAccessException,
      IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException,
      ExecutionException, ClassNotFoundException {
    this(props, sents, seedSet, labelUsingSeedSets, PatternsAnnotations.PatternLabel1.class, answerLabel);
  }

  @SuppressWarnings("rawtypes")
  public GetPatternsFromDataMultiClass(Properties props, Map<String, DataInstance> sents, Set<CandidatePhrase> seedSet,
      boolean labelUsingSeedSets, Class answerClass, String answerLabel) throws IOException, InstantiationException,
      IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException,
      InterruptedException, ExecutionException, ClassNotFoundException {
    this.props = props;
    Map<String, Class<?
extends TypesafeMap.Key<String>>> ansCl = new HashMap<>(); ansCl.put(answerLabel, answerClass); Map<String, Class> generalizeClasses = new HashMap<>(); Map<String, Map<Class, Object>> ignoreClasses = new HashMap<>(); ignoreClasses.put(answerLabel, new HashMap<>()); Map<String, Set<CandidatePhrase>> seedSets = new HashMap<>(); seedSets.put(answerLabel, seedSet); setUpConstructor(sents, seedSets, labelUsingSeedSets, ansCl, generalizeClasses, ignoreClasses); } @SuppressWarnings("rawtypes") public GetPatternsFromDataMultiClass(Properties props, Map<String, DataInstance> sents, Set<CandidatePhrase> seedSet, boolean labelUsingSeedSets, String answerLabel, Map<String, Class> generalizeClasses, Map<Class, Object> ignoreClasses) throws IOException, InstantiationException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException, ExecutionException, ClassNotFoundException { this(props, sents, seedSet, labelUsingSeedSets, PatternsAnnotations.PatternLabel1.class, answerLabel, generalizeClasses, ignoreClasses); } @SuppressWarnings("rawtypes") public GetPatternsFromDataMultiClass(Properties props, Map<String, DataInstance> sents, Set<CandidatePhrase> seedSet, boolean labelUsingSeedSets, Class answerClass, String answerLabel, Map<String, Class> generalizeClasses, Map<Class, Object> ignoreClasses) throws IOException, InstantiationException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException, ExecutionException, ClassNotFoundException { this.props = props; Map<String, Class<? extends TypesafeMap.Key<String>>> ansCl = new HashMap<>(); ansCl.put(answerLabel, answerClass); Map<String, Map<Class, Object>> iC = new HashMap<>(); iC.put(answerLabel, ignoreClasses); Map<String, Set<CandidatePhrase>> seedSets = new HashMap<>(); seedSets.put(answerLabel, seedSet); setUpConstructor(sents, seedSets, labelUsingSeedSets, ansCl, generalizeClasses, iC); } @SuppressWarnings("rawtypes") public GetPatternsFromDataMultiClass(Properties props, Map<String, DataInstance> sents, Map<String, Set<CandidatePhrase>> seedSets, boolean labelUsingSeedSets) throws IOException, InstantiationException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException, ClassNotFoundException, InterruptedException, ExecutionException { this.props = props; Map<String, Class<? extends TypesafeMap.Key<String>>> ansCl = new HashMap<>(); Map<String, Class> gC = new HashMap<>(); Map<String, Map<Class, Object>> iC = new HashMap<>(); int i = 1; for (String label : seedSets.keySet()) { String ansclstr = "edu.stanford.nlp.patterns.PatternsAnnotations$PatternLabel" + i; ansCl.put(label, (Class<? extends Key<String>>) Class.forName(ansclstr)); iC.put(label, new HashMap<>()); i++; } setUpConstructor(sents, seedSets, labelUsingSeedSets, ansCl, gC, iC); } @SuppressWarnings("rawtypes") public GetPatternsFromDataMultiClass(Properties props, Map<String, DataInstance> sents, Map<String, Set<CandidatePhrase>> seedSets, boolean labelUsingSeedSets, Map<String, Class<? 
extends TypesafeMap.Key<String>>> answerClass) throws IOException, InstantiationException, IllegalAccessException,
      IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException,
      ExecutionException, ClassNotFoundException {
    this(props, sents, seedSets, labelUsingSeedSets, answerClass, new HashMap<>(), new HashMap<>());
  }

  /**
   * generalizeClasses maps each generalized-class name to the corresponding annotation
   * class; the values of the annotation classes given in ignoreClasses have to be boolean.
   *
   * @throws IOException
   * @throws SecurityException
   * @throws NoSuchMethodException
   * @throws InvocationTargetException
   * @throws IllegalArgumentException
   * @throws IllegalAccessException
   * @throws InstantiationException
   * @throws ExecutionException
   * @throws InterruptedException
   * @throws ClassNotFoundException
   */
  @SuppressWarnings("rawtypes")
  public GetPatternsFromDataMultiClass(Properties props, Map<String, DataInstance> sents, Map<String, Set<CandidatePhrase>> seedSets,
      boolean labelUsingSeedSets, Map<String, Class<? extends TypesafeMap.Key<String>>> answerClass, Map<String, Class> generalizeClasses,
      Map<String, Map<Class, Object>> ignoreClasses) throws IOException, InstantiationException, IllegalAccessException,
      IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException,
      ExecutionException, ClassNotFoundException {
    this.props = props;
    if (ignoreClasses.isEmpty()) {
      for (String label : seedSets.keySet())
        ignoreClasses.put(label, new HashMap<>());
    }
    setUpConstructor(sents, seedSets, labelUsingSeedSets, answerClass, generalizeClasses, ignoreClasses);
  }

  @SuppressWarnings("rawtypes")
  private void setUpConstructor(Map<String, DataInstance> sents, Map<String, Set<CandidatePhrase>> seedSets, boolean labelUsingSeedSets,
      Map<String, Class<?
extends TypesafeMap.Key<String>>> answerClass, Map<String, Class> generalizeClasses,
      Map<String, Map<Class, Object>> ignoreClasses) throws IOException, InstantiationException, IllegalAccessException,
      IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException,
      ExecutionException, ClassNotFoundException {
    Data.sents = sents;
    ArgumentParser.fillOptions(Data.class, props);
    ArgumentParser.fillOptions(ConstantsAndVariables.class, props);
    PatternFactory.setUp(props, PatternFactory.PatternType.valueOf(props.getProperty(Flags.patternType)), seedSets.keySet());

    constVars = new ConstantsAndVariables(props, seedSets, answerClass, generalizeClasses, ignoreClasses);

    if (constVars.writeMatchedTokensFiles && constVars.batchProcessSents) {
      throw new RuntimeException(
          "writeMatchedTokensFiles and batchProcessSents cannot be true at the same time (not implemented; also doesn't make sense to save a large sentences json file)");
    }

    if (constVars.debug < 1) {
      Redwood.hideChannelsEverywhere(ConstantsAndVariables.minimaldebug);
    }
    if (constVars.debug < 2) {
      Redwood.hideChannelsEverywhere(Redwood.DBG);
    }
    constVars.justify = true;
    if (constVars.debug < 3) {
      constVars.justify = false;
    }
    if (constVars.debug < 4) {
      Redwood.hideChannelsEverywhere(ConstantsAndVariables.extremedebug);
    }

    Redwood.log(Redwood.DBG, "Running with debug output");
    Redwood.log(ConstantsAndVariables.extremedebug, "Running with extreme debug output");

    wordsPatExtracted = new HashMap<>();

    for (String label : answerClass.keySet()) {
      wordsPatExtracted.put(label, new TwoDimensionalCounter<>());
    }

    scorePhrases = new ScorePhrases(props, constVars);
    createPats = new CreatePatterns(props, constVars);
    assert !(constVars.doNotApplyPatterns && (PatternFactory.useStopWordsBeforeTerm || PatternFactory.numWordsCompoundMax > 1)) : " Cannot have both doNotApplyPatterns and (useStopWordsBeforeTerm true or numWordsCompoundMax > 1)!";

    if(constVars.invertedIndexDirectory == null){
      File f = File.createTempFile("inv","index");
      f.deleteOnExit();
      f.mkdir();
      constVars.invertedIndexDirectory = f.getAbsolutePath();
    }

    Set<String> extremelySmallStopWordsList = CollectionUtils.asSet(".", ",", "in", "on", "of", "a", "the", "an");

    // Function specifying how CoreLabels are added to the inverted index
    Function<CoreLabel, Map<String, String>> transformCoreLabelToString = l -> {
      Map<String, String> add = new HashMap<>();
      for (Class gn: constVars.getGeneralizeClasses().values()) {
        Object b = l.get(gn);
        if (b != null && !b.toString().equals(constVars.backgroundSymbol)) {
          add.put(Token.getKeyForClass(gn),b.toString());
        }
      }
      return add;
    };

    boolean createIndex = false;
    if (constVars.loadInvertedIndex)
      constVars.invertedIndex = SentenceIndex.loadIndex(constVars.invertedIndexClass, props, extremelySmallStopWordsList,
          constVars.invertedIndexDirectory, transformCoreLabelToString);
    else {
      constVars.invertedIndex = SentenceIndex.createIndex(constVars.invertedIndexClass, null, props, extremelySmallStopWordsList,
          constVars.invertedIndexDirectory, transformCoreLabelToString);
      createIndex = true;
    }

    int totalNumSents = 0;

    boolean computeDataFreq = false;
    if (Data.rawFreq == null) {
      Data.rawFreq = new ClassicCounter<>();
      computeDataFreq = true;
    }

    ConstantsAndVariables.DataSentsIterator iter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
    while(iter.hasNext()){
      Pair<Map<String, DataInstance>, File> sentsIter = iter.next();
      Map<String, DataInstance> sentsf = sentsIter.first();

      if(constVars.batchProcessSents) {
        for (Entry<String,
DataInstance> en : sentsf.entrySet()) { Data.sentId2File.put(en.getKey(), sentsIter.second()); } } totalNumSents += sentsf.size(); if(computeDataFreq){ Data.computeRawFreqIfNull(sentsf, PatternFactory.numWordsCompoundMax); } Redwood.log(Redwood.DBG, "Initializing sents size " + sentsf.size() + " sentences, either by labeling with the seed set or just setting the right classes"); for (String l : constVars.getAnswerClass().keySet()) { Redwood.log(Redwood.DBG, "labelUsingSeedSets is " + labelUsingSeedSets + " and seed set size for " + l + " is " + (seedSets == null?"null":seedSets.get(l).size())); Set<CandidatePhrase> seed = seedSets == null || !labelUsingSeedSets ? new HashSet<>() : (seedSets.containsKey(l) ? seedSets.get(l) : new HashSet<>()); if(!matchedSeedWords.containsKey(l)){ matchedSeedWords.put(l, new ClassicCounter<>()); } Counter<CandidatePhrase> matched = runLabelSeedWords(sentsf, constVars.getAnswerClass().get(l), l, seed, constVars, labelUsingSeedSets); System.out.println("matched phrases for " + l + " is " + matched); matchedSeedWords.get(l).addAll(matched); if (constVars.addIndvWordsFromPhrasesExceptLastAsNeg) { Redwood.log(ConstantsAndVariables.minimaldebug, "adding indv words from phrases except last as neg"); Set<CandidatePhrase> otherseed = new HashSet<>(); if(labelUsingSeedSets){ for (CandidatePhrase s : seed) { String[] t = s.getPhrase().split("\\s+"); for (int i = 0; i < t.length - 1; i++) { if (!seed.contains(t[i])) { otherseed.add(CandidatePhrase.createOrGet(t[i])); } } } } runLabelSeedWords(sentsf, PatternsAnnotations.OtherSemanticLabel.class, "OTHERSEM", otherseed, constVars, labelUsingSeedSets); } } if (labelUsingSeedSets && constVars.getOtherSemanticClassesWords() != null) { String l = "OTHERSEM"; if(!matchedSeedWords.containsKey(l)){ matchedSeedWords.put(l, new ClassicCounter<>()); } matchedSeedWords.get(l).addAll(runLabelSeedWords(sentsf, PatternsAnnotations.OtherSemanticLabel.class, l, constVars.getOtherSemanticClassesWords(), constVars, labelUsingSeedSets)); } if(constVars.removeOverLappingLabelsFromSeed){ removeOverLappingLabels(sentsf); } if(createIndex) constVars.invertedIndex.add(sentsf, true); if(sentsIter.second().exists()){ Redwood.log(Redwood.DBG, "Saving the labeled seed sents (if given the option) to the same file " + sentsIter.second()); IOUtils.writeObjectToFile(sentsf, sentsIter.second()); } } Redwood.log(Redwood.DBG, "Done loading/creating inverted index of tokens and labeling data with total of " + constVars.invertedIndex.size() + " sentences"); //If the scorer class is LearnFeatWt then individual word class is added as a feature if (scorePhrases.phraseScorerClass.equals(ScorePhrasesAverageFeatures.class) && (constVars.usePatternEvalWordClass || constVars.usePhraseEvalWordClass)) { if (constVars.externalFeatureWeightsDir == null) { File f = File.createTempFile("tempfeat", ".txt"); f.delete(); f.deleteOnExit(); constVars.externalFeatureWeightsDir = f.getAbsolutePath(); } IOUtils.ensureDir(new File(constVars.externalFeatureWeightsDir)); for (String label : seedSets.keySet()) { String externalFeatureWeightsFileLabel = constVars.externalFeatureWeightsDir + "/" + label; File f = new File(externalFeatureWeightsFileLabel); if (!f.exists()) { Redwood.log(Redwood.DBG, "externalweightsfile for the label " + label + " does not exist: learning weights!"); LearnImportantFeatures lmf = new LearnImportantFeatures(); ArgumentParser.fillOptions(lmf, props); lmf.answerClass = answerClass.get(label); lmf.answerLabel = label; lmf.setUp(); lmf.getTopFeatures(new 
ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents), constVars.perSelectRand, constVars.perSelectNeg,
              externalFeatureWeightsFileLabel);
        }
        Counter<Integer> distSimWeightsLabel = new ClassicCounter<>();
        for (String line : IOUtils.readLines(externalFeatureWeightsFileLabel)) {
          String[] t = line.split(":");
          if (!t[0].startsWith("Cluster"))
            continue;
          String s = t[0].replace("Cluster-", "");
          Integer clusterNum = Integer.parseInt(s);
          distSimWeightsLabel.setCount(clusterNum, Double.parseDouble(t[1]));
        }
        constVars.distSimWeights.put(label, distSimWeightsLabel);
      }
    }

    // computing semantic odds values
    if (constVars.usePatternEvalSemanticOdds || constVars.usePhraseEvalSemanticOdds) {
      Counter<CandidatePhrase> dictOddsWeightsLabel = new ClassicCounter<>();
      Counter<CandidatePhrase> otherSemanticClassFreq = new ClassicCounter<>();
      for (CandidatePhrase s : constVars.getOtherSemanticClassesWords()) {
        for (String s1 : StringUtils.getNgrams(Arrays.asList(s.getPhrase().split("\\s+")), 1, PatternFactory.numWordsCompoundMax))
          otherSemanticClassFreq.incrementCount(CandidatePhrase.createOrGet(s1));
      }
      otherSemanticClassFreq = Counters.add(otherSemanticClassFreq, 1.0);
      // otherSemanticClassFreq.setDefaultReturnValue(1.0);

      Map<String, Counter<CandidatePhrase>> labelDictNgram = new HashMap<>();
      for (String label : seedSets.keySet()) {
        Counter<CandidatePhrase> classFreq = new ClassicCounter<>();
        for (CandidatePhrase s : seedSets.get(label)) {
          for (String s1 : StringUtils.getNgrams(Arrays.asList(s.getPhrase().split("\\s+")), 1, PatternFactory.numWordsCompoundMax))
            classFreq.incrementCount(CandidatePhrase.createOrGet(s1));
        }
        classFreq = Counters.add(classFreq, 1.0);
        labelDictNgram.put(label, classFreq);
        // classFreq.setDefaultReturnValue(1.0);
      }

      for (String label : seedSets.keySet()) {
        Counter<CandidatePhrase> otherLabelFreq = new ClassicCounter<>();
        for (String label2 : seedSets.keySet()) {
          if (label.equals(label2))
            continue;
          otherLabelFreq.addAll(labelDictNgram.get(label2));
        }
        otherLabelFreq.addAll(otherSemanticClassFreq);
        dictOddsWeightsLabel = Counters.divisionNonNaN(labelDictNgram.get(label), otherLabelFreq);
        constVars.dictOddsWeights.put(label, dictOddsWeightsLabel);
      }
    }

    //Redwood.log(Redwood.DBG, "All options are:" + "\n" + Maps.toString(getAllOptions(), "","","\t","\n"));
  }

  public PatternsForEachToken getPatsForEachToken() {
    return patsForEachToken;
  }

  /**
   * If a token is labeled with two or more labels, keep only the one with the longest matching phrase. For example, for "lung" with the BODYPART label and "lung cancer" with the DISEASE label,
   * keep only the DISEASE label for "lung". For this to work, you need to have {@code PatternsAnnotations.LongestMatchedPhraseForEachLabel} set, which is already done in the runLabelSeedWords function.
   */
  private void removeOverLappingLabels(Map<String, DataInstance> sents){
    for(Map.Entry<String, DataInstance> sentEn: sents.entrySet()){
      for(CoreLabel l : sentEn.getValue().getTokens()){
        Map<String, CandidatePhrase> longestMatchingMap = l.get(PatternsAnnotations.LongestMatchedPhraseForEachLabel.class);
        String longestMatchingString = "";
        String longestMatchingLabel = null;
        for(Map.Entry<String, CandidatePhrase> en: longestMatchingMap.entrySet()){
          if(en.getValue().getPhrase().length() > longestMatchingString.length()){
            longestMatchingLabel = en.getKey();
            longestMatchingString = en.getValue().getPhrase();
          }
        }

        if(longestMatchingLabel != null){
          if(!"OTHERSEM".equals(longestMatchingLabel))
            l.set(PatternsAnnotations.OtherSemanticLabel.class, constVars.backgroundSymbol);

          for(Entry<String, Class<?
extends Key<String>>> en: constVars.getAnswerClass().entrySet()) {
            if (!en.getKey().equals(longestMatchingLabel)){
              l.set(en.getValue(), constVars.backgroundSymbol);
            }
            else
              l.set(en.getValue(), en.getKey());
          }
        }
      }
    }
  }

  public static Map<String, DataInstance> runPOSNERParseOnTokens(Map<String, DataInstance> sents, Properties propsoriginal){
    PatternFactory.PatternType type = PatternFactory.PatternType.valueOf(propsoriginal.getProperty(Flags.patternType));
    Properties props = new Properties();
    List<String> anns = new ArrayList<>();
    anns.add("pos");
    anns.add("lemma");
    boolean useTargetParserParentRestriction = Boolean.parseBoolean(propsoriginal.getProperty(Flags.useTargetParserParentRestriction));
    boolean useTargetNERRestriction = Boolean.parseBoolean(propsoriginal.getProperty(Flags.useTargetNERRestriction));
    // Read the POS model path from the original properties; the freshly created props object
    // is still empty at this point, so reading from it would always return null.
    String posModelPath = propsoriginal.getProperty(Flags.posModelPath);
    String numThreads = propsoriginal.getProperty(Flags.numThreads);
    if (useTargetParserParentRestriction){
      anns.add("parse");
    } else if(type.equals(PatternFactory.PatternType.DEP))
      anns.add("depparse");

    if (useTargetNERRestriction) {
      anns.add("ner");
    }

    props.setProperty("annotators", StringUtils.join(anns, ","));
    props.setProperty("parse.maxlen", "80");
    props.setProperty("nthreads", numThreads);
    props.setProperty("threads", numThreads);

    // props.put( "tokenize.options",
    // "ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");

    if (posModelPath != null) {
      props.setProperty("pos.model", posModelPath);
    }
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props, false);
    Redwood.log(Redwood.DBG, "Annotating text");

    for(Map.Entry<String, DataInstance> en: sents.entrySet()) {
      List<CoreMap> temp = new ArrayList<>();
      CoreMap s= new ArrayCoreMap();
      s.set(CoreAnnotations.TokensAnnotation.class, en.getValue().getTokens());
      temp.add(s);
      Annotation doc = new Annotation(temp);
      try {
        pipeline.annotate(doc);
        if (useTargetParserParentRestriction)
          inferParentParseTag(s.get(TreeAnnotation.class));
      } catch (Exception e) {
        log.warn("Ignoring error: for sentence " + StringUtils.joinWords(en.getValue().getTokens(), " "));
        log.warn(e);
      }
    }

    Redwood.log(Redwood.DBG, "Done annotating text");
    return sents;
  }

  public static Map<String, DataInstance> runPOSNEROnTokens(List<CoreMap> sentsCM, String posModelPath, boolean useTargetNERRestriction,
      String prefix, boolean useTargetParserParentRestriction, String numThreads, PatternFactory.PatternType type) {
    Annotation doc = new Annotation(sentsCM);

    Properties props = new Properties();
    List<String> anns = new ArrayList<>();
    anns.add("pos");
    anns.add("lemma");

    if (useTargetParserParentRestriction){
      anns.add("parse");
    } else if(type.equals(PatternFactory.PatternType.DEP))
      anns.add("depparse");

    if (useTargetNERRestriction) {
      anns.add("ner");
    }

    props.setProperty("annotators", StringUtils.join(anns, ","));
    props.setProperty("parse.maxlen", "80");
    props.setProperty("nthreads", numThreads);
    props.setProperty("threads", numThreads);

    // props.put( "tokenize.options",
    // "ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");

    if (posModelPath != null) {
      props.setProperty("pos.model", posModelPath);
    }

    StanfordCoreNLP pipeline = new StanfordCoreNLP(props, false);
    Redwood.log(Redwood.DBG, "Annotating text");
    pipeline.annotate(doc);
    Redwood.log(Redwood.DBG, "Done annotating text");

    Map<String, DataInstance> sents = new HashMap<>();

    for (CoreMap s : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      if (useTargetParserParentRestriction)
inferParentParseTag(s.get(TreeAnnotation.class)); DataInstance d = DataInstance.getNewInstance(type, s); sents.put(prefix + s.get(CoreAnnotations.DocIDAnnotation.class), d); } return sents; } static StanfordCoreNLP pipeline = null; public static int tokenize(Iterator<String> textReader, String posModelPath, boolean lowercase, boolean useTargetNERRestriction, String sentIDPrefix, boolean useTargetParserParentRestriction, String numThreads, boolean batchProcessSents, int numMaxSentencesPerBatchFile, File saveSentencesSerDirFile, Map<String, DataInstance> sents, int numFilesTillNow, PatternFactory.PatternType type) throws InterruptedException, ExecutionException, IOException { if (pipeline == null) { Properties props = new Properties(); List<String> anns = new ArrayList<>(); anns.add("tokenize"); anns.add("ssplit"); anns.add("pos"); anns.add("lemma"); if (useTargetParserParentRestriction){ anns.add("parse"); } if(type.equals(PatternFactory.PatternType.DEP)) anns.add("depparse"); if (useTargetNERRestriction) { anns.add("ner"); } props.setProperty("annotators", StringUtils.join(anns, ",")); props.setProperty("parse.maxlen", "80"); if(numThreads != null) props.setProperty("threads", numThreads); props.setProperty("tokenize.options", "ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false"); if (posModelPath != null) { props.setProperty("pos.model", posModelPath); } pipeline = new StanfordCoreNLP(props); } String text = ""; int numLines = 0; while(textReader.hasNext()) { String line = textReader.next(); numLines ++; if (batchProcessSents && numLines > numMaxSentencesPerBatchFile) { break; } if (lowercase) line = line.toLowerCase(); text += line+"\n"; } Annotation doc = new Annotation(text); pipeline.annotate(doc); int i = -1; for (CoreMap s : doc.get(CoreAnnotations.SentencesAnnotation.class)) { i++; if (useTargetParserParentRestriction) inferParentParseTag(s.get(TreeAnnotation.class)); DataInstance d = DataInstance.getNewInstance(type, s); sents.put(sentIDPrefix + i, d); // if (batchProcessSents && sents.size() >= numMaxSentencesPerBatchFile) { // numFilesTillNow++; // File file = new File(saveSentencesSerDirFile + "/sents_" + numFilesTillNow); // IOUtils.writeObjectToFile(sents, file); // sents = new HashMap<String, DataInstance>(); // Data.sentsFiles.add(file); // } } Redwood.log(Redwood.DBG, "Done annotating text with " + i + " sentences"); if (sents.size() > 0 && batchProcessSents) { numFilesTillNow++; File file = new File(saveSentencesSerDirFile + "/sents_" + numFilesTillNow); IOUtils.writeObjectToFile(sents, file); Data.sentsFiles.add(file); for(String sentid: sents.keySet()) { assert !Data.sentId2File.containsKey(sentid) : "Data.sentId2File already contains " + sentid + ". 
Make sure sentIds are unique!";
        Data.sentId2File.put(sentid, file);
      }
      sents.clear();
    }
    // not lugging around sents if batch processing
    if (batchProcessSents)
      sents = null;
    return numFilesTillNow;
  }

  /*
  public static int tokenize(String text, String posModelPath, boolean lowercase, boolean useTargetNERRestriction, String sentIDPrefix,
      boolean useTargetParserParentRestriction, String numThreads, boolean batchProcessSents, int numMaxSentencesPerBatchFile,
      File saveSentencesSerDirFile, Map<String, DataInstance> sents, int numFilesTillNow) throws InterruptedException, ExecutionException,
      IOException {
    if (pipeline == null) {
      Properties props = new Properties();
      List<String> anns = new ArrayList<String>();
      anns.add("tokenize");
      anns.add("ssplit");
      anns.add("pos");
      anns.add("lemma");

      if (useTargetParserParentRestriction) {
        anns.add("parse");
      }
      if (useTargetNERRestriction) {
        anns.add("ner");
      }

      props.setProperty("annotators", StringUtils.join(anns, ","));
      props.setProperty("parse.maxlen", "80");
      props.setProperty("threads", numThreads);

      props.put("tokenize.options", "ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");

      if (posModelPath != null) {
        props.setProperty("pos.model", posModelPath);
      }
      pipeline = new StanfordCoreNLP(props);
    }
    if (lowercase)
      text = text.toLowerCase();

    Annotation doc = new Annotation(text);
    pipeline.annotate(doc);
    Redwood.log(Redwood.DBG, "Done annotating text");

    int i = -1;
    for (CoreMap s : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      i++;
      if (useTargetParserParentRestriction)
        inferParentParseTag(s.get(TreeAnnotation.class));
      sents.put(sentIDPrefix + i, s.get(CoreAnnotations.TokensAnnotation.class));

      if (batchProcessSents && sents.size() >= numMaxSentencesPerBatchFile) {
        numFilesTillNow++;
        File file = new File(saveSentencesSerDirFile + "/sents_" + numFilesTillNow);
        IOUtils.writeObjectToFile(sents, file);
        sents = new HashMap<String, DataInstance>();
        Data.sentsFiles.add(file);
      }
    }
    if (sents.size() > 0 && batchProcessSents) {
      numFilesTillNow++;
      File file = new File(saveSentencesSerDirFile + "/sents_" + numFilesTillNow);
      IOUtils.writeObjectToFile(sents, file);
      Data.sentsFiles.add(file);
      sents.clear();
    }
    // not lugging around sents if batch processing
    if (batchProcessSents)
      sents = null;
    return numFilesTillNow;
  }
  */

  private static void inferParentParseTag(Tree tree) {
    String grandstr = tree.value();
    for (Tree child : tree.children()) {
      for (Tree grand : child.children()) {
        if (grand.isLeaf()) {
          ((CoreLabel) grand.label()).set(CoreAnnotations.GrandparentAnnotation.class, grandstr);
        }
      }
      inferParentParseTag(child);
    }
  }

  /**
   * If l1 is a part of l2, finds the starting indices of the (possibly multiple) occurrences
   * of l1 in l2. If l1 is not a sub-array of l2, the returned list is empty (null if l1 is
   * longer than l2). Note that the elements of l1 must appear in l2 exactly and in the same
   * order; subl2 holds the lemmas of l2 and is consulted when fuzzy matching is enabled.
   *
   * @param l1 array you want to find in l2
   * @param l2 array to search in
   * @return starting indices of the occurrences of the sublist
   */
  public static List<Integer> getSubListIndex(String[] l1, String[] l2, String[] subl2, Set<String> doNotLabelTheseWords,
      HashSet<String> seenFuzzyMatches, int minLen4Fuzzy, boolean fuzzyMatch, boolean ignoreCaseSeedMatch) {
    if (l1.length > l2.length)
      return null;
    EditDistance editDistance = new EditDistance(true);
    List<Integer> allIndices = new ArrayList<>();
    boolean matched = false;
    int index = -1;
    int lastUnmatchedIndex = 0;
    for (int i = 0; i < l2.length;) {
      for (int j = 0; j < l1.length;) {
        boolean d1 = false, d2 = false;
        boolean compareFuzzy = true;
        if (!fuzzyMatch || doNotLabelTheseWords.contains(l2[i]) || doNotLabelTheseWords.contains(subl2[i]) ||
l2[i].length() <= minLen4Fuzzy || subl2[i].length() <= minLen4Fuzzy) compareFuzzy = false; if (compareFuzzy == false || l1[j].length() <= minLen4Fuzzy) { d1 = (ignoreCaseSeedMatch && l1[j].equalsIgnoreCase(l2[i])) || l1[j].equals(l2[i]); if (!d1 && fuzzyMatch) d2 = (ignoreCaseSeedMatch && subl2[i].equalsIgnoreCase(l1[j])) || subl2[i].equals(l1[j]); } else { String combo = l1[j] + "#" + l2[i]; if ((ignoreCaseSeedMatch && l1[j].equalsIgnoreCase(l2[i])) || l1[j].equals(l2[i]) || seenFuzzyMatches.contains(combo)) d1 = true; else { d1 = editDistance.score(l1[j], l2[i]) <= 1; if (!d1) { String combo2 = l1[j] + "#" + subl2[i]; if ((ignoreCaseSeedMatch && l1[j].equalsIgnoreCase(subl2[i]) )||l1[j].equals(subl2[i]) || seenFuzzyMatches.contains(combo2)) d2 = true; else { d2 = editDistance.score(l1[j], subl2[i]) <= 1; if (d2) { // System.out.println(l1[j] + " matched with " + subl2[i]); seenFuzzyMatches.add(combo2); } } } else if (d1) { // System.out.println(l1[j] + " matched with " + l2[i]); seenFuzzyMatches.add(combo); } } } // if (l1[j].equals(l2[i]) || subl2[i].equals(l1[j])) { if (d1 || d2) { index = i; i++; j++; if (j == l1.length) { matched = true; break; } } else { j = 0; i = lastUnmatchedIndex + 1; lastUnmatchedIndex = i; index = -1; if (lastUnmatchedIndex == l2.length) break; } if (i >= l2.length) { index = -1; break; } } if (i == l2.length || matched) { if (index >= 0) // index = index - l1.length + 1; allIndices.add(index - l1.length + 1); matched = false; lastUnmatchedIndex = index; // break; } } // get starting point return allIndices; } //if matchcontextlowercase is on, transform that. escape the word etc. Useful for pattern matching later on private static Function<CoreLabel, String> stringTransformationFunction = new Function<CoreLabel, String>() { @Override public String apply(CoreLabel l) { String s; if(PatternFactory.useLemmaContextTokens){ s = l.lemma(); assert s!=null : "Lemma is null and useLemmaContextTokens is true"; } else s= l.word(); if(ConstantsAndVariables.matchLowerCaseContext) s = s.toLowerCase(); assert s!= null; return s; } }; public static<E> List<List<E>> getThreadBatches(List<E> keyset, int numThreads){ int num; if (numThreads == 1) num = keyset.size(); else num = keyset.size() / (numThreads - 1); Redwood.log(ConstantsAndVariables.extremedebug, "keyset size is " + keyset.size()); List<List<E>> threadedSentIds = new ArrayList<>(); for (int i = 0; i < numThreads; i++) { List<E> keys = keyset.subList(i * num, Math.min(keyset.size(), (i + 1) * num)); threadedSentIds.add(keys); Redwood.log(ConstantsAndVariables.extremedebug, "assigning from " + i * num + " till " + Math.min(keyset.size(), (i + 1) * num)); } return threadedSentIds; } /** Warning: sets labels of words that are not in the given seed set as O!!! 
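 * <p>
 * (More precisely: when overwriteExistingLabels is true, any token not covered by a
 * seed-phrase match has its label for this class reset to the background symbol, "O" by
 * default; see the end of {@link LabelWithSeedWords#call()}.)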
* */ public static Counter<CandidatePhrase> runLabelSeedWords(Map<String, DataInstance> sents, Class answerclass, String label, Collection<CandidatePhrase> seedWords, ConstantsAndVariables constVars, boolean overwriteExistingLabels) throws InterruptedException, ExecutionException, IOException { Redwood.log(Redwood.DBG,"ignoreCaseSeedMatch is " + constVars.ignoreCaseSeedMatch); List<List<String>> threadedSentIds = getThreadBatches(new ArrayList<>(sents.keySet()), constVars.numThreads); ExecutorService executor = Executors.newFixedThreadPool(constVars.numThreads); List<Future<Pair<Map<String, DataInstance>, Counter<CandidatePhrase>>>> list = new ArrayList<>(); Counter<CandidatePhrase> matchedPhrasesCounter = new ClassicCounter<>(); for (List<String> keys: threadedSentIds) { Callable<Pair<Map<String, DataInstance>, Counter<CandidatePhrase>>> task = new LabelWithSeedWords(seedWords, sents, keys, answerclass, label, constVars.fuzzyMatch, constVars.minLen4FuzzyForPattern, constVars.backgroundSymbol, constVars.getEnglishWords(), stringTransformationFunction, constVars.writeMatchedTokensIdsForEachPhrase, overwriteExistingLabels, constVars.patternType, constVars.ignoreCaseSeedMatch); Pair<Map<String, DataInstance>, Counter<CandidatePhrase>> sentsi = executor.submit(task).get(); sents.putAll(sentsi.first()); matchedPhrasesCounter.addAll(sentsi.second()); } executor.shutdown(); Redwood.log("extremedebug","Matched phrases freq is " + matchedPhrasesCounter); return matchedPhrasesCounter; } public static void getFeatures(SemanticGraph graph, IndexedWord vertex, boolean isHead, Collection<String> features, GrammaticalRelation reln){ if(isHead){ List<Pair<GrammaticalRelation, IndexedWord>> pt = graph.parentPairs(vertex); for(Pair<GrammaticalRelation, IndexedWord> en: pt) { features.add("PARENTREL-" + en.first()); } } else{ //find the relation to the parent if(reln == null){ List<SemanticGraphEdge> parents = graph.getOutEdgesSorted(vertex); if(parents.size() > 0) reln = parents.get(0).getRelation(); } if(reln != null) features.add("REL-" + reln.getShortName()); } //System.out.println("For graph " + graph.toFormattedString() + " and vertex " + vertex + " the features are " + features); } /** * Warning: sets labels of words that are not in the given seed set as O!!! 
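 * <p>
 * Each callable handles one batch of sentence ids and returns a {@code Pair} of the
 * relabeled sentences and a {@code Counter} recording how often each seed phrase matched.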
*/ @SuppressWarnings("rawtypes") public static class LabelWithSeedWords implements Callable<Pair<Map<String, DataInstance>, Counter<CandidatePhrase>>> { Map<CandidatePhrase, String[]> seedwordsTokens = new HashMap<>(); Map<String, DataInstance> sents; List<String> keyset; Class labelClass; HashSet<String> seenFuzzyMatches = new HashSet<>(); String label; int minLen4FuzzyForPattern; String backgroundSymbol = "O"; Set<String> doNotLabelDictWords = null; Function<CoreLabel, String> stringTransformation; boolean writeMatchedTokensIdsForEachPhrase = false; boolean overwriteExistingLabels; PatternFactory.PatternType patternType; boolean fuzzyMatch = false; Map<String, String> ignoreCaseSeedMatch; public LabelWithSeedWords(Collection<CandidatePhrase> seedwords, Map<String, DataInstance> sents, List<String> keyset, Class labelclass, String label, boolean fuzzyMatch, int minLen4FuzzyForPattern, String backgroundSymbol, Set<String> doNotLabelDictWords, Function<CoreLabel, String> stringTransformation, boolean writeMatchedTokensIdsForEachPhrase, boolean overwriteExistingLabels, PatternFactory.PatternType type, Map<String, String> ignoreCaseSeedMatch) { for (CandidatePhrase s : seedwords) this.seedwordsTokens.put(s, s.getPhrase().split("\\s+")); this.sents = sents; this.keyset = keyset; this.labelClass = labelclass; this.label = label; this.minLen4FuzzyForPattern= minLen4FuzzyForPattern; this.backgroundSymbol = backgroundSymbol; this.doNotLabelDictWords = doNotLabelDictWords; this.stringTransformation = stringTransformation; this.writeMatchedTokensIdsForEachPhrase = writeMatchedTokensIdsForEachPhrase; this.overwriteExistingLabels = overwriteExistingLabels; this.patternType = type; this.fuzzyMatch = fuzzyMatch; this.ignoreCaseSeedMatch = ignoreCaseSeedMatch; } @SuppressWarnings("unchecked") @Override public Pair<Map<String, DataInstance>,Counter<CandidatePhrase>> call() { Map<String, DataInstance> newsent = new HashMap<>(); Counter<CandidatePhrase> matchedPhrasesCounter = new ClassicCounter<>(); for (String k : keyset) { DataInstance sent = sents.get(k); List<CoreLabel> tokensCore = sent.getTokens(); SemanticGraph graph = null; if(patternType.equals(PatternFactory.PatternType.DEP)){ graph = ((DataInstanceDep)sent).getGraph(); } String[] tokens = new String[tokensCore.size()]; String[] tokenslemma = new String[tokensCore.size()]; int num = 0; for (CoreLabel l : tokensCore) { //Setting the processedTextAnnotation, used in indexing and pattern matching l.set(PatternsAnnotations.ProcessedTextAnnotation.class, stringTransformation.apply(l)); tokens[num] = l.word(); if(fuzzyMatch && l.lemma() == null) throw new RuntimeException("how come lemma is null"); tokenslemma[num] = l.lemma(); num++; } boolean[] labels = new boolean[tokens.length]; CollectionValuedMap<Integer, CandidatePhrase> matchedPhrases = new CollectionValuedMap<>(); Map<Integer, CandidatePhrase> longestMatchedPhrases = new HashMap<>(); for (Entry<CandidatePhrase, String[]> sEn : seedwordsTokens.entrySet()) { String[] s = sEn.getValue(); CandidatePhrase sc = sEn.getKey(); List<Integer> indices = getSubListIndex(s, tokens, tokenslemma, doNotLabelDictWords, seenFuzzyMatches, minLen4FuzzyForPattern, fuzzyMatch, (ignoreCaseSeedMatch.containsKey(label) ? 
Boolean.valueOf(ignoreCaseSeedMatch.get(label)) : false)); if (indices != null && !indices.isEmpty()){ String ph = StringUtils.join(s, " "); sc.addFeature("LENGTH-" + s.length, 1.0); Collection<String> features = new ArrayList<>(); for (int index : indices){ if(graph != null){ GetPatternsFromDataMultiClass.getFeatures(graph, graph.getNodeByIndex(index + 1), true, features, null); } if(writeMatchedTokensIdsForEachPhrase) { addToMatchedTokensByPhrase(ph, k, index, s.length); } for (int i = 0; i < s.length; i++) { matchedPhrases.add(index + i, sc); if(graph != null){ try{ GetPatternsFromDataMultiClass.getFeatures(graph, graph.getNodeByIndex(index+ i + 1), false, features, null); } catch(Exception e) { log.warn(e); } } CandidatePhrase longPh = longestMatchedPhrases.get(index+i); longPh = longPh != null && longPh.getPhrase().length() > sc.getPhrase().length() ? longPh: sc; longestMatchedPhrases.put(index+i, longPh); labels[index + i] = true; } } sc.addFeatures(features); } } int i = -1; for (CoreLabel l : sent.getTokens()) { i++; //The second clause is for old sents ser files compatibility reason if (!l.containsKey(PatternsAnnotations.MatchedPhrases.class) || !(PatternsAnnotations.MatchedPhrases.class.isInstance(l.get(PatternsAnnotations.MatchedPhrases.class)))) l.set(PatternsAnnotations.MatchedPhrases.class, new CollectionValuedMap<>()); if(!l.containsKey(PatternsAnnotations.LongestMatchedPhraseForEachLabel.class)) l.set(PatternsAnnotations.LongestMatchedPhraseForEachLabel.class, new HashMap<>()); if (labels[i]) { l.set(labelClass, label); //set whether labeled by the seeds or not if(!l.containsKey(PatternsAnnotations.SeedLabeledOrNot.class)) l.set(PatternsAnnotations.SeedLabeledOrNot.class, new HashMap<>()); l.get(PatternsAnnotations.SeedLabeledOrNot.class).put(labelClass, true); CandidatePhrase longestMatchingPh = l.get(PatternsAnnotations.LongestMatchedPhraseForEachLabel.class).get(label); assert longestMatchedPhrases.containsKey(i); longestMatchingPh = (longestMatchingPh != null && (longestMatchingPh.getPhrase().length() > longestMatchedPhrases.get(i).getPhrase().length())) ? 
longestMatchingPh : longestMatchedPhrases.get(i); l.get(PatternsAnnotations.LongestMatchedPhraseForEachLabel.class).put(label, longestMatchingPh); matchedPhrasesCounter.incrementCount(longestMatchingPh, 1.0); l.get(PatternsAnnotations.MatchedPhrases.class).addAll(label, matchedPhrases.get(i)); Redwood.log(ConstantsAndVariables.extremedebug, "labeling " + l.word() + " or its lemma " + l.lemma() + " as " + label + " because of the dict phrases " + matchedPhrases.get(i)); } else if(overwriteExistingLabels) l.set(labelClass, backgroundSymbol); } newsent.put(k, sent); } return new Pair(newsent, matchedPhrasesCounter); } } private static void addToMatchedTokensByPhrase(String ph, String sentid, int index, int length){ if(!Data.matchedTokensForEachPhrase.containsKey(ph)) Data.matchedTokensForEachPhrase.put(ph, new HashMap<>()); Map<String, List<Integer>> matcheds = Data.matchedTokensForEachPhrase.get(ph); if(!matcheds.containsKey(sentid)) matcheds.put(sentid, new ArrayList<>()); for (int i = 0; i < length; i++) matcheds.get(sentid).add(index + i); } public Map<String, TwoDimensionalCounter<E, CandidatePhrase>> patternsandWords = null; //public Map<String, TwoDimensionalCounter<E, String>> allPatternsandWords = null; public Map<String, Counter<E>> currentPatternWeights = null; //deleteExistingIndex is def false for the second call to this function public void processSents(Map<String, DataInstance> sents, Boolean deleteExistingIndex) throws IOException, ClassNotFoundException { if (constVars.computeAllPatterns) { props.setProperty("createTable", deleteExistingIndex.toString()); props.setProperty("deleteExisting", deleteExistingIndex.toString()); props.setProperty("createPatLuceneIndex", deleteExistingIndex.toString()); Redwood.log(Redwood.DBG, "Computing all patterns"); createPats.getAllPatterns(sents, props, constVars.storePatsForEachToken); } else Redwood.log(Redwood.DBG, "Reading patterns from existing dir"); props.setProperty("createTable", "false"); props.setProperty("deleteExisting","false"); props.setProperty("createPatLuceneIndex","false"); } private void readSavedPatternsAndIndex() throws IOException, ClassNotFoundException { if(!constVars.computeAllPatterns) { assert constVars.allPatternsDir != null : "allPatternsDir flag cannot be empty if computeAllPatterns is false!"; //constVars.setPatternIndex(PatternIndex.load(constVars.allPatternsDir, constVars.storePatsIndex)); if(constVars.storePatsForEachToken.equals(ConstantsAndVariables.PatternForEachTokenWay.MEMORY)) patsForEachToken.load(constVars.allPatternsDir); } } @SuppressWarnings({ "unchecked" }) public Counter<E> getPatterns(String label, Set<E> alreadyIdentifiedPatterns, E p0, Counter<CandidatePhrase> p0Set, Set<E> ignorePatterns) throws IOException, ClassNotFoundException { TwoDimensionalCounter<E, CandidatePhrase> patternsandWords4Label = new TwoDimensionalCounter<>(); TwoDimensionalCounter<E, CandidatePhrase> negPatternsandWords4Label = new TwoDimensionalCounter<>(); //TwoDimensionalCounter<E, String> posnegPatternsandWords4Label = new TwoDimensionalCounter<E, String>(); TwoDimensionalCounter<E, CandidatePhrase> unLabeledPatternsandWords4Label = new TwoDimensionalCounter<>(); //TwoDimensionalCounter<E, String> negandUnLabeledPatternsandWords4Label = new TwoDimensionalCounter<E, String>(); //TwoDimensionalCounter<E, String> allPatternsandWords4Label = new TwoDimensionalCounter<E, String>(); Set<String> allCandidatePhrases = new HashSet<>(); ConstantsAndVariables.DataSentsIterator sentsIter = new 
ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents); boolean firstCallToProcessSents = true; while(sentsIter.hasNext()){ Pair<Map<String, DataInstance>, File> sentsPair = sentsIter.next(); if(notComputedAllPatternsYet){ //in the first iteration processSents(sentsPair.first(), firstCallToProcessSents); firstCallToProcessSents = false; if(patsForEachToken == null){ //in the first iteration, for the first file patsForEachToken = PatternsForEachToken.getPatternsInstance(props, constVars.storePatsForEachToken); readSavedPatternsAndIndex(); } } this.calculateSufficientStats(sentsPair.first(), patsForEachToken, label, patternsandWords4Label, negPatternsandWords4Label, unLabeledPatternsandWords4Label, allCandidatePhrases); } notComputedAllPatternsYet = false; if (constVars.computeAllPatterns){ if(constVars.storePatsForEachToken.equals(ConstantsAndVariables.PatternForEachTokenWay.DB)) patsForEachToken.createIndexIfUsingDBAndNotExists(); // String systemdir = System.getProperty("java.io.tmpdir"); // File tempFile= File.createTempFile("patterns", ".tmp", new File(systemdir)); // tempFile.deleteOnExit(); // tempFile.delete(); // constVars.allPatternsDir = tempFile.getAbsolutePath(); if(constVars.allPatternsDir != null){ IOUtils.ensureDir(new File(constVars.allPatternsDir)); patsForEachToken.save(constVars.allPatternsDir); } //savePatternIndex(constVars.allPatternsDir); } patsForEachToken.close(); //This is important. It makes sure that we don't recompute patterns in every iteration! constVars.computeAllPatterns = false; if (patternsandWords == null) patternsandWords = new HashMap<>(); if (currentPatternWeights == null) currentPatternWeights = new HashMap<>(); Counter<E> currentPatternWeights4Label = new ClassicCounter<>(); Set<E> removePats = enforceMinSupportRequirements(patternsandWords4Label, unLabeledPatternsandWords4Label); Counters.removeKeys(patternsandWords4Label, removePats); Counters.removeKeys(unLabeledPatternsandWords4Label, removePats); Counters.removeKeys(negPatternsandWords4Label, removePats); ScorePatterns scorePatterns; Class<?> patternscoringclass = getPatternScoringClass(constVars.patternScoring); if (patternscoringclass != null && patternscoringclass.equals(ScorePatternsF1.class)) { scorePatterns = new ScorePatternsF1(constVars, constVars.patternScoring, label, allCandidatePhrases, patternsandWords4Label, negPatternsandWords4Label, unLabeledPatternsandWords4Label, props, p0Set, p0); Counter<E> finalPat = scorePatterns.score(); Counters.removeKeys(finalPat, alreadyIdentifiedPatterns); Counters.retainNonZeros(finalPat); Counters.retainTop(finalPat, constVars.numPatterns); if (Double.isNaN(Counters.max(finalPat))) throw new RuntimeException("how is the value NaN"); Redwood.log(ConstantsAndVariables.minimaldebug, "Selected Patterns: " + finalPat); return finalPat; } else if (patternscoringclass != null && patternscoringclass.equals(ScorePatternsRatioModifiedFreq.class)) { scorePatterns = new ScorePatternsRatioModifiedFreq(constVars, constVars.patternScoring, label, allCandidatePhrases, patternsandWords4Label, negPatternsandWords4Label, unLabeledPatternsandWords4Label, phInPatScoresCache, scorePhrases, props); } else if (patternscoringclass != null && patternscoringclass.equals(ScorePatternsFreqBased.class)) { scorePatterns = new ScorePatternsFreqBased(constVars, constVars.patternScoring, label, allCandidatePhrases, patternsandWords4Label, negPatternsandWords4Label, unLabeledPatternsandWords4Label, props); } else if (constVars.patternScoring.equals(PatternScoring.kNN)) 
{ try { Class<? extends ScorePatterns> clazz = (Class<? extends ScorePatterns>) Class.forName("edu.stanford.nlp.patterns.ScorePatternsKNN"); Constructor<? extends ScorePatterns> ctor = clazz.getConstructor(ConstantsAndVariables.class, PatternScoring.class, String.class, Set.class, TwoDimensionalCounter.class, TwoDimensionalCounter.class, TwoDimensionalCounter.class, ScorePhrases.class, Properties.class); scorePatterns = ctor.newInstance(constVars, constVars.patternScoring, label, allCandidatePhrases, patternsandWords4Label, negPatternsandWords4Label, unLabeledPatternsandWords4Label, scorePhrases, props); } catch (ClassNotFoundException e) { throw new RuntimeException("kNN pattern scoring is not released yet. Stay tuned."); } catch (NoSuchMethodException | InvocationTargetException | InstantiationException | IllegalAccessException e) { throw new RuntimeException("newinstance of kNN not created", e); } } else { throw new RuntimeException(constVars.patternScoring + " is not implemented (check spelling?). "); } scorePatterns.setUp(props); currentPatternWeights4Label = scorePatterns.score(); Redwood.log(ConstantsAndVariables.extremedebug, "patterns counter size is " + currentPatternWeights4Label.size()); if (ignorePatterns != null && !ignorePatterns.isEmpty()) { Counters.removeKeys(currentPatternWeights4Label, ignorePatterns); Redwood.log(ConstantsAndVariables.extremedebug, "Removing patterns from ignorePatterns of size " + ignorePatterns.size() + ". New patterns size " + currentPatternWeights4Label.size()); } if (alreadyIdentifiedPatterns != null && !alreadyIdentifiedPatterns.isEmpty()) { Redwood.log(ConstantsAndVariables.extremedebug, "Patterns size is " + currentPatternWeights4Label.size()); Counters.removeKeys(currentPatternWeights4Label, alreadyIdentifiedPatterns); Redwood.log(ConstantsAndVariables.extremedebug, "Removing already identified patterns of size " + alreadyIdentifiedPatterns.size() + ". 
New patterns size " + currentPatternWeights4Label.size()); } PriorityQueue<E> q = Counters.toPriorityQueue(currentPatternWeights4Label); int num = 0; Counter<E> chosenPat = new ClassicCounter<>(); Set<E> removePatterns = new HashSet<>(); Set<E> removeIdentifiedPatterns = null; while (num < constVars.numPatterns && !q.isEmpty()) { E pat = q.removeFirst(); //E pat = constVars.getPatternIndex().get(patindex); if (currentPatternWeights4Label.getCount(pat) < constVars.thresholdSelectPattern) { Redwood.log(Redwood.DBG, "The max weight of candidate patterns is " + df.format(currentPatternWeights4Label.getCount(pat)) + " so not adding anymore patterns"); break; } boolean notchoose = false; if (!unLabeledPatternsandWords4Label.containsFirstKey(pat) || unLabeledPatternsandWords4Label.getCounter(pat).isEmpty()) { Redwood.log(ConstantsAndVariables.extremedebug, "Removing pattern " + pat + " because it has no unlab support; pos words: " + patternsandWords4Label.getCounter(pat)); notchoose = true; continue; } Set<E> removeChosenPats = null; if (!notchoose) { if (alreadyIdentifiedPatterns != null) { for (E p : alreadyIdentifiedPatterns) { if (Pattern.subsumes(constVars.patternType, pat, p)) { // if (pat.getNextContextStr().contains(p.getNextContextStr()) && // pat.getPrevContextStr().contains(p.getPrevContextStr())) { Redwood.log(ConstantsAndVariables.extremedebug, "Not choosing pattern " + pat + " because it is contained in or contains the already chosen pattern " + p); notchoose = true; break; } int rest = pat.equalContext(p); // the contexts dont match if (rest == Integer.MAX_VALUE) continue; // if pat is less restrictive, remove p and add pat! if (rest < 0) { if(removeIdentifiedPatterns == null) removeIdentifiedPatterns = new HashSet<>(); removeIdentifiedPatterns.add(p); } else { notchoose = true; break; } } } } // In this iteration: if (!notchoose) { for (Pattern p : chosenPat.keySet()) { //E p = constVars.getPatternIndex().get(pindex); boolean removeChosenPatFlag = false; if (Pattern.sameGenre(constVars.patternType, pat, p)) { if(Pattern.subsumes(constVars.patternType, pat, p)){ Redwood.log(ConstantsAndVariables.extremedebug, "Not choosing pattern " + pat + " because it is contained in or contains the already chosen pattern " + p); notchoose = true; break; } else if (E.subsumes(constVars.patternType, p, pat)) { //subsume is true even if equal context //check if equal context int rest = pat.equalContext(p); // the contexts do not match if (rest == Integer.MAX_VALUE) { Redwood.log(ConstantsAndVariables.extremedebug, "Not choosing pattern " + p + " because it is contained in or contains another chosen pattern in this iteration " + pat); removeChosenPatFlag = true; } // if pat is less restrictive, remove p from chosen patterns and // add pat! 
else if (rest < 0) { removeChosenPatFlag = true; } else { notchoose = true; break; } } if (removeChosenPatFlag) { if(removeChosenPats == null) removeChosenPats = new HashSet<>(); removeChosenPats.add(pat); num--; } } } } if (notchoose) { Redwood.log(Redwood.DBG, "Not choosing " + pat + " for whatever reason!"); continue; } if (removeChosenPats != null) { Redwood.log(ConstantsAndVariables.extremedebug, "Removing already chosen patterns in this iteration " + removeChosenPats + " in favor of " + pat); Counters.removeKeys(chosenPat, removeChosenPats); } if (removeIdentifiedPatterns != null) { Redwood.log(ConstantsAndVariables.extremedebug, "Removing already identified patterns " + removeIdentifiedPatterns + " in favor of " + pat); removePatterns.addAll(removeIdentifiedPatterns); } chosenPat.setCount(pat, currentPatternWeights4Label.getCount(pat)); num++; } this.removeLearnedPatterns(label, removePatterns); Redwood.log(Redwood.DBG, "final size of the patterns is " + chosenPat.size()); Redwood.log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Patterns for " + label + "##\n"); List<Pair<E, Double>> chosenPatSorted = Counters.toSortedListWithCounts(chosenPat); for (Pair<E, Double> en : chosenPatSorted) Redwood.log(ConstantsAndVariables.minimaldebug, en.first() + ":" + df.format(en.second) + "\n"); if (constVars.outDir != null && !constVars.outDir.isEmpty()) { CollectionValuedMap<E, CandidatePhrase> posWords = new CollectionValuedMap<>(); for (Entry<E, ClassicCounter<CandidatePhrase>> en : patternsandWords4Label.entrySet()) { posWords.addAll(en.getKey(), en.getValue().keySet()); } CollectionValuedMap<E, CandidatePhrase> negWords = new CollectionValuedMap<>(); for (Entry<E, ClassicCounter<CandidatePhrase>> en : negPatternsandWords4Label.entrySet()) { negWords.addAll(en.getKey(), en.getValue().keySet()); } CollectionValuedMap<E, CandidatePhrase> unlabWords = new CollectionValuedMap<>(); for (Entry<E, ClassicCounter<CandidatePhrase>> en : unLabeledPatternsandWords4Label.entrySet()) { unlabWords.addAll(en.getKey(), en.getValue().keySet()); } if (constVars.outDir != null) { String outputdir = constVars.outDir + "/" + constVars.identifier + "/" + label; Redwood.log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir); IOUtils.ensureDir(new File(outputdir)); String filename = outputdir + "/patterns" + ".json"; JsonArrayBuilder obj = Json.createArrayBuilder(); if (writtenPatInJustification.containsKey(label) && writtenPatInJustification.get(label)) { JsonReader jsonReader = Json.createReader(new BufferedInputStream(new FileInputStream(filename))); JsonArray objarr = jsonReader.readArray(); jsonReader.close(); for (JsonValue o : objarr) obj.add(o); } else obj = Json.createArrayBuilder(); JsonObjectBuilder objThisIter = Json.createObjectBuilder(); for (Pair<E, Double> pat : chosenPatSorted) { JsonObjectBuilder o = Json.createObjectBuilder(); JsonArrayBuilder pos = Json.createArrayBuilder(); JsonArrayBuilder neg = Json.createArrayBuilder(); JsonArrayBuilder unlab = Json.createArrayBuilder(); for (CandidatePhrase w : posWords.get(pat.first())) pos.add(w.getPhrase()); for (CandidatePhrase w : negWords.get(pat.first())) neg.add(w.getPhrase()); for (CandidatePhrase w : unlabWords.get(pat.first())) unlab.add(w.getPhrase()); o.add("Positive", pos); o.add("Negative", neg); o.add("Unlabeled", unlab); o.add("Score", pat.second()); objThisIter.add(pat.first().toStringSimple(), o); } obj.add(objThisIter.build()); IOUtils.ensureDir(new File(filename).getParentFile()); 
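          // Sketch of the JSON persisted below (inferred from the builders above): a top-level
          // array with one object per learning iteration, each mapping a pattern's string form to
          // {"Positive": [...], "Negative": [...], "Unlabeled": [...], "Score": <double>}.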
IOUtils.writeStringToFile(StringUtils.normalize(StringUtils.toAscii(obj.build().toString())), filename, "ASCII"); writtenPatInJustification.put(label, true); } } if (constVars.justify) { Redwood.log(Redwood.DBG, "Justification for Patterns:"); for (E key : chosenPat.keySet()) { Redwood.log(Redwood.DBG, "\nPattern: " + key); Redwood.log( Redwood.DBG, "Positive Words:" + Counters.toSortedString(patternsandWords4Label.getCounter(key), patternsandWords4Label.getCounter(key).size(), "%1$s:%2$f", ";")); Redwood.log( Redwood.DBG, "Negative Words:" + Counters.toSortedString(negPatternsandWords4Label.getCounter(key), negPatternsandWords4Label.getCounter(key).size(), "%1$s:%2$f", ";")); Redwood.log( Redwood.DBG, "Unlabeled Words:" + Counters.toSortedString(unLabeledPatternsandWords4Label.getCounter(key), unLabeledPatternsandWords4Label.getCounter(key).size(), "%1$s:%2$f", ";")); } } //allPatternsandWords.put(label, allPatternsandWords4Label); patternsandWords.put(label, patternsandWords4Label); currentPatternWeights.put(label, currentPatternWeights4Label); return chosenPat; } // private void savePatternIndex(String dir ) throws IOException { // if(dir != null) { // IOUtils.ensureDir(new File(dir)); // constVars.getPatternIndex().save(dir); // } // //patsForEachToken.savePatternIndex(constVars.getPatternIndex(), dir); // // } public static Class getPatternScoringClass(PatternScoring patternScoring) { if (patternScoring.equals(PatternScoring.F1SeedPattern)) { return ScorePatternsF1.class; } else if (patternScoring.equals(PatternScoring.PosNegUnlabOdds) || patternScoring.equals(PatternScoring.PosNegOdds) || patternScoring.equals(PatternScoring.RatioAll) || patternScoring.equals(PatternScoring.PhEvalInPat) || patternScoring.equals(PatternScoring.PhEvalInPatLogP) || patternScoring.equals(PatternScoring.LOGREG) || patternScoring.equals(PatternScoring.LOGREGlogP) || patternScoring.equals(PatternScoring.SqrtAllRatio)) { return ScorePatternsRatioModifiedFreq.class; } else if (patternScoring.equals(PatternScoring.RlogF) || patternScoring.equals(PatternScoring.RlogFPosNeg) || patternScoring.equals(PatternScoring.RlogFUnlabNeg) || patternScoring.equals(PatternScoring.RlogFNeg) || patternScoring.equals(PatternScoring.YanGarber02) || patternScoring.equals(PatternScoring.LinICML03)) { return ScorePatternsFreqBased.class; } else { return null; } } private static AtomicInteger numCallsToCalStats = new AtomicInteger(); private static <E> List<List<E>> splitIntoNumThreadsWithSampling(List<E> c, int n, int numThreads) { if (n < 0) throw new IllegalArgumentException("n < 0: " + n); if (n > c.size()) throw new IllegalArgumentException("n > size of collection: " + n + ", " + c.size()); List<List<E>> resultAll = new ArrayList<>(numThreads); int num; if (numThreads == 1) num = n; else num = n / (numThreads - 1); System.out.println("shuffled " + c.size() + " sentences and selecting " + num + " sentences per thread"); List<E> result = new ArrayList<>(num); int totalitems = 0; int nitem = 0; Random r = new Random(numCallsToCalStats.incrementAndGet()); boolean[] added = new boolean[c.size()]; // Arrays.fill(added, false); // not needed; get false by default while(totalitems < n){ //find the new sample index int index; do{ index = r.nextInt(c.size()); }while(added[index]); added[index] = true; E c1 = c.get(index); if(nitem == num){ resultAll.add(result); result = new ArrayList<>(num); nitem= 0; } result.add(c1); totalitems++; nitem ++; } if(!result.isEmpty()) resultAll.add(result); return resultAll; } //for each pattern, it 
calculates positive, negative, and unlabeled words private void calculateSufficientStats(Map<String, DataInstance> sents, PatternsForEachToken patternsForEachToken, String label, TwoDimensionalCounter<E, CandidatePhrase> patternsandWords4Label, TwoDimensionalCounter<E, CandidatePhrase> negPatternsandWords4Label, TwoDimensionalCounter<E, CandidatePhrase> unLabeledPatternsandWords4Label, Set<String> allCandidatePhrases) { Redwood.log(Redwood.DBG,"calculating sufficient stats"); patternsForEachToken.setupSearch(); // calculating the sufficient statistics Class answerClass4Label = constVars.getAnswerClass().get(label); int sampleSize = constVars.sampleSentencesForSufficientStats == 1.0 ? sents.size(): (int) Math.round(constVars.sampleSentencesForSufficientStats*sents.size()); List<List<String>> sampledSentIds = splitIntoNumThreadsWithSampling(CollectionUtils.toList(sents.keySet()), sampleSize, constVars.numThreads); Redwood.log(Redwood.DBG,"sampled " + sampleSize + " sentences (" + constVars.sampleSentencesForSufficientStats*100 + "%)"); ExecutorService executor = Executors.newFixedThreadPool(constVars.numThreads); List<Future<Triple<List<Pair<E, CandidatePhrase>>, List<Pair<E, CandidatePhrase>>, List<Pair<E, CandidatePhrase>>>>> list = new ArrayList<>(); for (List<String> sampledSents : sampledSentIds) { Callable<Triple<List<Pair<E, CandidatePhrase>>, List<Pair<E, CandidatePhrase>>, List<Pair<E, CandidatePhrase>>>> task = new CalculateSufficientStatsThreads(patternsForEachToken, sampledSents, sents, label, answerClass4Label); Future<Triple<List<Pair<E, CandidatePhrase>>, List<Pair<E, CandidatePhrase>>, List<Pair<E, CandidatePhrase>>>> submit = executor.submit(task); list.add(submit); } // Now retrieve the result for (Future<Triple<List<Pair<E, CandidatePhrase>>, List<Pair<E, CandidatePhrase>>, List<Pair<E, CandidatePhrase>>>> future : list) { try { Triple<List<Pair<E, CandidatePhrase>>, List<Pair<E, CandidatePhrase>>, List<Pair<E, CandidatePhrase>>> stats = future.get(); addStats(patternsandWords4Label, stats.first()); addStats(negPatternsandWords4Label, stats.second()); addStats(unLabeledPatternsandWords4Label, stats.third()); } catch (Exception e) { executor.shutdownNow(); throw new RuntimeException(e); } } executor.shutdown(); } private void addStats(TwoDimensionalCounter<E, CandidatePhrase> pw, List<Pair<E, CandidatePhrase>> v) { for(Pair<E, CandidatePhrase> w: v){ pw.incrementCount(w.first(), w.second()); } } private class CalculateSufficientStatsThreads implements Callable{ private final Map<String, DataInstance> sents; private final PatternsForEachToken patternsForEachToken; private final Collection<String> sentIds; private final String label; private final Class answerClass4Label; public CalculateSufficientStatsThreads(PatternsForEachToken patternsForEachToken, Collection<String> sentIds, Map<String, DataInstance> sents,String label, Class answerClass4Label){ this.patternsForEachToken = patternsForEachToken; this.sentIds = sentIds; this.sents = sents; this.label = label; this.answerClass4Label = answerClass4Label; } @Override public Triple<List<Pair<E, CandidatePhrase>>, List<Pair<E, CandidatePhrase>>, List<Pair<E, CandidatePhrase>>> call() throws Exception { List<Pair<E, CandidatePhrase>> posWords = new ArrayList<>(); List<Pair<E, CandidatePhrase>> negWords = new ArrayList<>(); List<Pair<E, CandidatePhrase>> unlabWords = new ArrayList<>(); for(String sentId: sentIds){ Map<Integer, Set<E>> pat4Sent = patternsForEachToken.getPatternsForAllTokens(sentId); if (pat4Sent == null) { throw 
new RuntimeException("How come there are no patterns for " + sentId); } DataInstance sent = sents.get(sentId); List<CoreLabel> tokens = sent.getTokens(); for (int i = 0; i < tokens.size(); i++) { CoreLabel token = tokens.get(i); //Map<String, Set<String>> matchedPhrases = token.get(PatternsAnnotations.MatchedPhrases.class); CandidatePhrase tokenWordOrLemma = CandidatePhrase.createOrGet(token.word()); CandidatePhrase longestMatchingPhrase; if (constVars.useMatchingPhrase) { Map<String, CandidatePhrase> longestMatchingPhrases = token.get(PatternsAnnotations.LongestMatchedPhraseForEachLabel.class); longestMatchingPhrase = longestMatchingPhrases.get(label); longestMatchingPhrase = (longestMatchingPhrase !=null && (longestMatchingPhrase.getPhrase().length() > tokenWordOrLemma.getPhrase().length()))? longestMatchingPhrase : tokenWordOrLemma; /*if (matchedPhrases != null && !matchedPhrases.isEmpty()) { for (String s : matchedPhrases) { if (s.equals(tokenWordOrLemma)) { longestMatchingPhrase = tokenWordOrLemma; break; } if (longestMatchingPhrase == null || longestMatchingPhrase.length() > s.length()) { longestMatchingPhrase = s; } } } else { longestMatchingPhrase = tokenWordOrLemma; }*/ } else longestMatchingPhrase = tokenWordOrLemma; Set<E> pats = pat4Sent.get(i); //make a copy of pats because we are changing numwordscompound etc. Set newpats = new HashSet<E>(); boolean changedpats = false; for (E s : pats) { if(s instanceof SurfacePattern){ changedpats = true; SurfacePattern snew = ((SurfacePattern) s).copyNewToken(); snew.setNumWordsCompound(PatternFactory.numWordsCompoundMapped.get(label)); newpats.add(snew); } } if(changedpats) pats = newpats; //This happens when dealing with the collapseddependencies if (pats == null) { if(!constVars.patternType.equals(PatternFactory.PatternType.DEP)) throw new RuntimeException("Why are patterns null for sentence " + sentId + " and token " + i + "(" + tokens.get(i) + "). pat4Sent has token ids " + pat4Sent.keySet() + (constVars.batchProcessSents ? "" : ". The sentence is " + Data.sents.get(sentId)) + ". 
If you have changed parameters, recompute all patterns."); continue; } // Set<E> prevPat = pat.first(); // Set<E> nextPat = pat.second(); // Set<E> prevnextPat = pat.third(); if (PatternFactory.ignoreWordRegex.matcher(token.word()).matches()) continue; // if the target word/phrase does not satisfy the POS requirement String tag = token.tag(); if (constVars.allowedTagsInitials != null && constVars.allowedTagsInitials.containsKey(label)) { boolean use = false; for (String allowed : constVars.allowedTagsInitials.get(label)) { if (tag.startsWith(allowed)) { use = true; break; } } if (!use) continue; } // if the target word/phrase does not satisfy the NER requirements String nertag = token.ner(); if (constVars.allowedNERsforLabels != null && constVars.allowedNERsforLabels.containsKey(label)) { if (!constVars.allowedNERsforLabels.get(label).contains(nertag)) { continue; } } if (token.get(answerClass4Label).equals(label)) { // Positive for (E s : pats) { posWords.add(new Pair<>(s, longestMatchingPhrase)); } } else { // Negative or unlabeled boolean negToken = false; Map<Class, Object> ignore = constVars.getIgnoreWordswithClassesDuringSelection().get(label); for (Class igCl : ignore.keySet()) if ((Boolean) token.get(igCl)) { negToken = true; break; } if (!negToken) if (constVars.getOtherSemanticClassesWords().contains(token.word()) || constVars.getOtherSemanticClassesWords().contains(token.lemma())) negToken = true; if(!negToken){ for(String labelA : constVars.getLabels()){ if(!labelA.equals(label)){ if(constVars.getSeedLabelDictionary().get(labelA).contains(longestMatchingPhrase) || constVars.getSeedLabelDictionary().get(labelA).contains(tokenWordOrLemma) || constVars.getLearnedWords(labelA).containsKey(longestMatchingPhrase) || constVars.getLearnedWords(labelA).containsKey(tokenWordOrLemma)){ negToken = true; break; } } } } for (E sindex : pats) { if (negToken) { negWords.add(new Pair<>(sindex, longestMatchingPhrase)); } else { unlabWords.add(new Pair<>(sindex, longestMatchingPhrase)); } } } } } return new Triple<List<Pair<E, CandidatePhrase>>, List<Pair<E, CandidatePhrase>>, List<Pair<E, CandidatePhrase>>>(posWords, negWords, unlabWords); } } private Set<E> enforceMinSupportRequirements(TwoDimensionalCounter<E, CandidatePhrase> patternsandWords4Label, TwoDimensionalCounter<E, CandidatePhrase> unLabeledPatternsandWords4Label) { Set<E> remove = new HashSet<>(); for (Entry<E, ClassicCounter<CandidatePhrase>> en : patternsandWords4Label.entrySet()) { if (en.getValue().size() < constVars.minPosPhraseSupportForPat) { remove.add(en.getKey()); } } int numRemoved = remove.size(); Redwood.log(Redwood.DBG, "Removing " + numRemoved + " patterns that do not meet minPosPhraseSupportForPat requirement of >= " + constVars.minPosPhraseSupportForPat); for (Entry<E, ClassicCounter<CandidatePhrase>> en : unLabeledPatternsandWords4Label.entrySet()) { if (en.getValue().size() < constVars.minUnlabPhraseSupportForPat) { remove.add(en.getKey()); } } Redwood.log(Redwood.DBG, "Removing " + (remove.size() - numRemoved) + " patterns that do not meet minUnlabPhraseSupportForPat requirement of >= " + constVars.minUnlabPhraseSupportForPat); return remove; } // void removeLearnedPattern(String label, E p) { // this.learnedPatterns.get(label).remove(p); // if (wordsPatExtracted.containsKey(label)) // for (Entry<String, ClassicCounter<E>> en : this.wordsPatExtracted.get(label).entrySet()) { // en.getValue().remove(p); // } // } private void removeLearnedPatterns(String label, Collection<E> pats) { 
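// Forget the given patterns for this label: drop them from the cumulative learned-pattern
// counter, from every per-iteration counter, and from the per-phrase extraction index, so
// they no longer influence later scoring.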
Counters.removeKeys(this.learnedPatterns.get(label), pats);
for(Map.Entry<Integer, Counter<E>> en : this.learnedPatternsEachIter.get(label).entrySet())
Counters.removeKeys(en.getValue(), pats);
if (wordsPatExtracted.containsKey(label))
for (Entry<CandidatePhrase, ClassicCounter<E>> en : this.wordsPatExtracted.get(label).entrySet()) {
Counters.removeKeys(en.getValue(), pats);
}
}
public static <E> Counter<E> normalizeSoftMaxMinMaxScores(Counter<E> scores, boolean minMaxNorm, boolean softmax, boolean oneMinusSoftMax) {
// note: the initial maximum is -Double.MAX_VALUE, not Double.MIN_VALUE (which is the smallest positive double)
double minScore = Double.MAX_VALUE, maxScore = -Double.MAX_VALUE;
Counter<E> newscores = new ClassicCounter<>();
if (softmax) {
for (Entry<E, Double> en : scores.entrySet()) {
double score;
if (oneMinusSoftMax)
score = (1 / (1 + Math.exp(Math.min(7, en.getValue()))));
else
score = (1 / (1 + Math.exp(-1 * Math.min(7, en.getValue()))));
if (score < minScore)
minScore = score;
if (score > maxScore)
maxScore = score;
newscores.setCount(en.getKey(), score);
}
} else {
newscores.addAll(scores);
minScore = Counters.min(newscores);
maxScore = Counters.max(newscores);
}
if (minMaxNorm) {
for (Entry<E, Double> en : newscores.entrySet()) {
double score;
if (minScore == maxScore)
score = minScore;
else
score = (en.getValue() - minScore + 1e-10) / (maxScore - minScore);
newscores.setCount(en.getKey(), score);
}
}
return newscores;
}
public TwoDimensionalCounter<String, ScorePhraseMeasures> phInPatScoresCache = new TwoDimensionalCounter<>();
public void labelWords(String label, Map<String, DataInstance> sents, Collection<CandidatePhrase> identifiedWords) throws IOException {
CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
labelWords(label, sents, identifiedWords, null, matchedTokensByPat);
}
public void labelWords(String label, Map<String, DataInstance> sents, Collection<CandidatePhrase> identifiedWords, String outFile, CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat) throws IOException {
Date startTime = new Date();
Redwood.log(Redwood.DBG, "Labeling " + sents.size() + " sentences with " + identifiedWords.size() + " phrases for label " + label);
int numTokensLabeled = 0;
CollectionValuedMap<String, Integer> tokensMatchedPatterns = null;
if (constVars.restrictToMatched) {
tokensMatchedPatterns = new CollectionValuedMap<>();
for (Entry<E, Collection<Triple<String, Integer, Integer>>> en : matchedTokensByPat.entrySet()) {
for (Triple<String, Integer, Integer> en2 : en.getValue()) {
for (int i = en2.second(); i <= en2.third(); i++) {
tokensMatchedPatterns.add(en2.first(), i);
}
}
}
}
Map<String, Map<Integer, Set<E>>> tempPatsForSents = new HashMap<>();
for (Entry<String, DataInstance> sentEn : sents.entrySet()) {
List<CoreLabel> tokens = sentEn.getValue().getTokens();
boolean sentenceChanged = false;
Map<CandidatePhrase, String[]> identifiedWordsTokens = new HashMap<>();
for (CandidatePhrase s : identifiedWords) {
String[] toks = s.getPhrase().split("\\s+");
identifiedWordsTokens.put(s, toks);
}
String[] sent = new String[tokens.size()];
int i = 0;
Set<Integer> contextWordsRecalculatePats = new HashSet<>();
for (CoreLabel l : tokens) {
sent[i] = l.word();
i++;
}
for (Entry<CandidatePhrase, String[]> phEn : identifiedWordsTokens.entrySet()) {
String[] ph = phEn.getValue();
List<Integer> ints = ArrayUtils.getSubListIndex(ph, sent, o -> constVars.matchLowerCaseContext ?
((String) o.first()).equalsIgnoreCase((String) o.second()) : o.first().equals(o.second()));
if (ints == null)
continue;
for (Integer idx : ints) {
boolean donotuse = false;
if (constVars.restrictToMatched) {
for (int j = 0; j < ph.length; j++) {
if (!tokensMatchedPatterns.get(sentEn.getKey()).contains(idx + j)) {
Redwood.log(ConstantsAndVariables.extremedebug, "not labeling " + tokens.get(idx + j).word());
donotuse = true;
break;
}
}
}
if (!donotuse) {
String phStr = StringUtils.join(ph, " ");
if(constVars.writeMatchedTokensIdsForEachPhrase)
addToMatchedTokensByPhrase(phStr, sentEn.getKey(), idx, ph.length);
Redwood.log(ConstantsAndVariables.extremedebug, "Labeling because of phrase " + phStr);
for (int j = 0; j < ph.length; j++) {
int index = idx + j;
CoreLabel l = tokens.get(index);
if (constVars.usePatternResultAsLabel) {
sentenceChanged = true;
l.set(constVars.getAnswerClass().get(label), label);
numTokensLabeled++;
//set the matched and the longest phrases
CollectionValuedMap<String, CandidatePhrase> matched = new CollectionValuedMap<>();
matched.add(label, phEn.getKey());
if(!l.containsKey(PatternsAnnotations.MatchedPhrases.class))
l.set(PatternsAnnotations.MatchedPhrases.class, matched);
else
l.get(PatternsAnnotations.MatchedPhrases.class).addAll(matched);
CandidatePhrase longest = l.get(PatternsAnnotations.LongestMatchedPhraseForEachLabel.class).get(label);
longest = longest != null && longest.getPhrase().length() > phEn.getKey().getPhrase().length() ? longest : phEn.getKey();
l.get(PatternsAnnotations.LongestMatchedPhraseForEachLabel.class).put(label, longest);
for (int k = Math.max(0, index - PatternFactory.numWordsCompoundMapped.get(label)); k < tokens.size() && k <= index + PatternFactory.numWordsCompoundMapped.get(label) + 1; k++) {
contextWordsRecalculatePats.add(k);
}
}
}
}
}
}
if (patsForEachToken != null) { //&& patsForEachToken.containsSentId(sentEn.getKey())
for (int index : contextWordsRecalculatePats){
if(!tempPatsForSents.containsKey(sentEn.getKey()))
tempPatsForSents.put(sentEn.getKey(), new HashMap<>());
tempPatsForSents.get(sentEn.getKey()).put(index, Pattern.getContext(constVars.patternType, sentEn.getValue(), index, ConstantsAndVariables.getStopWords()));
//patsForEachToken.addPatterns(sentEn.getKey(), index, createPats.getContext(sentEn.getValue(), index));
}
}
if(sentenceChanged){
constVars.invertedIndex.update(sentEn.getValue().getTokens(), sentEn.getKey());
}
}
if(patsForEachToken != null) {
patsForEachToken.updatePatterns(tempPatsForSents); //sentEn.getKey(), index, createPats.getContext(sentEn.getValue(), index));
}
constVars.invertedIndex.finishUpdating();
if (outFile != null) {
Redwood.log(ConstantsAndVariables.minimaldebug, "Writing results to " + outFile);
IOUtils.writeObjectToFile(sents, outFile);
}
Date endTime = new Date();
Redwood.log(Redwood.DBG, "Done labeling provided sents in " + elapsedTime(startTime, endTime) + ". Total # of tokens labeled: " + numTokensLabeled);
}
public void iterateExtractApply() throws IOException, ClassNotFoundException {
iterateExtractApply(null, null, null);
}
/**
*
* @param p0 Null in most cases.
only used for BPB * @param p0Set Null in most cases * @param ignorePatterns * */ public void iterateExtractApply(Map<String, E> p0, Map<String, Counter<CandidatePhrase>> p0Set, Map<String, Set<E>> ignorePatterns) throws IOException, ClassNotFoundException { Map<String, CollectionValuedMap<E, Triple<String, Integer, Integer>>> matchedTokensByPatAllLabels = new HashMap<>(); //Map<String, Collection<Triple<String, Integer, Integer>>> matchedTokensForPhrases = new HashMap<String, Collection<Triple<String, Integer, Integer>>>(); Map<String, TwoDimensionalCounter<CandidatePhrase, E>> termsAllLabels = new HashMap<>(); Map<String, Set<CandidatePhrase>> ignoreWordsAll = new HashMap<>(); for (String label : constVars.getSeedLabelDictionary().keySet()) { matchedTokensByPatAllLabels.put(label, new CollectionValuedMap<>()); termsAllLabels.put(label, new TwoDimensionalCounter<>()); if (constVars.useOtherLabelsWordsasNegative) { Set<CandidatePhrase> w = new HashSet<>(); for (Entry<String, Set<CandidatePhrase>> en : constVars.getSeedLabelDictionary().entrySet()) { if (en.getKey().equals(label)) continue; w.addAll(en.getValue()); } ignoreWordsAll.put(label, w); } } Redwood.log(ConstantsAndVariables.minimaldebug, "Iterating " + constVars.numIterationsForPatterns + " times."); Map<String, BufferedWriter> wordsOutput = new HashMap<>(); Map<String, BufferedWriter> patternsOutput = new HashMap<>(); for (String label : constVars.getLabels()) { if(constVars.outDir != null){ IOUtils.ensureDir(new File(constVars.outDir + "/" + constVars.identifier + "/" + label)); String wordsOutputFileLabel = constVars.outDir + "/" + constVars.identifier + "/" + label + "/learnedwords.txt"; wordsOutput.put(label, new BufferedWriter(new FileWriter(wordsOutputFileLabel))); Redwood.log(ConstantsAndVariables.minimaldebug, "Saving the learned words for label " + label + " in " + wordsOutputFileLabel); } if(constVars.outDir != null){ String patternsOutputFileLabel = constVars.outDir + "/" + constVars.identifier + "/" + label + "/learnedpatterns.txt"; patternsOutput.put(label, new BufferedWriter(new FileWriter(patternsOutputFileLabel))); Redwood.log(ConstantsAndVariables.minimaldebug, "Saving the learned patterns for label " + label + " in " + patternsOutputFileLabel); } } for (int i = 0; i < constVars.numIterationsForPatterns; i++) { Redwood .log(ConstantsAndVariables.minimaldebug, "\n\n################################ Iteration " + (i + 1) + " ##############################"); boolean keepRunning = false; Map<String, Counter<CandidatePhrase>> learnedWordsThisIter = new HashMap<>(); for (String label : constVars.getLabels()) { Redwood.log(ConstantsAndVariables.minimaldebug, "\n###Learning for label " + label + " ######"); String sentout = constVars.sentsOutFile == null ? null : constVars.sentsOutFile + "_" + label; Pair<Counter<E>, Counter<CandidatePhrase>> learnedPatWords4label = iterateExtractApply4Label(label, p0 != null ? p0.get(label) : null, p0Set != null ? p0Set.get(label) : null, wordsOutput.get(label), sentout, patternsOutput.get(label), ignorePatterns != null ? 
ignorePatterns.get(label) : null, ignoreWordsAll.get(label), matchedTokensByPatAllLabels.get(label), termsAllLabels.get(label), i + numIterationsLoadedModel);
learnedWordsThisIter.put(label, learnedPatWords4label.second());
if (learnedPatWords4label.first().size() > 0 && constVars.getLearnedWords(label).size() < constVars.maxExtractNumWords) {
keepRunning = true;
}
}
if (constVars.useOtherLabelsWordsasNegative) {
for (String label : constVars.getLabels()) {
for (Entry<String, Counter<CandidatePhrase>> en : learnedWordsThisIter.entrySet()) {
if (en.getKey().equals(label))
continue;
ignoreWordsAll.get(label).addAll(en.getValue().keySet());
}
}
}
if (!keepRunning) {
if (!constVars.tuneThresholdKeepRunning) {
Redwood.log(ConstantsAndVariables.minimaldebug, "No new patterns learned for any label. Ending iterations.");
break;
} else {
constVars.thresholdSelectPattern = 0.8 * constVars.thresholdSelectPattern;
Redwood.log(ConstantsAndVariables.minimaldebug, "\n\nTuning thresholds to keep running. New pattern threshold is " + constVars.thresholdSelectPattern);
}
}
}
if (constVars.outDir != null && !constVars.outDir.isEmpty()) {
Redwood.log(ConstantsAndVariables.minimaldebug, "Writing justification files");
for (String label : constVars.getLabels()) {
IOUtils.ensureDir(new File(constVars.outDir + "/" + constVars.identifier + "/" + label));
if (constVars.writeMatchedTokensFiles) {
ConstantsAndVariables.DataSentsIterator iter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
int i = 0;
String suffix = "";
while(iter.hasNext()){
i++;
if(constVars.batchProcessSents)
suffix = "_" + i;
writeMatchedTokensAndSents(label, iter.next().first(), suffix, matchedTokensByPatAllLabels.get(label));
}
}
}
if(constVars.writeMatchedTokensIdsForEachPhrase && constVars.outDir != null){
String matchedtokensfilename = constVars.outDir + "/" + constVars.identifier + "/tokenids4matchedphrases" + ".json";
IOUtils.writeStringToFile(matchedTokensByPhraseJsonString(), matchedtokensfilename, "utf8");
}
}
System.out.println("\n\nAll patterns learned:");
for(Map.Entry<String, Map<Integer, Counter<E>>> en2 : this.learnedPatternsEachIter.entrySet()) {
System.out.println(en2.getKey() + ":");
for (Map.Entry<Integer, Counter<E>> en : en2.getValue().entrySet()) {
System.out.println("Iteration " + en.getKey());
System.out.println(StringUtils.join(en.getValue().keySet(), "\n"));
}
}
System.out.println("\n\nAll words learned:");
for(String label : constVars.getLabels()) {
System.out.println("\nLabel " + label + "\n");
for (Entry<Integer, Counter<CandidatePhrase>> en : this.constVars.getLearnedWordsEachIter(label).entrySet()) {
System.out.println("Iteration " + en.getKey() + ":\t\t" + en.getValue().keySet());
}
}
// close all the writers
for (String label : constVars.getLabels()) {
if(wordsOutput.containsKey(label) && wordsOutput.get(label) != null)
wordsOutput.get(label).close();
if(patternsOutput.containsKey(label) && patternsOutput.get(label) != null)
patternsOutput.get(label).close();
}
}
private void writeMatchedTokensAndSents(String label, Map<String, DataInstance> sents, String suffix, CollectionValuedMap<E, Triple<String, Integer, Integer>> tokensMatchedPat) throws IOException {
if(constVars.outDir != null){
Set<String> allMatchedSents = new HashSet<>();
String matchedtokensfilename = constVars.outDir + "/" + constVars.identifier + "/" + label + "/tokensmatchedpatterns" + suffix + ".json";
JsonObjectBuilder pats = Json.createObjectBuilder();
for (Entry<E, Collection<Triple<String, Integer, Integer>>> en :
tokensMatchedPat.entrySet()) { CollectionValuedMap<String, Pair<Integer, Integer>> matchedStrs = new CollectionValuedMap<>(); for (Triple<String, Integer, Integer> en2 : en.getValue()) { allMatchedSents.add(en2.first()); matchedStrs.add(en2.first(), new Pair<>(en2.second(), en2.third())); } JsonObjectBuilder senttokens = Json.createObjectBuilder(); for (Entry<String, Collection<Pair<Integer, Integer>>> sen : matchedStrs.entrySet()) { JsonArrayBuilder obj = Json.createArrayBuilder(); for (Pair<Integer, Integer> sen2 : sen.getValue()) { JsonArrayBuilder startend = Json.createArrayBuilder(); startend.add(sen2.first()); startend.add(sen2.second()); obj.add(startend); } senttokens.add(sen.getKey(), obj); } pats.add(en.getKey().toStringSimple(), senttokens); } IOUtils.writeStringToFile(pats.build().toString(), matchedtokensfilename, "utf8"); // Writing the sentence json file -- tokens for each sentence JsonObjectBuilder senttokens = Json.createObjectBuilder(); for (String sentId : allMatchedSents) { JsonArrayBuilder sent = Json.createArrayBuilder(); for (CoreLabel l : sents.get(sentId).getTokens()) { sent.add(l.word()); } senttokens.add(sentId, sent); } String sentfilename = constVars.outDir + "/" + constVars.identifier + "/sentences" + suffix + ".json"; IOUtils.writeStringToFile(senttokens.build().toString(), sentfilename, "utf8"); } } public static String matchedTokensByPhraseJsonString(String phrase){ if(!Data.matchedTokensForEachPhrase.containsKey(phrase)) return ""; JsonArrayBuilder arrobj =Json.createArrayBuilder(); for (Entry<String, List<Integer>> sen : Data.matchedTokensForEachPhrase.get(phrase).entrySet()) { JsonObjectBuilder obj = Json.createObjectBuilder(); JsonArrayBuilder tokens = Json.createArrayBuilder(); for(Integer i : sen.getValue()){ tokens.add(i); } obj.add(sen.getKey(),tokens); arrobj.add(obj); } return arrobj.build().toString(); } public static String matchedTokensByPhraseJsonString(){ JsonObjectBuilder pats = Json.createObjectBuilder(); for (Entry<String, Map<String, List<Integer>>> en : Data.matchedTokensForEachPhrase.entrySet()) { JsonArrayBuilder arrobj =Json.createArrayBuilder(); for (Entry<String, List<Integer>> sen : en.getValue().entrySet()) { JsonObjectBuilder obj = Json.createObjectBuilder(); JsonArrayBuilder tokens = Json.createArrayBuilder(); for(Integer i : sen.getValue()){ tokens.add(i); } obj.add(sen.getKey(),tokens); arrobj.add(obj); } pats.add(en.getKey(), arrobj); } return pats.build().toString(); } //numIterTotal = numIter + iterations from previously loaded model! 
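/*
 * One iteration for a single label, roughly: (1) score the candidate patterns and keep the
 * top-scoring ones (getPatterns), (2) use the kept patterns to score and select new phrases
 * (scorePhrases.learnNewPhrases), and (3) label the data with the new phrases (labelWords)
 * so that the next iteration sees the updated annotations. A minimal sketch of driving the
 * whole loop, assuming hypothetical file paths (the supported flags are described in
 * ConstantsAndVariables and in the class javadoc):
 *
 *   Properties props = new Properties();
 *   props.setProperty("patternType", "SURFACE");
 *   props.setProperty("file", "mydir/text.txt");                      // hypothetical input file
 *   props.setProperty("seedWordsFiles", "disease,seeds_disease.txt"); // hypothetical seed list
 *   props.setProperty("outDir", "output");                            // hypothetical output dir
 *   // then run with these properties through the main() entry point, as in the class javadoc
 */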
private Pair<Counter<E>, Counter<CandidatePhrase>> iterateExtractApply4Label(String label, E p0, Counter<CandidatePhrase> p0Set, BufferedWriter wordsOutput, String sentsOutFile, BufferedWriter patternsOut, Set<E> ignorePatterns, Set<CandidatePhrase> ignoreWords, CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat, TwoDimensionalCounter<CandidatePhrase, E> terms, int numIterTotal) throws IOException, ClassNotFoundException { if (!learnedPatterns.containsKey(label)) { learnedPatterns.put(label, new ClassicCounter<>()); } if (!learnedPatternsEachIter.containsKey(label)) { learnedPatternsEachIter.put(label, new HashMap<>()); } if (!constVars.getLearnedWordsEachIter().containsKey(label)) { constVars.getLearnedWordsEachIter().put(label, new TreeMap<>()); } // if (!constVars.getLearnedWords().containsKey(label)) { // constVars.getLearnedWords().put(label, new ClassicCounter<CandidatePhrase>()); // } Counter<CandidatePhrase> identifiedWords = new ClassicCounter<>(); Counter<E> patterns = new ClassicCounter<>(); Counter<E> patternThisIter = getPatterns(label, learnedPatterns.get(label).keySet(), p0, p0Set, ignorePatterns); patterns.addAll(patternThisIter); learnedPatterns.get(label).addAll(patterns); assert !learnedPatternsEachIter.get(label).containsKey(numIterTotal) : "How come learned patterns already have a key for " + numIterTotal + " keys are " + learnedPatternsEachIter.get(label).keySet(); learnedPatternsEachIter.get(label).put(numIterTotal, patterns); if (sentsOutFile != null) sentsOutFile = sentsOutFile + "_" + numIterTotal + "iter.ser"; Counter<String> scoreForAllWordsThisIteration = new ClassicCounter<>(); identifiedWords.addAll(scorePhrases.learnNewPhrases(label, this.patsForEachToken, patterns, learnedPatterns.get(label), matchedTokensByPat, scoreForAllWordsThisIteration, terms, wordsPatExtracted.get(label), this.patternsandWords.get(label), constVars.identifier, ignoreWords)); if (identifiedWords.size() > 0) { if (constVars.usePatternResultAsLabel) { if (constVars.getLabels().contains(label)) { ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents); while(sentsIter.hasNext()){ Pair<Map<String, DataInstance>, File> sentsf = sentsIter.next(); Redwood.log(Redwood.DBG, "labeling sentences from " + sentsf.second()); labelWords(label, sentsf.first(), identifiedWords.keySet(), sentsOutFile, matchedTokensByPat); //write only for batch sentences //TODO: make this clean! 
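// In batch mode each shard of sentences lives in its own serialized file, so the freshly
// labeled shard is written back to the file it was read from before moving on to the next one.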
if(sentsf.second().exists() && constVars.batchProcessSents)
IOUtils.writeObjectToFile(sentsf.first(), sentsf.second());
}
} else
throw new RuntimeException("Label " + label + " is not in the set of labels; cannot apply the answer annotation");
assert !constVars.getLearnedWordsEachIter().get(label).containsKey(numIterTotal) : "How come learned words already have a key for " + numIterTotal;
constVars.getLearnedWordsEachIter().get(label).put(numIterTotal, identifiedWords);
}
if (wordsOutput != null) {
wordsOutput.write("\n" + Counters.toSortedString(identifiedWords, identifiedWords.size(), "%1$s", "\n"));
wordsOutput.flush();
}
}
//}
if (patternsOut != null)
this.writePatternsToFile(patterns, patternsOut);
return new Pair<>(patterns, identifiedWords);
}
private void writePatternsToFile(Counter<E> pattern, BufferedWriter outFile) throws IOException {
for (Entry<E, Double> en : pattern.entrySet())
outFile.write(en.getKey().toString() + "\t" + en.getValue() + "\n");
}
private void writeWordsToFile(Map<Integer, Counter<CandidatePhrase>> words, BufferedWriter outFile) throws IOException {
for (Entry<Integer, Counter<CandidatePhrase>> en2 : words.entrySet()) {
outFile.write("###Iteration " + en2.getKey() + "\n");
for (Entry<CandidatePhrase, Double> en : en2.getValue().entrySet())
outFile.write(en.getKey() + "\t" + en.getValue() + "\n");
}
}
private static TreeMap<Integer, Counter<CandidatePhrase>> readLearnedWordsFromFile(File file) {
TreeMap<Integer, Counter<CandidatePhrase>> learned = new TreeMap<>();
Counter<CandidatePhrase> words = null;
int numIter = -1;
for (String line : IOUtils.readLines(file)) {
if(line.startsWith("###")){
if(words != null)
learned.put(numIter, words);
numIter++;
words = new ClassicCounter<>();
continue;
}
String[] t = line.split("\t");
words.setCount(CandidatePhrase.createOrGet(t[0]), Double.parseDouble(t[1]));
}
if(words != null)
learned.put(numIter, words);
return learned;
}
public Counter<E> getLearnedPatterns(String label) {
return this.learnedPatterns.get(label);
}
// public Counter<E> getLearnedPatternsSurfaceForm(String label) {
// return this.learnedPatterns.get(label);
// }
public Map<String, Counter<E>> getLearnedPatterns() {
return this.learnedPatterns;
}
public Map<String, Map<Integer, Counter<E>>> getLearnedPatternsEachIter() {
return this.learnedPatternsEachIter;
}
public Map<Integer, Counter<E>> getLearnedPatternsEachIter(String label) {
return this.learnedPatternsEachIter.get(label);
}
public void setLearnedPatterns(Counter<E> patterns, String label) {
this.learnedPatterns.put(label, patterns);
}
/**
* COPIED from CRFClassifier: Count the successes and failures of the model on
* the given document. Fills numbers into counters for true positives, false
* positives, and false negatives, and also keeps track of the entities seen. <br>
* Returns false if we ever encounter null for gold or guess. NOTE: The
* current implementation of counting wordFN/FP is incorrect.
*/
public static boolean countResultsPerEntity(List<CoreLabel> doc, Counter<String> entityTP, Counter<String> entityFP, Counter<String> entityFN, String background, Counter<String> wordTP, Counter<String> wordTN, Counter<String> wordFP, Counter<String> wordFN, Class<? extends TypesafeMap.Key<String>> whichClassToCompare) {
int index = 0;
int goldIndex = 0, guessIndex = 0;
String lastGold = background, lastGuess = background;
// As we go through the document, there are two events we might be
// interested in. One is when a gold entity ends, and the other
// is when a guessed entity ends.
If the gold and guessed // entities end at the same time, started at the same time, and // match entity type, we have a true positive. Otherwise we // either have a false positive or a false negative. String str = ""; String s = ""; for (CoreLabel l : doc) { s += " " + l.word() + ":" + l.get(CoreAnnotations.GoldAnswerAnnotation.class) + ":" + l.get(whichClassToCompare); } for (CoreLabel line : doc) { String gold = line.get(CoreAnnotations.GoldAnswerAnnotation.class); String guess = line.get(whichClassToCompare); if (gold == null || guess == null) return false; if (lastGold != null && !lastGold.equals(gold) && !lastGold.equals(background)) { if (lastGuess.equals(lastGold) && !lastGuess.equals(guess) && goldIndex == guessIndex) { wordTP.incrementCount(str); entityTP.incrementCount(lastGold, 1.0); } else { // System.out.println("false negative: " + str); wordFN.incrementCount(str); entityFN.incrementCount(lastGold, 1.0); str = ""; } } if (lastGuess != null && !lastGuess.equals(guess) && !lastGuess.equals(background)) { if (lastGuess.equals(lastGold) && !lastGuess.equals(guess) && goldIndex == guessIndex && !lastGold.equals(gold)) { // correct guesses already tallied // str = ""; // only need to tally false positives } else { // System.out.println("false positive: " + str); entityFP.incrementCount(lastGuess, 1.0); wordFP.incrementCount(str); } str = ""; } if (lastGuess != null && lastGold != null && lastGold.equals(background) && lastGuess.equals(background)) { str = ""; } if (lastGold == null || !lastGold.equals(gold)) { lastGold = gold; goldIndex = index; } if (lastGuess == null || !lastGuess.equals(guess)) { lastGuess = guess; guessIndex = index; } ++index; if (str.isEmpty()) str = line.word(); else str += " " + line.word(); } // We also have to account for entities at the very end of the // document, since the above logic only occurs when we see // something that tells us an entity has ended if (lastGold != null && !lastGold.equals(background)) { if (lastGold.equals(lastGuess) && goldIndex == guessIndex) { entityTP.incrementCount(lastGold, 1.0); wordTP.incrementCount(str); } else { entityFN.incrementCount(lastGold, 1.0); wordFN.incrementCount(str); } str = ""; } if (lastGuess != null && !lastGuess.equals(background)) { if (lastGold.equals(lastGuess) && goldIndex == guessIndex) { // correct guesses already tallied } else { entityFP.incrementCount(lastGuess, 1.0); wordFP.incrementCount(str); } str = ""; } return true; } /** * Count the successes and failures of the model on the given document * ***token-based***. Fills numbers in to counters for true positives, false * positives, and false negatives, and also keeps track of the entities seen. <br> * Returns false if we ever encounter null for gold or guess. * * this currently is only for testing one label at a time */ public static void countResultsPerToken(List<CoreLabel> doc, Counter<String> entityTP, Counter<String> entityFP, Counter<String> entityFN, String background, Counter<String> wordTP, Counter<String> wordTN, Counter<String> wordFP, Counter<String> wordFN, Class<? extends TypesafeMap.Key<String>> whichClassToCompare) { IOBUtils.countEntityResults(doc, entityTP, entityFP, entityFN, background); // int index = 0; // int goldIndex = 0, guessIndex = 0; // String lastGold = background, lastGuess = background; // As we go through the document, there are two events we might be // interested in. One is when a gold entity ends, and the other // is when a guessed entity ends. 
// If the gold and guessed
// entities end at the same time, started at the same time, and
// match entity type, we have a true positive. Otherwise we
// either have a false positive or a false negative.
for (CoreLabel line : doc) {
String gold = line.get(GoldAnswerAnnotation.class);
String guess = line.get(whichClassToCompare);
assert (gold != null) : "gold is null";
assert (guess != null) : "guess is null";
if (gold.equals(guess) && !gold.equalsIgnoreCase(background)) {
entityTP.incrementCount(gold);
wordTP.incrementCount(line.word());
} else if (!gold.equals(guess) && !gold.equalsIgnoreCase(background) && guess.equalsIgnoreCase(background)) {
entityFN.incrementCount(gold);
wordFN.incrementCount(line.word());
} else if (!gold.equals(guess) && !guess.equalsIgnoreCase(background) && gold.equalsIgnoreCase(background)) {
wordFP.incrementCount(line.word());
entityFP.incrementCount(guess);
} else if (gold.equals(guess) && gold.equalsIgnoreCase(background)) {
// both gold and guess are the background symbol: a true negative
wordTN.incrementCount(line.word());
} else if (!(gold.equalsIgnoreCase(background) && guess.equalsIgnoreCase(background)))
throw new RuntimeException("Should not reach here; this method is not meant for more than one entity label: " + gold + " and " + guess);
}
}
public static void countResults(List<CoreLabel> doc, Counter<String> entityTP, Counter<String> entityFP, Counter<String> entityFN, String background, Counter<String> wordTP, Counter<String> wordTN, Counter<String> wordFP, Counter<String> wordFN, Class<? extends TypesafeMap.Key<String>> whichClassToCompare, boolean evalPerEntity) {
if (evalPerEntity) {
countResultsPerEntity(doc, entityTP, entityFP, entityFN, background, wordTP, wordTN, wordFP, wordFN, whichClassToCompare);
} else {
countResultsPerToken(doc, entityTP, entityFP, entityFN, background, wordTP, wordTN, wordFP, wordFN, whichClassToCompare);
}
}
private void writeLabelDataSents(Map<String, DataInstance> sents, BufferedWriter writer) throws IOException {
for (Entry<String, DataInstance> sent : sents.entrySet()) {
writer.write(sent.getKey() + "\t");
Map<String, Boolean> lastWordLabeled = new HashMap<>();
for (String label : constVars.getLabels()) {
lastWordLabeled.put(label, false);
}
for (CoreLabel s : sent.getValue().getTokens()) {
String str = "";
//write them in reverse order
List<String> listEndedLabels = new ArrayList<>();
//to first finish labels before starting
List<String> startingLabels = new ArrayList<>();
for (Entry<String, Class<?
extends TypesafeMap.Key<String>>> as : constVars.getAnswerClass().entrySet()) { String label = as.getKey(); boolean lastwordlabeled = lastWordLabeled.get(label); if (s.get(as.getValue()).equals(label)) { if (!lastwordlabeled) { startingLabels.add(label); } lastWordLabeled.put(label, true); } else { if (lastwordlabeled) { listEndedLabels.add(label); } lastWordLabeled.put(label, false); } } for(int i = listEndedLabels.size() -1 ; i >=0; i--) str += " </" + listEndedLabels.get(i) + ">"; for(String label : startingLabels){ str += " <" + label + "> "; } str += " " + s.word(); writer.write(str.trim() + " "); } writer.write("\n"); } } public void writeLabeledData(String outFile) throws IOException, ClassNotFoundException { BufferedWriter writer = new BufferedWriter(new FileWriter(outFile)); ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents); while(sentsIter.hasNext()){ Pair<Map<String, DataInstance>, File> sentsf = sentsIter.next(); this.writeLabelDataSents(sentsf.first(), writer); } writer.close(); } static public void writeColumnOutput(String outFile, boolean batchProcessSents, Map<String, Class<? extends TypesafeMap.Key<String>>> answerclasses) throws IOException, ClassNotFoundException { BufferedWriter writer = new BufferedWriter(new FileWriter(outFile)); ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(batchProcessSents); while(sentsIter.hasNext()){ Pair<Map<String, DataInstance>, File> sentsf = sentsIter.next(); writeColumnOutputSents(sentsf.first(), writer, answerclasses); } writer.close(); } private static void writeColumnOutputSents(Map<String, DataInstance> sents, BufferedWriter writer, Map<String, Class<? extends TypesafeMap.Key<String>>> answerclasses) throws IOException { for (Entry<String, DataInstance> sent : sents.entrySet()) { writer.write("\n\n" + sent.getKey() + "\n"); for (CoreLabel s : sent.getValue().getTokens()) { writer.write(s.word()+"\t"); Set<String> labels = new HashSet<>(); for (Entry<String, Class<? 
extends TypesafeMap.Key<String>>> as : answerclasses.entrySet()) { String label = as.getKey(); if (s.get(as.getValue()).equals(label)) { labels.add(label); } } if(labels.isEmpty()) writer.write("O\n"); else writer.write(StringUtils.join(labels,",")+"\n"); } writer.write("\n"); } } // public Map<String, DataInstance> loadJavaNLPAnnotatorLabeledFile(String // labeledFile, Properties props) throws FileNotFoundException { // System.out.println("Loading evaluate file " + labeledFile); // Map<String, DataInstance> sents = new HashMap<String, // DataInstance>(); // JavaNLPAnnotatorReaderAndWriter j = new JavaNLPAnnotatorReaderAndWriter(); // j.init(props); // Iterator<DataInstance> iter = j.getIterator(new BufferedReader(new // FileReader(labeledFile))); // int i = 0; // while (iter.hasNext()) { // i++; // DataInstance s = iter.next(); // String id = s.get(0).get(CoreAnnotations.DocIDAnnotation.class); // if (id == null) { // id = Integer.toString(i); // } // sents.put(id, s); // } // System.out.println("Read " + sents.size() + " eval sentences"); // return sents; // } // private void evaluate(String label, Map<String, DataInstance> sents) // throws IOException, InterruptedException, ExecutionException { // Redwood.log(Redwood.DBG, "labeling " + learnedWords.get(label)); // CollectionValuedMap<String, Integer> tokensMatchedPatterns = new // CollectionValuedMap<String, Integer>(); // // if (restrictToMatched) { // if (!alreadySetUp) // setUp(); // List<String> keyset = new ArrayList<String>(sents.keySet()); // int num = 0; // if (constVars.numThreads == 1) // num = keyset.size(); // else // num = keyset.size() / (constVars.numThreads - 1); // ExecutorService executor = Executors // .newFixedThreadPool(constVars.numThreads); // // Redwood.log(ConstantsAndVariables.minimaldebug, "keyset size is " + // // keyset.size()); // List<Future<Pair<TwoDimensionalCounter<Pair<String, String>, // SurfaceE>, CollectionValuedMap<String, Integer>>>> list = new // ArrayList<Future<Pair<TwoDimensionalCounter<Pair<String, String>, // SurfaceE>, CollectionValuedMap<String, Integer>>>>(); // for (int i = 0; i < constVars.numThreads; i++) { // // Redwood.log(ConstantsAndVariables.minimaldebug, "assigning from " + i * // // num + " till " + Math.min(keyset.size(), (i + 1) * num)); // // Callable<Pair<TwoDimensionalCounter<Pair<String, String>, SurfaceE>, // CollectionValuedMap<String, Integer>>> task = null; // task = new ApplyPatterns(keyset.subList(i * num, // Math.min(keyset.size(), (i + 1) * num)), // this.learnedPatterns.get(label), constVars.commonEngWords, // usePatternResultAsLabel, this.learnedWords.get(label).keySet(), // restrictToMatched, label, // constVars.removeStopWordsFromSelectedPhrases, // constVars.removePhrasesWithStopWords, constVars); // Future<Pair<TwoDimensionalCounter<Pair<String, String>, SurfaceE>, // CollectionValuedMap<String, Integer>>> submit = executor // .submit(task); // list.add(submit); // } // for (Future<Pair<TwoDimensionalCounter<Pair<String, String>, // SurfaceE>, CollectionValuedMap<String, Integer>>> future : list) { // Pair<TwoDimensionalCounter<Pair<String, String>, SurfaceE>, // CollectionValuedMap<String, Integer>> res = future // .get(); // tokensMatchedPatterns.addAll(res.second()); // } // executor.shutdown(); // } // // this.labelWords(label, sents, this.learnedWords.get(label).keySet(), // this.learnedPatterns.get(label).keySet(), null, tokensMatchedPatterns); // Counter<String> entityTP = new ClassicCounter<String>(); // Counter<String> entityFP = new 
ClassicCounter<String>(); // Counter<String> entityFN = new ClassicCounter<String>(); // for (Entry<String, DataInstance> sent : sents.entrySet()) { // for (CoreLabel l : sent.getValue()) { // if (l.containsKey(constVars.answerClass.get(label)) // && l.get(constVars.answerClass.get(label)) != null) // l.set(CoreAnnotations.AnswerAnnotation.class, // l.get(constVars.answerClass.get(label)).toString()); // if (!l.containsKey(CoreAnnotations.AnswerAnnotation.class) // || l.get(CoreAnnotations.AnswerAnnotation.class) == null) { // l.set(CoreAnnotations.AnswerAnnotation.class, // SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL); // // } // // } // CRFClassifier.countResults(sent.getValue(), entityTP, entityFP, entityFN, // SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL); // } // // Counter<String> precision = Counters.division(entityTP, // Counters.add(entityTP, entityFP)); // Counter<String> recall = Counters.division(entityTP, // Counters.add(entityTP, entityFN)); // Counter<String> fscore = Counters.getFCounter(precision, recall, 1.0); // System.out.println("Precision: " + precision); // System.out.println("Recall: " + recall); // System.out.println("FScore: " + fscore); // } public void evaluate(Map<String, DataInstance> testSentences, boolean evalPerEntity) throws IOException { for (Entry<String, Class<? extends Key<String>>> anscl : constVars.getAnswerClass().entrySet()) { String label = anscl.getKey(); Counter<String> entityTP = new ClassicCounter<>(); Counter<String> entityFP = new ClassicCounter<>(); Counter<String> entityFN = new ClassicCounter<>(); Counter<String> wordTP = new ClassicCounter<>(); Counter<String> wordTN = new ClassicCounter<>(); Counter<String> wordFP = new ClassicCounter<>(); Counter<String> wordFN = new ClassicCounter<>(); for (Entry<String, DataInstance> docEn : testSentences.entrySet()) { DataInstance doc = docEn.getValue(); List<CoreLabel> doceval = new ArrayList<>(); for (CoreLabel l : doc.getTokens()) { CoreLabel l2 = new CoreLabel(); l2.setWord(l.word()); if (l.get(anscl.getValue()).equals(label)) { l2.set(CoreAnnotations.AnswerAnnotation.class, label); } else l2.set(CoreAnnotations.AnswerAnnotation.class, constVars.backgroundSymbol); // If the gold label is not the label we are calculating the scores // for, set it to the background symbol if (!l.get(CoreAnnotations.GoldAnswerAnnotation.class).equals(label)) { l2.set(CoreAnnotations.GoldAnswerAnnotation.class, constVars.backgroundSymbol); } else l2.set(CoreAnnotations.GoldAnswerAnnotation.class, label); doceval.add(l2); } countResults(doceval, entityTP, entityFP, entityFN, constVars.backgroundSymbol, wordTP, wordTN, wordFP, wordFN, CoreAnnotations.AnswerAnnotation.class, evalPerEntity); // } System.out.println("False Positives: " + Counters.toSortedString(wordFP, wordFP.size(), "%s:%.2f", ";")); System.out.println("False Negatives: " + Counters.toSortedString(wordFN, wordFN.size(), "%s:%.2f", ";")); Redwood.log(Redwood.DBG, "\nFor label " + label + " True Positives: " + entityTP + "\tFalse Positives: " + entityFP + "\tFalse Negatives: " + entityFN); Counter<String> precision = Counters.division(entityTP, Counters.add(entityTP, entityFP)); Counter<String> recall = Counters.division(entityTP, Counters.add(entityTP, entityFN)); Redwood.log(ConstantsAndVariables.minimaldebug, "\nFor label " + label + " Precision: " + precision + ", Recall: " + recall + ", F1 score: " + FScore(precision, recall, 1)); // Redwood.log(ConstantsAndVariables.minimaldebug, "Total: " + // Counters.add(entityFP, entityTP)); } } public static 
<D> Counter<D> FScore(Counter<D> precision, Counter<D> recall, double beta) {
double betasq = beta * beta;
return Counters.divisionNonNaN(Counters.scale(Counters.product(precision, recall), (1 + betasq)), (Counters.add(Counters.scale(precision, betasq), recall)));
}
private static List<File> getAllFiles(String file) {
List<File> allFiles = new ArrayList<>();
for (String tokfile : file.split("[,;]")) {
File filef = new File(tokfile);
if (filef.isDirectory()) {
Redwood.log(Redwood.DBG, "Will read from directory " + filef);
String path = ".*";
File dir = filef;
for (File f : IOUtils.iterFilesRecursive(dir, java.util.regex.Pattern.compile(path))) {
Redwood.log(ConstantsAndVariables.extremedebug, "Will read from file " + f);
allFiles.add(f);
}
} else {
if (filef.exists()) {
Redwood.log(Redwood.DBG, "Will read from file " + filef);
allFiles.add(filef);
} else {
Redwood.log(Redwood.DBG, "Treating " + filef + " as a regular expression over file names");
//Is this a pattern?
RegExFileFilter fileFilter = new RegExFileFilter(java.util.regex.Pattern.compile(filef.getName()));
File dir = new File(tokfile.substring(0, tokfile.lastIndexOf("/")));
File[] files = dir.listFiles(fileFilter);
// listFiles returns null when the parent directory does not exist or cannot be read
if (files != null)
allFiles.addAll(Arrays.asList(files));
else
Redwood.log(Redwood.DBG, "No files found matching " + filef.getName() + " in " + dir);
}
}
}
return allFiles;
}
private Pair<Double, Double> getPrecisionRecall(String label, Map<String, Boolean> goldWords4Label) {
Set<CandidatePhrase> learnedWords = constVars.getLearnedWords(label).keySet();
int numcorrect = 0, numincorrect = 0;
int numgoldcorrect = 0;
for (Entry<String, Boolean> en : goldWords4Label.entrySet()) {
if (en.getValue())
numgoldcorrect++;
}
Set<String> assumedNeg = new HashSet<>();
for (CandidatePhrase e : learnedWords) {
if (!goldWords4Label.containsKey(e.getPhrase())) {
assumedNeg.add(e.getPhrase());
numincorrect++;
continue;
}
if (goldWords4Label.get(e.getPhrase())) {
numcorrect++;
} else
numincorrect++;
}
if (!assumedNeg.isEmpty())
log.info("\nGold entity list does not contain words " + assumedNeg + " for label " + label + ". *****Assuming them as negative.******");
double precision = numcorrect / (double) (numcorrect + numincorrect);
double recall = numcorrect / (double) (numgoldcorrect);
return new Pair<>(precision, recall);
}
private static double FScore(double precision, double recall, double beta) {
double betasq = beta * beta;
return (1 + betasq) * precision * recall / (betasq * precision + recall);
}
public Set<String> getNonBackgroundLabels(CoreLabel l){
Set<String> labels = new HashSet<>();
for(Map.Entry<String, Class<?
extends Key<String>>> en : constVars.getAnswerClass().entrySet()){
if(!l.get(en.getValue()).equals(constVars.backgroundSymbol)){
labels.add(en.getKey());
}
}
return labels;
}
public static Map<String, Set<CandidatePhrase>> readSeedWordsFromJSONString(String str){
Map<String, Set<CandidatePhrase>> seedWords = new HashMap<>();
JsonReader jsonReader = Json.createReader(new StringReader(str));
JsonObject obj = jsonReader.readObject();
jsonReader.close();
for (String o : obj.keySet()){
seedWords.put(o, new HashSet<>());
JsonArray arr = obj.getJsonArray(o);
// JsonValue.toString() on a JSON string includes the surrounding quotes; getString() returns the bare value
for(JsonValue v : arr)
seedWords.get(o).add(CandidatePhrase.createOrGet(((JsonString) v).getString()));
}
return seedWords;
}
public static Map<String, Set<CandidatePhrase>> readSeedWords(Properties props) {
String seedWordsFile = props.getProperty("seedWordsFiles");
if(seedWordsFile != null)
return readSeedWords(seedWordsFile);
else{
Redwood.log(Redwood.FORCE, "NO SEED WORDS FILES PROVIDED!!");
return Collections.emptyMap();
}
}
public static Map<String, Set<CandidatePhrase>> readSeedWords(String seedWordsFiles){
Map<String, Set<CandidatePhrase>> seedWords = new HashMap<>();
if (seedWordsFiles == null) {
throw new RuntimeException("Needs both seedWordsFiles and file parameters to run this class!\nseedWordsFiles has format: label1,filewithlistofwords1;label2,filewithlistofwords2;...");
}
for (String seedFile : seedWordsFiles.split(";")) {
String[] t = seedFile.split(",");
String label = t[0];
Set<CandidatePhrase> seedWords4Label = new HashSet<>();
for(int i = 1; i < t.length; i++){
String seedWordsFile = t[i];
for(File fin : ConstantsAndVariables.listFileIncludingItself(seedWordsFile)){
Redwood.log(Redwood.DBG, "Reading seed words from " + fin + " for label " + label);
for (String line : IOUtils.readLines(fin)) {
line = line.trim();
if (line.isEmpty() || line.startsWith("#")) {
continue;
}
line = line.split("\t")[0];
seedWords4Label.add(CandidatePhrase.createOrGet(line));
}
}
}
seedWords.put(label, seedWords4Label);
Redwood.log(ConstantsAndVariables.minimaldebug, "Number of seed words for label " + label + " is " + seedWords4Label.size());
}
return seedWords;
}
void removeLabelings(String label, Collection<String> removeLabeledPhrases){
//TODO: write this up when appropriate
}
static Class[] printOptionClass = {String.class, Boolean.class, Integer.class, Long.class, Double.class, Float.class};
public Map<String, String> getAllOptions(){
Map<String, String> values = new HashMap<>();
props.forEach((x, y) -> values.put(x.toString(), y.toString()));
values.putAll(constVars.getAllOptions());
//StringBuilder sb = new StringBuilder();
try {
Class<?> thisClass = this.getClass();
Field[] aClassFields = thisClass.getDeclaredFields();
//sb.append(this.getClass().getSimpleName() + " [ ");
// record fields whose type is primitive or one of the printable option classes
for(Field f : aClassFields){
if(f.getType().isPrimitive() || Arrays.asList(printOptionClass).contains(f.getType())){
String fName = f.getName();
Object fvalue = f.get(this);
values.put(fName, fvalue == null ? "null" : fvalue.toString());
//sb.append("(" + f.getType() + ") " + fName + " = " + f.get(this) + ", ");
}
}
} catch (Exception e) {
log.warn(e);
}
return values;
}
public static class Flags {
static public String useTargetParserParentRestriction = "useTargetParserParentRestriction";
public static String useTargetNERRestriction = "useTargetNERRestriction";
public static String posModelPath = "posModelPath";
public static String numThreads = "numThreads";
public static String patternType = "patternType";
public static
String numIterationsOfSavedPatternsToLoad = "numIterationsOfSavedPatternsToLoad"; public static String patternsWordsDir = "patternsWordsDir"; public static String loadModelForLabels = "loadModelForLabels"; } public static Pair<Map<String, DataInstance>,Map<String, DataInstance>> processSents(Properties props, Set<String> labels) throws IOException, ExecutionException, InterruptedException, ClassNotFoundException { String fileFormat = props.getProperty("fileFormat"); Map<String, DataInstance> sents = null; boolean batchProcessSents = Boolean.parseBoolean(props.getProperty("batchProcessSents", "false")); int numMaxSentencesPerBatchFile = Integer.parseInt(props.getProperty("numMaxSentencesPerBatchFile", String.valueOf(Integer.MAX_VALUE))); //works only for non-batch processing! boolean preserveSentenceSequence = Boolean.parseBoolean(props.getProperty("preserveSentenceSequence","false")); if (!batchProcessSents){ if(preserveSentenceSequence) sents = new LinkedHashMap<>(); else sents = new HashMap<>(); } else { Data.sentsFiles = new ArrayList<>(); Data.sentId2File = new ConcurrentHashMap<>(); } String file = props.getProperty("file"); String posModelPath = props.getProperty("posModelPath"); boolean lowercase = Boolean.parseBoolean(props.getProperty("lowercaseText")); boolean useTargetNERRestriction = Boolean.parseBoolean(props.getProperty("useTargetNERRestriction")); boolean useTargetParserParentRestriction = Boolean.parseBoolean(props.getProperty(Flags.useTargetParserParentRestriction)); boolean useContextNERRestriction = Boolean.parseBoolean(props.getProperty("useContextNERRestriction")); boolean addEvalSentsToTrain = Boolean.parseBoolean(props.getProperty("addEvalSentsToTrain","true")); String evalFileWithGoldLabels = props.getProperty("evalFileWithGoldLabels"); if (file == null && (evalFileWithGoldLabels == null || addEvalSentsToTrain == false)) { throw new RuntimeException("No training data! file is " + file + " and evalFileWithGoldLabels is " + evalFileWithGoldLabels + " and addEvalSentsToTrain is " + addEvalSentsToTrain); } if(props.getProperty(Flags.patternType) == null) throw new RuntimeException("PatternType not specified. 
  public static Pair<Map<String, DataInstance>,Map<String, DataInstance>> processSents(Properties props, Set<String> labels)
      throws IOException, ExecutionException, InterruptedException, ClassNotFoundException {

    String fileFormat = props.getProperty("fileFormat");
    Map<String, DataInstance> sents = null;
    boolean batchProcessSents = Boolean.parseBoolean(props.getProperty("batchProcessSents", "false"));
    int numMaxSentencesPerBatchFile = Integer.parseInt(props.getProperty("numMaxSentencesPerBatchFile", String.valueOf(Integer.MAX_VALUE)));

    //works only for non-batch processing!
    boolean preserveSentenceSequence = Boolean.parseBoolean(props.getProperty("preserveSentenceSequence","false"));

    if (!batchProcessSents){
      if(preserveSentenceSequence)
        sents = new LinkedHashMap<>();
      else
        sents = new HashMap<>();
    } else {
      Data.sentsFiles = new ArrayList<>();
      Data.sentId2File = new ConcurrentHashMap<>();
    }

    String file = props.getProperty("file");
    String posModelPath = props.getProperty("posModelPath");
    boolean lowercase = Boolean.parseBoolean(props.getProperty("lowercaseText"));
    boolean useTargetNERRestriction = Boolean.parseBoolean(props.getProperty("useTargetNERRestriction"));
    boolean useTargetParserParentRestriction = Boolean.parseBoolean(props.getProperty(Flags.useTargetParserParentRestriction));
    boolean useContextNERRestriction = Boolean.parseBoolean(props.getProperty("useContextNERRestriction"));
    boolean addEvalSentsToTrain = Boolean.parseBoolean(props.getProperty("addEvalSentsToTrain","true"));
    String evalFileWithGoldLabels = props.getProperty("evalFileWithGoldLabels");

    if (file == null && (evalFileWithGoldLabels == null || !addEvalSentsToTrain)) {
      throw new RuntimeException("No training data! file is " + file + " and evalFileWithGoldLabels is " + evalFileWithGoldLabels
          + " and addEvalSentsToTrain is " + addEvalSentsToTrain);
    }

    if(props.getProperty(Flags.patternType) == null)
      throw new RuntimeException("PatternType not specified. Options are SURFACE and DEP");

    PatternFactory.PatternType patternType = PatternFactory.PatternType.valueOf(props.getProperty(Flags.patternType));

    // Read training file
    if (file != null) {
      String saveSentencesSerDirstr = props.getProperty("saveSentencesSerDir");
      File saveSentencesSerDir = null;
      if (saveSentencesSerDirstr != null) {
        saveSentencesSerDir = new File(saveSentencesSerDirstr);
        // fileFormat may be null (the default, text), so guard before comparing it to "ser"
        if(saveSentencesSerDir.exists() && (fileFormat == null || !fileFormat.equalsIgnoreCase("ser")))
          IOUtils.deleteDirRecursively(saveSentencesSerDir);
        IOUtils.ensureDir(saveSentencesSerDir);
      }

      String systemdir = System.getProperty("java.io.tmpdir");
      File tempSaveSentencesDir = File.createTempFile("sents", ".tmp", new File(systemdir));
      tempSaveSentencesDir.deleteOnExit();
      tempSaveSentencesDir.delete();
      tempSaveSentencesDir.mkdir();

      int numFilesTillNow = 0;
      if (fileFormat == null || fileFormat.equalsIgnoreCase("text") || fileFormat.equalsIgnoreCase("txt")) {
        Map<String, DataInstance> sentsthis;
        if(preserveSentenceSequence)
          sentsthis = new LinkedHashMap<>();
        else
          sentsthis = new HashMap<>();

        for (File f : GetPatternsFromDataMultiClass.getAllFiles(file)) {
          Redwood.log(Redwood.DBG, "Annotating text in " + f);

          //String text = IOUtils.stringFromFile(f.getAbsolutePath());
          Iterator<String> reader = IOUtils.readLines(f).iterator();
          while(reader.hasNext()){
            numFilesTillNow = tokenize(reader, posModelPath, lowercase, useTargetNERRestriction || useContextNERRestriction,
                f.getName() + "-" + numFilesTillNow + "-", useTargetParserParentRestriction, props.getProperty(Flags.numThreads),
                batchProcessSents, numMaxSentencesPerBatchFile,
                saveSentencesSerDir == null ? tempSaveSentencesDir : saveSentencesSerDir, sentsthis, numFilesTillNow, patternType);
          }

          if (!batchProcessSents) {
            sents.putAll(sentsthis);
          }
        }

        if (!batchProcessSents) {
          // for(Map.Entry<String, DataInstance> d: sents.entrySet()){
          //   for(CoreLabel l : d.getValue().getTokens()){
          //     for(String label: labels) {
          //       if(l.containsKey(PatternsAnnotations.LongestMatchedPhraseForEachLabel.class)){
          //         CandidatePhrase p = l.get(PatternsAnnotations.LongestMatchedPhraseForEachLabel.class).get(label);
          //       }
          //     }
          //   }
          // }
          String outfilename = (saveSentencesSerDir == null ? tempSaveSentencesDir : saveSentencesSerDir) + "/sents_" + numFilesTillNow;
          if(saveSentencesSerDir != null)
            Data.inMemorySaveFileLocation = outfilename;

          Redwood.log(Redwood.FORCE, "Saving sentences in " + outfilename);
          IOUtils.writeObjectToFile(sents, outfilename);
        }
      } else if (fileFormat.equalsIgnoreCase("ser")) {
        for (File f : GetPatternsFromDataMultiClass.getAllFiles(file)) {
          Redwood.log(Redwood.DBG, "reading from ser file " + f);
          if (!batchProcessSents)
            sents.putAll((Map<String, DataInstance>) IOUtils.readObjectFromFile(f));
          else{
            File newf = new File(tempSaveSentencesDir.getAbsolutePath() + "/" + f.getAbsolutePath().replaceAll(java.util.regex.Pattern.quote("/"), "_"));
            IOUtils.cp(f, newf);
            Data.sentsFiles.add(newf);
          }
        }
      } else {
        throw new RuntimeException("Cannot identify the file format. Valid values are text (or txt) and ser, "
            + "where the serialized file is of the type Map<String, DataInstance>.");
      }
    }

    Map<String, DataInstance> evalsents = new HashMap<>();
    boolean evaluate = Boolean.parseBoolean(props.getProperty("evaluate"));

    // Read Evaluation File
    if (evaluate) {
      if (evalFileWithGoldLabels != null) {
        String saveEvalSentencesSerFile = props.getProperty("saveEvalSentencesSerFile");
        File saveEvalSentencesSerFileFile = null;
        if (saveEvalSentencesSerFile == null) {
          String systemdir = System.getProperty("java.io.tmpdir");
          saveEvalSentencesSerFileFile = File.createTempFile("evalsents", ".tmp", new File(systemdir));
        } else
          saveEvalSentencesSerFileFile = new File(saveEvalSentencesSerFile);

        Map setClassForTheseLabels = new HashMap<String, Class>();
        //boolean splitOnPunct = Boolean.parseBoolean(props.getProperty("splitOnPunct", "true"));
        List<File> allFiles = GetPatternsFromDataMultiClass.getAllFiles(evalFileWithGoldLabels);
        int numFile = 0;
        String evalFileFormat = props.getProperty("evalFileFormat");
        if (evalFileFormat == null || evalFileFormat.equalsIgnoreCase("text") || evalFileFormat.equalsIgnoreCase("txt")
            || evalFileFormat.startsWith("text")) {
          for (File f : allFiles) {
            numFile++;
            Redwood.log(Redwood.DBG, "Annotating text in " + f + ". Num file " + numFile);
            // evalFileFormat may be null here (the default, plain text)
            if(evalFileFormat != null && evalFileFormat.equalsIgnoreCase("textCoNLLStyle")){
              Map<String, DataInstance> sentsEval = AnnotatedTextReader.parseColumnFile(new BufferedReader(new FileReader(f)), labels, setClassForTheseLabels, true, f.getName());
              evalsents.putAll(runPOSNERParseOnTokens(sentsEval, props));
            } else{
              List<CoreMap> sentsCMs = AnnotatedTextReader.parseFile(new BufferedReader(new FileReader(f)), labels, setClassForTheseLabels, true, f.getName());
              evalsents.putAll(runPOSNEROnTokens(sentsCMs, posModelPath, useTargetNERRestriction || useContextNERRestriction, "",
                  useTargetParserParentRestriction, props.getProperty(Flags.numThreads), patternType));
            }
          }
        } else if (evalFileFormat.equalsIgnoreCase("ser")) {
          // the *eval* file format decides how the eval files are read
          for (File f : allFiles) {
            evalsents.putAll((Map<? extends String, ? extends DataInstance>) IOUtils.readObjectFromFile(f));
          }
        }

        IOUtils.writeObjectToFile(evalsents, saveEvalSentencesSerFileFile);

        // add the eval sentences to the training set only when the flag asks for it
        if (addEvalSentsToTrain) {
          Redwood.log(Redwood.DBG, "Adding " + evalsents.size() + " eval sents to the training set");
          if (batchProcessSents) {
            Data.sentsFiles.add(saveEvalSentencesSerFileFile);
            for(String k: evalsents.keySet())
              Data.sentId2File.put(k, saveEvalSentencesSerFileFile);
          } else
            sents.putAll(evalsents);
        }
      }
    }

    return new Pair<Map<String, DataInstance>,Map<String, DataInstance>>(sents, evalsents);
  }
  private void saveModel() throws IOException {
    String patternsWordsDirValue = props.getProperty("patternsWordsDir");
    String patternsWordsDir;
    if (patternsWordsDirValue.endsWith(".zip")) {
      File temp = File.createTempFile("patswords", "dir");
      temp.deleteOnExit();
      temp.delete();
      temp.mkdirs();
      patternsWordsDir = temp.getAbsolutePath();
    } else {
      patternsWordsDir = patternsWordsDirValue;
    }
    Redwood.log(Redwood.FORCE, "Saving output in " + patternsWordsDir);
    IOUtils.ensureDir(new File(patternsWordsDir));

    //writing properties file (with the path separator, so the file lands inside the directory)
    String outPropertiesFile = patternsWordsDir + "/model.properties";
    props.store(new BufferedWriter(new FileWriter(outPropertiesFile)), "trained model properties file");

    for (String label : constVars.getLabels()) {
      IOUtils.ensureDir(new File(patternsWordsDir + "/" + label));

      BufferedWriter seedW = new BufferedWriter(new FileWriter(patternsWordsDir + "/" + label + "/seedwords.txt"));
      for(CandidatePhrase p : constVars.getSeedLabelDictionary().get(label)){
        seedW.write(p.getPhrase() + "\n");
      }
      seedW.close();

      Map<Integer, Counter<E>> pats = getLearnedPatternsEachIter(label);
      IOUtils.writeObjectToFile(pats, patternsWordsDir + "/" + label + "/patternsEachIter.ser");

      BufferedWriter w = new BufferedWriter(new FileWriter(patternsWordsDir + "/" + label + "/phrases.txt"));
      writeWordsToFile(constVars.getLearnedWordsEachIter(label), w);

      //Write env
      writeClassesInEnv(constVars.env, ConstantsAndVariables.globalEnv, patternsWordsDir + "/env.txt");

      //Write the token mapping
      if (constVars.patternType.equals(PatternFactory.PatternType.SURFACE))
        IOUtils.writeStringToFile(Token.toStringClass2KeyMapping(), patternsWordsDir + "/tokenenv.txt", "utf8");

      w.close();
    }

    // if (patternsWordsDirValue.endsWith(".zip")) {
    //   Redwood.log("Saving the zipped model to " + patternsWordsDirValue);
    //   zip(patternsWordsDir, patternsWordsDirValue);
    // }
  }
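  // For reference, saveModel above lays the model out roughly like this (the label "drug"
  // is a hypothetical example):
  //
  //   patternsWordsDir/
  //     model.properties         properties the model was trained with
  //     env.txt                  TokensRegex variable bindings (see writeClassesInEnv)
  //     tokenenv.txt             token class-to-key mapping (surface patterns only)
  //     drug/
  //       seedwords.txt          one seed phrase per line
  //       patternsEachIter.ser   serialized Map<Integer, Counter<E>> of patterns per iteration
  //       phrases.txt            learned phrases per iteration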
  private void evaluate(Map<String, DataInstance> evalsents) throws IOException {
    if(constVars.goldEntitiesEvalFiles != null) {
      for (String label : constVars.getLabels()) {
        if(constVars.goldEntities.containsKey(label)){
          Pair<Double, Double> pr = getPrecisionRecall(label, constVars.goldEntities.get(label));
          Redwood.log(ConstantsAndVariables.minimaldebug,
              "\nFor label " + label + ": Number of gold entities is " + constVars.goldEntities.get(label).size()
                  + ", Precision is " + df.format(pr.first() * 100) + ", Recall is " + df.format(pr.second() * 100)
                  + ", F1 is " + df.format(FScore(pr.first(), pr.second(), 1.0) * 100) + "\n\n");
        }
      }
    }

    if(evalsents.size() > 0){
      boolean evalPerEntity = Boolean.parseBoolean(props.getProperty("evalPerEntity", "true"));
      evaluate(evalsents, evalPerEntity);
    }

    if (evalsents.size() == 0 && constVars.goldEntitiesEvalFiles == null)
      log.info("No eval sentences or list of gold entities provided to evaluate! Make sure evalFileWithGoldLabels or goldEntitiesEvalFiles is set, or turn off the evaluate flag");
  }

  /**
   * Execute the system given a properties file or object. Returns the model created.
   * @param props
   */
  public static<E extends Pattern> GetPatternsFromDataMultiClass<E> run(Properties props)
      throws IOException, ClassNotFoundException, IllegalAccessException, InterruptedException, ExecutionException,
      InstantiationException, NoSuchMethodException, InvocationTargetException, SQLException {
    Map<String, Set<CandidatePhrase>> seedWords = readSeedWords(props);

    Map<String, Class> answerClasses = new HashMap<>();
    String ansClasses = props.getProperty("answerClasses");
    if (ansClasses != null) {
      for (String l : ansClasses.split(";")) {
        String[] t = l.split(",");
        String label = t[0];
        String cl = t[1];
        Class answerClass = ClassLoader.getSystemClassLoader().loadClass(cl);
        answerClasses.put(label, answerClass);
      }
    }

    //process all the sentences here!
    Pair<Map<String, DataInstance>, Map<String, DataInstance>> sentsPair = processSents(props, seedWords.keySet());
    boolean labelUsingSeedSets = Boolean.parseBoolean(props.getProperty("labelUsingSeedSets", "true"));
    GetPatternsFromDataMultiClass<E> model = new GetPatternsFromDataMultiClass<>(props, sentsPair.first(), seedWords, labelUsingSeedSets);
    return runNineYards(model, props, sentsPair.second());
  }
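  // A minimal programmatic usage sketch (the property values are hypothetical; wrap the call
  // in a try/catch for the checked exceptions run declares). The optional answerClasses
  // property maps each label to an annotation class, in the same
  // label1,class1;label2,class2 syntax used for seedWordsFiles:
  //
  //   Properties props = new Properties();
  //   props.setProperty("file", "data/notes.txt");
  //   props.setProperty("seedWordsFiles", "drug,seeds/drugs.txt");
  //   props.setProperty("patternType", "SURFACE");
  //   GetPatternsFromDataMultiClass<SurfacePattern> model =
  //       GetPatternsFromDataMultiClass.<SurfacePattern>run(props);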
  private static<E extends Pattern> GetPatternsFromDataMultiClass<E> runNineYards(GetPatternsFromDataMultiClass<E> model, Properties props, Map<String, DataInstance> evalsents)
      throws IOException, ClassNotFoundException {
    ArgumentParser.fillOptions(model, props);

    // If you want to reuse patterns and words learned previously (maybe on another dataset, etc.)
    boolean loadSavedPatternsWordsDir = Boolean.parseBoolean(props.getProperty("loadSavedPatternsWordsDir"));

    //#################### Load already saved patterns and phrases
    if (loadSavedPatternsWordsDir)
      loadFromSavedPatternsWordsDir(model, props);

    if (model.constVars.learn) {
      Map<String, E> p0 = new HashMap<>();
      Map<String, Counter<CandidatePhrase>> p0Set = new HashMap<>();
      Map<String, Set<E>> ignorePatterns = new HashMap<>();
      model.iterateExtractApply(p0, p0Set, ignorePatterns);
    }

    //############ Write output files
    if (model.constVars.markedOutputTextFile != null)
      model.writeLabeledData(model.constVars.markedOutputTextFile);

    if(model.constVars.columnOutputFile != null)
      writeColumnOutput(model.constVars.columnOutputFile, model.constVars.batchProcessSents, model.constVars.getAnswerClass());

    //###################### SAVE MODEL
    if(model.constVars.savePatternsWordsDir)
      model.saveModel();

    //######## EVALUATE ###########################
    boolean evaluate = Boolean.parseBoolean(props.getProperty("evaluate"));
    if (evaluate && evalsents != null) {
      model.evaluate(evalsents);
    }

    if(model.constVars.saveInvertedIndex){
      model.constVars.invertedIndex.saveIndex(model.constVars.invertedIndexDirectory);
    }

    if(model.constVars.storePatsForEachToken.equals(ConstantsAndVariables.PatternForEachTokenWay.LUCENE)){
      model.patsForEachToken.close();
    }
    return model;
  }

  static int numIterationsLoadedModel = 0;

  // static void unzip(String file, String outputDir) throws IOException {
  //   ZipFile zipFile = new ZipFile(file);
  //   Enumeration<? extends ZipEntry> entries = zipFile.entries();
  //   while (entries.hasMoreElements()) {
  //     ZipEntry entry = entries.nextElement();
  //     Path entryDestination = new File(outputDir, entry.getName()).toPath();
  //     entryDestination.toFile().getParentFile().mkdirs();
  //     if (entry.isDirectory())
  //       entryDestination.toFile().mkdirs();
  //     else {
  //       InputStream in = zipFile.getInputStream(entry);
  //       Files.copy(in, entryDestination);
  //       in.close();
  //     }
  //   }
  // }
  //
  // static void zip(String directory, String outputFileName) throws IOException {
  //   FileOutputStream fos = new FileOutputStream(outputFileName);
  //   ZipOutputStream zos = new ZipOutputStream(fos);
  //   //level - the compression level (0-9)
  //   zos.setLevel(9);
  //   addFolder(zos, directory, directory);
  //   zos.close();
  // }
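  // addFolder below is the recursive helper for the commented-out zip() above: every
  // directory other than the base gets an explicit zip entry (with a trailing separator),
  // and files are streamed into the archive in 1 KB chunks.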
  /** copied from http://www.justexample.com/wp/compress-folder-into-zip-file-using-java/ */
  private static void addFolder(ZipOutputStream zos, String folderName, String baseFolderName) throws IOException {
    File f = new File(folderName);
    if(f.exists()){
      if(f.isDirectory()){
        if(!folderName.equalsIgnoreCase(baseFolderName)){
          String entryName = folderName.substring(baseFolderName.length() + 1, folderName.length()) + File.separatorChar;
          System.out.println("Adding folder entry " + entryName);
          ZipEntry ze = new ZipEntry(entryName);
          zos.putNextEntry(ze);
        }
        File[] f2 = f.listFiles();
        for (File aF2 : f2) {
          addFolder(zos, aF2.getAbsolutePath(), baseFolderName);
        }
      } else {
        //add file
        //extract the relative name for entry purpose
        String entryName = folderName.substring(baseFolderName.length() + 1, folderName.length());
        ZipEntry ze = new ZipEntry(entryName);
        zos.putNextEntry(ze);
        FileInputStream in = new FileInputStream(folderName);
        int len;
        byte[] buffer = new byte[1024];
        // copy the file contents into the zip entry; read() returns -1 at end of stream
        while ((len = in.read(buffer)) > 0) {
          zos.write(buffer, 0, len);
        }
        in.close();
        zos.closeEntry();
        System.out.println("OK!");
      }
    } else {
      System.out.println("File or directory not found " + folderName);
    }
  }
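  // A hedged sketch of the properties consulted when reloading a saved model via
  // loadFromSavedPatternsWordsDir below (values are illustrative):
  //
  //   loadSavedPatternsWordsDir=true        # turn the reload on (checked in runNineYards)
  //   patternsWordsDir=savedmodel           # directory written earlier by saveModel
  //   numIterationsOfSavedPatternsToLoad=2  # keep only iterations before 2; default is all
  //   loadModelForLabels=drug;disease       # restrict the reload to these labels (comma/semicolon separated)
  //   labelSentsUsingModel=true             # re-label sentences with the loaded phrases
  //   applyPatsUsingModel=true              # re-apply the loaded patterns to the data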
"/env.txt", model.constVars.env, ConstantsAndVariables.globalEnv); //Read the token mapping if(model.constVars.patternType.equals(PatternFactory.PatternType.SURFACE)) Token.setClass2KeyMapping(new File(patternsWordsDir+"/tokenenv.txt")); //Load Patterns File patf = new File(patternsWordsDir + "/" + label + "/patternsEachIter.ser"); if (patf.exists()) { Map<Integer, Counter<E>> patterns = IOUtils.readObjectFromFile(patf); if(numIterationsOfSavedPatternsToLoad < Integer.MAX_VALUE){ Set<Integer> toremove = new HashSet<>(); for(Integer i : patterns.keySet()){ if(i >= numIterationsOfSavedPatternsToLoad){ System.out.println("Removing patterns from iteration " + i); toremove.add(i); } } for(Integer i: toremove) patterns.remove(i); } Counter<E> pats = Counters.flatten(patterns); for(E p : pats.keySet()){ labelsForPattterns.put(p, label); } numIterationsLoadedModel = Math.max(numIterationsLoadedModel, patterns.size()); model.setLearnedPatterns(pats, label); model.setLearnedPatternsEachIter(patterns, label); Redwood.log(Redwood.DBG, "Loaded " + model.getLearnedPatterns().get(label).size() + " patterns from " + patf); } //Load Words File wordf = new File(patternsWordsDir + "/" + label + "/phrases.txt"); if (wordf.exists()) { TreeMap<Integer, Counter<CandidatePhrase>> words = GetPatternsFromDataMultiClass.readLearnedWordsFromFile(wordf); model.constVars.setLearnedWordsEachIter(words, label); if(numIterationsOfSavedPatternsToLoad < Integer.MAX_VALUE){ Set<Integer> toremove = new HashSet<>(); for(Integer i : words.keySet()){ if(i >= numIterationsOfSavedPatternsToLoad){ System.out.println("Removing patterns from iteration " + i); toremove.add(i); } } for(Integer i: toremove) words.remove(i); } numIterationsLoadedModel = Math.max(numIterationsLoadedModel, words.size()); Redwood.log(Redwood.DBG, "Loaded " + words.size() + " phrases from " + wordf); } CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>(); Iterator<Pair<Map<String, DataInstance>, File>> sentsIter = new ConstantsAndVariables.DataSentsIterator(model.constVars.batchProcessSents); TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted = new TwoDimensionalCounter<>(); Set<CandidatePhrase> alreadyLabeledWords = new HashSet<>(); while(sentsIter.hasNext()){ Pair<Map<String, DataInstance>, File> sents = sentsIter.next(); if(labelSentsUsingModel){ Redwood.log(Redwood.DBG, "labeling sentences from " + sents.second() + " with the already learned words"); assert sents.first() != null : "Why are sents null"; model.labelWords(label, sents.first(), model.constVars.getLearnedWords(label).keySet(), sentsOutFile, matchedTokensByPat); if(sents.second().exists()) IOUtils.writeObjectToFile(sents, sents.second()); } if (model.constVars.restrictToMatched || applyPatsUsingModel) { Redwood.log(Redwood.DBG,"Applying patterns to " + sents.first().size() + " sentences"); model.constVars.invertedIndex.add(sents.first(), true); model.constVars.invertedIndex.add(sents.first(), true); model.scorePhrases.applyPats(model.getLearnedPatterns(label), label, wordsandLemmaPatExtracted, matchedTokensByPat, alreadyLabeledWords); } } Counters.addInPlace(model.wordsPatExtracted.get(label), wordsandLemmaPatExtracted); System.out.println("All Extracted phrases are " + wordsandLemmaPatExtracted.firstKeySet()); } System.out.flush(); System.err.flush(); return labelsForPattterns; } private void setLearnedPatternsEachIter(Map<Integer, Counter<E>> patterns, String label) { this.learnedPatternsEachIter.put(label, patterns); } 
  private static void readClassesInEnv(String s, Map<String, Env> env, Env globalEnv) throws ClassNotFoundException {
    for(String line : IOUtils.readLines(s)){
      String[] toks = line.split("###");
      if(toks.length == 3){
        String label = toks[0];
        String name = toks[1];
        Class c = Class.forName(toks[2]);
        if(!env.containsKey(label))
          env.put(label, TokenSequencePattern.getNewEnv());
        env.get(label).bind(name, c);
      } else if(toks.length == 2){
        String name = toks[0];
        Class c = Class.forName(toks[1]);
        assert c != null : "Why is the class for " + toks[1] + " null?";
        globalEnv.bind(name, c);
      } else
        throw new RuntimeException("Ill-formed env file!");
    }
  }

  private static void writeClassesInEnv(Map<String, Env> env, Env globalEnv, String file) throws IOException {
    BufferedWriter w = new BufferedWriter(new FileWriter(file));
    for(Entry<String, Env> en : env.entrySet()){
      for(Entry<String, Object> en2 : en.getValue().getVariables().entrySet()){
        if(en2.getValue() instanceof Class)
          w.write(en.getKey() + "###" + en2.getKey() + "###" + ((Class) en2.getValue()).getName() + "\n");
      }
    }
    for(Entry<String, Object> en2 : globalEnv.getVariables().entrySet()){
      if(en2.getValue() instanceof Class)
        w.write(en2.getKey() + "###" + ((Class) en2.getValue()).getName() + "\n");
    }
    w.close();
  }

  public static String elapsedTime(Date d1, Date d2){
    try{
      Duration period = Duration.between(d1.toInstant(), d2.toInstant());
      // Note: this will become easier with Java 9, using toDaysPart() etc.
      long days = period.toDays();
      period = period.minusDays(days);
      long hours = period.toHours();
      period = period.minusHours(hours);
      long minutes = period.toMinutes();
      period = period.minusMinutes(minutes);
      long seconds = period.getSeconds();
      return days + " days, " + hours + " hours, " + minutes + " minutes, " + seconds + " seconds";
    } catch(java.lang.IllegalArgumentException e) {
      log.warn(e);
    }
    return "";
  }

  public static void main(String[] args) {
    try {
      Properties props = StringUtils.argsToPropertiesWithResolve(args);
      GetPatternsFromDataMultiClass.<SurfacePattern>run(props);
    } catch (OutOfMemoryError e) {
      System.out.println("Out of memory! Either increase the memory allotted to the JVM (for example, run as java -mx20g ... to allocate 20G), or consider using the batchProcessSents and numMaxSentencesPerBatchFile flags");
      log.warn(e);
    } catch (Exception e) {
      log.warn(e);
    }
  }
}