package edu.stanford.nlp.patterns; import edu.stanford.nlp.patterns.dep.DepPatternFactory; import edu.stanford.nlp.patterns.surface.SurfacePatternFactory; import edu.stanford.nlp.util.ArgumentParser; import java.util.*; /** * Created by sonalg on 10/27/14. */ public class PatternFactory { /** * allow to match stop words before a target term. This is to match something * like "I am on some X" if the pattern is "I am on X" */ @ArgumentParser.Option(name = "useStopWordsBeforeTerm") public static boolean useStopWordsBeforeTerm = false; /** * Add NER restriction to the target phrase in the patterns */ @ArgumentParser.Option(name = "useTargetNERRestriction") public static boolean useTargetNERRestriction = false; /** * */ @ArgumentParser.Option(name="useNER") public static boolean useNER = true; /** * Can just write a number (if same for all labels) or "Label1,2;Label2,3;...." */ @ArgumentParser.Option(name = "numWordsCompound") public static String numWordsCompound = "2"; public static Map<String, Integer> numWordsCompoundMapped = new HashMap<>(); public static int numWordsCompoundMax = 2; /** * Use lemma instead of words for the context tokens */ @ArgumentParser.Option(name = "useLemmaContextTokens") public static boolean useLemmaContextTokens = true; public static List<String> fillerWords = Arrays.asList("a", "an", "the", "`", "``", "'", "''"); /** * by default doesn't ignore anything. What phrases to ignore. */ public static java.util.regex.Pattern ignoreWordRegex = java.util.regex.Pattern.compile("a^"); public static void setUp(Properties props, PatternType patternType, Set<String> labels) { ArgumentParser.fillOptions(PatternFactory.class, props); numWordsCompoundMax = 0; if (numWordsCompound.contains(",") || numWordsCompound.contains(";")) { String[] toks = numWordsCompound.split(";"); for(String t: toks) { String[] toks2 = t.split(","); int numWords = Integer.valueOf(toks2[1]); numWordsCompoundMapped.put(toks2[0], numWords); if(numWords > numWordsCompoundMax){ numWordsCompoundMax = numWords; } } } else { numWordsCompoundMax = Integer.valueOf(numWordsCompound); for(String label: labels){ numWordsCompoundMapped.put(label, Integer.valueOf(numWordsCompound)); } } if(patternType.equals(PatternType.SURFACE)) SurfacePatternFactory.setUp(props); else if(patternType.equals(PatternType.DEP)) DepPatternFactory.setUp(props); else throw new UnsupportedOperationException(); } public enum PatternType{SURFACE, DEP}; public static boolean doNotUse(String word, Set<CandidatePhrase> stopWords) { if (stopWords.contains(CandidatePhrase.createOrGet(word.toLowerCase())) || ignoreWordRegex.matcher(word).matches()) return true; else return false; } public static Map<Integer, Set> getPatternsAroundTokens(PatternType patternType, DataInstance sent, Set<CandidatePhrase> stopWords) { if(patternType.equals(PatternType.SURFACE)){ return SurfacePatternFactory.getPatternsAroundTokens(sent, stopWords); } else if(patternType.equals(PatternType.DEP)){ return (Map) DepPatternFactory.getPatternsAroundTokens(sent, stopWords); } else throw new UnsupportedOperationException(); } }