package arkref.analysis;

import java.util.*;

import arkref.analysis.Types.Gender;
import arkref.data.FirstNames;
import arkref.data.Mention;
import arkref.parsestuff.AnalysisUtilities;
import arkref.parsestuff.TregexPatternFactory;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;

/**
 * Grammatical / semantic attributes of mentions (gender, personhood, number,
 * grammatical person) and the compatibility tests built on them.
 *
 * In all cases, null indicates unknown; that is, our system does not know.
 */
public class Types {

    /** Grammatical gender. {@code null} means unknown. */
    public static enum Gender {
        Male("Mal"), Female("Fem");

        private final String label;

        Gender(String label) {
            this.label = label;
        }

        /** @return the short label: "Mal" or "Fem". */
        public String toString() {
            return label;
        }
    }

    /** TODO: clarify, what's the difference between MaybePer and null ? **/
    public static enum Personhood {
        Person("Per"), NotPerson("NPer"), MaybePerson("MaybePer");

        private final String label;

        Personhood(String label) {
            this.label = label;
        }

        /** @return the short label: "Per", "NPer" or "MaybePer". */
        public String toString() {
            return label;
        }
    }

    /** Grammatical number. {@code null} means unknown. */
    public static enum Number {
        Singular("Sg"), Plural("Pl");

        private final String label;

        Number(String label) {
            this.label = label;
        }

        /** @return the short label: "Sg" or "Pl". */
        public String toString() {
            return label;
        }
    }

    /** First person, second person, third person. aka "grammatical person". **/
    public static enum Perspective {
        First("1"), Second("2"), Third("3");

        private final String label;

        Perspective(String label) {
            this.label = label;
        }

        /** @return the short label: "1", "2" or "3". */
        public String toString() {
            return label;
        }
    }

    // ---------------------------------------------------------------
    // Word lists. All pronoun lists are lower-case because pronoun(Tree)
    // lower-cases the token before it is looked up.
    // ---------------------------------------------------------------

    /** Lower-cased titles that mark a mention as a person. */
    private static final Set<String> PERSON_TITLES =
            wordSet("mr.", "mrs.", "dr.", "fr.", "drs.", "ms.");

    private static final Set<String> MALE_PRONOUNS =
            wordSet("he", "him", "his", "himself");

    private static final Set<String> FEMALE_PRONOUNS =
            wordSet("she", "her", "hers", "herself");

    /** Pronouns that always denote a person (includes the second-person forms your/yourself/yourselves). */
    private static final Set<String> PERSON_PRONOUNS =
            wordSet("me", "he", "him", "his", "she", "her", "hers", "we", "us", "our", "ours",
                    "i", "my", "mine", "you", "your", "yours",
                    "himself", "herself", "ourselves", "myself", "yourself", "yourselves");

    private static final Set<String> NON_PERSON_PRONOUNS =
            wordSet("it", "its", "itself");

    private static final Set<String> MAYBE_PERSON_PRONOUNS =
            wordSet("they", "their", "theirs", "them", "these", "those", "themselves");

    private static final Set<String> FIRST_PERSON_PRONOUNS =
            wordSet("i", "me", "us", "my", "mine", "we", "our", "ours", "ourselves", "myself");

    private static final Set<String> SECOND_PERSON_PRONOUNS =
            wordSet("you", "your", "yours", "y'all", "y'alls", "yinz", "yourself", "yourselves");

    private static final Set<String> PLURAL_PRONOUNS =
            wordSet("they", "them", "these", "those", "we", "us", "their", "ours", "our",
                    "theirs", "themselves", "ourselves");

    /** Matched against the raw head word (no case folding). */
    private static final Set<String> REFLEXIVE_WORDS =
            wordSet("itself", "yourself", "myself", "himself", "herself", "themselves", "ourselves");

    /** Matched against the raw head word (no case folding). */
    private static final Set<String> POSSESSIVE_WORDS =
            wordSet("its", "his", "her", "their", "our", "my");

    /** Builds an immutable set from the given words. */
    private static Set<String> wordSet(String... words) {
        return Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(words)));
    }

    // ---------------------------------------------------------------
    // Attribute comparison
    // ---------------------------------------------------------------

    /**
     * Lenient equality: an unknown (null) value is compatible with anything.
     * Known values are compared by identity, which is what the enum
     * attributes in this class need.
     *
     * @return true if either argument is null or both are the same object
     */
    public static <T> boolean relaxedEquals(T x, T y) {
        if (x == null || y == null) {
            return true;
        }
        return x == y;
    }

    /**
     * Gender compatibility in which unknown gender defaults to male:
     * unknown matches unknown and Male, but never Female.
     * See testDefaultMale().
     */
    public static boolean sexistGenderEquals(Gender x, Gender y) {
        if (x == null && y == null) {
            return true;
        }
        if (x == null && y == Gender.Male) {
            return true;
        }
        if (y == null && x == Gender.Male) {
            return true;
        }
        return x == y;
    }

    /**
     * Personhood compatibility. Two values are compatible when both are in
     * {unknown, NotPerson, MaybePerson} or both are in {Person, MaybePerson}.
     * See testEntityTypeMatching(), testThey().
     */
    public static boolean personhoodEquals(Personhood x, Personhood y) {
        boolean xNonPersonLike = x == null || x == Personhood.NotPerson || x == Personhood.MaybePerson;
        boolean yNonPersonLike = y == null || y == Personhood.NotPerson || y == Personhood.MaybePerson;
        if (xNonPersonLike && yNonPersonLike) {
            return true;
        }

        boolean xPersonLike = x == Personhood.Person || x == Personhood.MaybePerson;
        boolean yPersonLike = y == Personhood.Person || y == Personhood.MaybePerson;
        if (xPersonLike && yPersonLike) {
            return true;
        }

        return x == y;
    }

    /**
     * Decides whether the pronominal {@code mention} is compatible with the
     * candidate antecedent {@code cand} on personhood, gender and number.
     *
     * @param mention a pronominal mention (asserted)
     * @param cand    the candidate antecedent
     * @return true if no attribute rules the pairing out
     */
    public static boolean checkPronominalMatch(Mention mention, Mention cand) {
        assert isPronominal(mention);
        String pronoun = pronoun(mention);

        // First- and second-person pronouns may only resolve to other pronouns (testFirstPerson).
        if (!isPronominal(cand) && perspective(pronoun) != Perspective.Third) {
            return false;
        }

        // Requiring perspective(pronoun) == perspective(cand) for pronominal
        // candidates was tried and disabled: it hurts recall a good bit (!)

        // using lax test on personhood because i don't know how to get it for most common nouns
        // number is easiest to get
        // gender is gray area
        // TODO: "they" should be able to match singular nouns for groups
        return personhoodEquals(personhood(pronoun), personhood(cand))
                && sexistGenderEquals(gender(mention), gender(cand))
                && relaxedEquals(number(mention), number(cand));
    }

    // ---------------------------------------------------------------
    // Pronoun detection
    // ---------------------------------------------------------------

    /** @return true if the mention's head word is one of the listed reflexive pronouns (case-sensitive). */
    public static boolean isReflexive(Mention m) {
        return REFLEXIVE_WORDS.contains(m.getHeadWord());
    }

    /**
     * @return true if the mention has a parse node matching the Tregex
     *         pattern {@code NP <<# /^PRP/ !>> NP}; false if it has no node
     */
    public static boolean isPronominal(Mention m) {
        if (m.node() == null) {
            return false;
        }
        TregexMatcher matcher = TregexPatternFactory.getPattern("NP <<# /^PRP/ !>> NP").matcher(m.node());
        return matcher.find();
    }

    /**
     * @return the lower-cased pronoun heading the mention, or null if the
     *         mention's node does not match the pronominal pattern
     */
    public static String pronoun(Mention m) {
        TregexPattern pat = TregexPatternFactory.getPattern("NP=np <<# /^PRP/=pronoun !>> NP");
        TregexMatcher matcher = pat.matcher(m.node());
        if (!matcher.find()) {
            return null;
        }
        return pronoun(matcher.getNode("pronoun"));
    }

    /**
     * @param PRP a PRP* pre-terminal node
     * @return the lower-cased label of its first child (asserted to be a leaf)
     */
    public static String pronoun(Tree PRP) {
        Tree c = PRP.getChild(0);
        assert c.isLeaf();
        return c.label().toString().toLowerCase();
    }

    // ---------------------------------------------------------------
    // Gender
    // ---------------------------------------------------------------

    /**
     * Determines the gender of a mention.
     *
     * Pronouns are looked up directly. Other mentions are only examined when
     * their NE type is PERSON, noun.person or O; any other type (e.g.
     * LOCATION) is obviously not male or female.
     *
     * @return the gender, or null if unknown / not applicable
     */
    public static Gender gender(Mention m) {
        if (m.node() == null) {
            return null;
        }

        if (isPronominal(m)) {
            String p = pronoun(m);
            if (MALE_PRONOUNS.contains(p)) {
                return Gender.Male;
            }
            if (FEMALE_PRONOUNS.contains(p)) {
                return Gender.Female;
            }
            // it/its/itself (there is no Gender.Neuter) and every other pronoun: no decision
            return null;
        }

        String neType = m.neType();
        if (!neType.equalsIgnoreCase("PERSON")
                && !neType.equalsIgnoreCase("noun.person")
                && !neType.equals("O")) {
            return null;
        }

        return genderByFirstNamesOrTitles(m);
    }

    /**
     * True when {@code leaf} is an NNP token whose pre-terminal is a sibling
     * of the head pre-terminal, as in "(NP (NNP John) (POS 's))".
     */
    private static boolean isProperNounSiblingOfHead(Tree leaf, Tree node, Tree root, Tree head) {
        return leaf.parent(node).label().value().equals("NNP")
                && leaf.parent(root).parent(root) == head.parent(root);
    }

    /**
     * Goes through all the NNP tokens in the noun phrase and sees if any of
     * them are person names or gendered titles. If so, returns that gender.
     * Note: this will fail for ambiguous month/person names like "April".
     *
     * @return the gender of the first decisive token, or null if none is found
     */
    private static Gender genderByFirstNamesOrTitles(Mention m) {
        if (m.node() == null) {
            return null; // TODO we can still figure something out, right
        }

        Tree head = m.node().headPreTerminal(AnalysisUtilities.getInstance().getHeadFinder());
        Tree root = m.getSentence().rootNode();

        for (Tree leaf : m.node().getLeaves()) {
            if (!isProperNounSiblingOfHead(leaf, m.node(), root, head)) {
                continue;
            }
            String token = leaf.value();
            // Constant-first comparison so that a null lookup result cannot throw.
            String genderS = FirstNames.getInstance().getGenderString(token);
            if ("Mal".equals(genderS) || token.equals("Mr.")) {
                return Gender.Male;
            } else if ("Fem".equals(genderS) || token.equals("Mrs.") || token.equals("Ms.")) {
                return Gender.Female;
            }
        }
        return null;
    }

    // ---------------------------------------------------------------
    // Personhood
    // ---------------------------------------------------------------

    /**
     * Looks for a personal title (Mr., Dr., ...) among the NNP siblings of
     * the head.
     *
     * @return Person if a title is found, MaybePerson if not, null if the
     *         mention has no parse node
     */
    private static Personhood personhoodByTitle(Mention m) {
        if (m.node() == null) {
            return null; // TODO we can still figure something out, right
        }

        Tree head = m.node().headPreTerminal(AnalysisUtilities.getInstance().getHeadFinder());
        Tree root = m.getSentence().rootNode();

        for (Tree leaf : m.node().getLeaves()) {
            if (!isProperNounSiblingOfHead(leaf, m.node(), root, head)) {
                continue;
            }
            if (PERSON_TITLES.contains(leaf.value().toLowerCase())) {
                return Personhood.Person;
            }
        }
        return Personhood.MaybePerson;
    }

    /**
     * Determines the personhood of a mention.
     *
     * @return Person when the NE type, a first name, or a title says so;
     *         null (unknown) when the NE type is "O"; NotPerson otherwise.
     *         Pronominal mentions are delegated to {@link #personhood(String)}.
     */
    public static Personhood personhood(Mention m) {
        if (isPronominal(m)) {
            return personhood(pronoun(m));
        }

        String t = m.neType();
        // A NounTypes lookup of the head word ("person") used to be part of this test; it is disabled.
        if (t.equalsIgnoreCase("PERSON")
                || t.equalsIgnoreCase("noun.person")
                || genderByFirstNamesOrTitles(m) != null
                || personhoodByTitle(m) == Personhood.Person) {
            return Personhood.Person;
        }
        if (t.equals("O")) {
            return null;
        }
        return Personhood.NotPerson;
    }

    /**
     * @param pronoun a lower-cased pronoun
     * @return Person, NotPerson or MaybePerson for known pronouns; null otherwise
     */
    public static Personhood personhood(String pronoun) {
        if (PERSON_PRONOUNS.contains(pronoun)) {
            return Personhood.Person;
        } else if (NON_PERSON_PRONOUNS.contains(pronoun)) {
            return Personhood.NotPerson;
        } else if (MAYBE_PERSON_PRONOUNS.contains(pronoun)) {
            return Personhood.MaybePerson;
        }
        return null;
    }

    // ---------------------------------------------------------------
    // Grammatical person
    // ---------------------------------------------------------------

    /**
     * Grammatical person of a pronoun. At least it is nice and reliably
     * deterministic.
     *
     * @param pronoun a lower-cased pronoun
     * @return First or Second for the listed pronouns; Third for anything else
     */
    public static Perspective perspective(String pronoun) {
        if (FIRST_PERSON_PRONOUNS.contains(pronoun)) {
            return Perspective.First;
        } else if (SECOND_PERSON_PRONOUNS.contains(pronoun)) {
            return Perspective.Second;
        } else {
            return Perspective.Third;
        }
    }

    /** Grammatical person of a pronominal mention (asserted). */
    public static Perspective perspective(Mention mention) {
        assert isPronominal(mention);
        return perspective(pronoun(mention));
    }

    // ---------------------------------------------------------------
    // Number
    // ---------------------------------------------------------------

    /**
     * Determines grammatical number.
     *
     * Mentions matching {@code NP < CC|CONJP !>> NP} are Plural. Pronouns are
     * Plural if listed, Singular otherwise. Other mentions are decided by the
     * POS tag of the head pre-terminal.
     *
     * @return the number, or null if the mention has no node or the head tag
     *         is not a noun tag
     */
    public static Number number(Mention m) {
        if (m.node() == null) {
            return null;
        }

        TregexPattern pat = TregexPatternFactory.getPattern("NP < CC|CONJP !>> NP");
        TregexMatcher matcher = pat.matcher(m.node());
        if (matcher.find()) {
            return Number.Plural;
        }

        if (isPronominal(m)) {
            String p = pronoun(m);
            // Everything not listed as plural (it, that, this, he, she, ...) counts as singular.
            return PLURAL_PRONOUNS.contains(p) ? Number.Plural : Number.Singular;
        }

        HeadFinder hf = AnalysisUtilities.getInstance().getHeadFinder();
        Tree head = m.node().headPreTerminal(hf);
        String tag = head.label().toString();

        // An organization/group type check (returning null for such heads) is
        // deliberately disabled -- gives only slim gains on ACE eval and
        // potentially complicates other analysis.
        // Causes data/they2 unit test to fail: TestArkref.testThey()

        // plural vs singular tags: http://bulba.sdsu.edu/jeanette/thesis/PennTags.html
        if (tag.matches("^NNP?S$")) {
            return Number.Plural;
        }
        if (tag.matches("^NNP?$")) {
            return Number.Singular;
        }
        // TODO mass nouns?
        return null;
    }

    /** @return true if the mention's head word is one of the listed possessive pronouns (case-sensitive). */
    public static boolean isPossessive(Mention mention) {
        return POSSESSIVE_WORDS.contains(mention.getHeadWord());
    }
}