package doser.entitydisambiguation.algorithms.rules; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import doser.entitydisambiguation.algorithms.SurfaceForm; import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia; import doser.general.HelpfulMethods; class PatternRule extends AbstractRule { private static final int MINIMUMSURFACEFORMS = 14; private static final float OCCURRENCEPERCENTAGE = 0.25f; private static final String CUSTOMSTOPWORDLIST = "republic, people, party, national, a, about, above, across, after, again, against, all, almost, alone, along, already, also, although, always, am, among, an, and, another, any, anybody, anyone, anything, anywhere, are, area, areas, aren't, around, as, ask, asked, asking, asks, at, away, b, back, backed, backing, backs, be, became, because, become, becomes, been, before, began, behind, being, beings, below, best, better, between, big, both, but, by, c, came, can, cannot, can't, case, cases, certain, certainly, clear, clearly, come, could, couldn't, d, did, didn't, differ, different, differently, do, does, doesn't, doing, done, don't, down, downed, downing, downs, during, e, each, early, either, end, ended, ending, ends, enough, even, evenly, ever, every, everybody, everyone, everything, everywhere, f, face, faces, fact, facts, far, felt, few, find, finds, first, for, four, from, full, fully, further, furthered, furthering, furthers, g, gave, general, generally, get, gets, give, given, gives, go, going, good, goods, got, great, greater, greatest, group, grouped, grouping, groups, h, had, hadn't, has, hasn't, have, haven't, having, he, he'd, he'll, her, here, here's, hers, herself, he's, high, higher, highest, him, himself, his, how, however, how's, i, i'd, if, i'll, i'm, important, in, interest, interested, interesting, interests, into, is, isn't, it, its, it's, itself, i've, j, just, k, keep, keeps, kind, knew, know, known, knows, l, large, largely, last, later, latest, least, less, let, lets, let's, like, likely, long, longer, longest, m, made, make, making, man, many, may, me, member, members, men, might, more, most, mostly, mr, mrs, much, must, mustn't, my, myself, n, necessary, need, needed, needing, needs, never, new, newer, newest, next, no, nobody, non, noone, nor, not, nothing, now, nowhere, number, numbers, o, of, off, often, old, older, oldest, on, once, one, only, open, opened, opening, opens, or, order, ordered, ordering, orders, other, others, ought, our, ours, ourselves, out, over, own, p, part, parted, parting, parts, per, perhaps, place, places, point, pointed, pointing, points, possible, present, presented, presenting, presents, problem, problems, put, puts, q, quite, r, rather, really, right, room, rooms, s, said, same, saw, say, says, second, seconds, see, seem, seemed, seeming, seems, sees, several, shall, shan't, she, she'd, she'll, she's, should, shouldn't, show, showed, showing, shows, side, sides, since, small, smaller, smallest, so, some, somebody, someone, something, somewhere, state, states, still, such, sure, t, take, taken, than, that, that's, the, their, theirs, them, themselves, then, there, therefore, there's, these, they, they'd, they'll, they're, they've, thing, things, think, thinks, this, those, though, thought, thoughts, three, through, thus, to, today, together, too, took, toward, turn, turned, turning, turns, two, u, under, until, up, upon, us, use, used, uses, v, very, w, want, wanted, wanting, wants, was, wasn't, way, ways, we, we'd, well, we'll, wells, went, were, we're, weren't, we've, what, what's, when, when's, where, where's, whether, which, while, who, whole, whom, who's, whose, why, why's, will, with, within, without, won't, work, worked, working, works, would, wouldn't, x, y, year, years, yes, yet, you, you'd, you'll, young, younger, youngest, your, you're, yours, yourself, yourselves, you've, z"; PatternRule(EntityCentricKBDBpedia eckb) { super(eckb); } @Override public boolean applyRule(List<SurfaceForm> rep) { if (rep.size() > MINIMUMSURFACEFORMS) { Map<String, Integer> map = generateDictionary(rep); @SuppressWarnings("deprecation") List<Map.Entry<String, Integer>> list = HelpfulMethods .sortByValue(map); if(list.size() == 0) { return false; } Map.Entry<String, Integer> entry = list.get(0); String termToWatch = entry.getKey(); float perc = computePercentage(termToWatch, rep); if (perc > OCCURRENCEPERCENTAGE) { disambiguateTerms(termToWatch, rep); } } return false; } private Map<String, Integer> generateDictionary(List<SurfaceForm> rep) { // Check SurfaceForms HashMap Set<String> sfStrings = new HashSet<String>(); // Generate Dictionary Map<String, Integer> dictionary = new HashMap<String, Integer>(); for (SurfaceForm sf : rep) { List<String> strList = sf.getCandidates(); String s = sf.getSurfaceForm().toLowerCase(); if (!sfStrings.contains(s)) { Set<String> usedWords = new HashSet<String>(); for (String str : strList) { String ending = str.replaceAll( "http://dbpedia.org/resource/", "").toLowerCase(); String[] split = ending.split("_"); if (split.length > 1) { for (int i = 1; i < split.length; i++) { if (!usedWords.contains(split[i])) { if (!CUSTOMSTOPWORDLIST.contains(split[i])) { if (dictionary.containsKey(split[i])) { Integer in = dictionary.get(split[i]); dictionary.put(split[i], ++in); } else { dictionary.put(split[i], 1); } usedWords.add(split[i]); } } } } } sfStrings.add(s); } } return dictionary; } private float computePercentage(String str, List<SurfaceForm> rep) { int occ = 0; HashSet<String> hash = new HashSet<String>(); for (SurfaceForm sf : rep) { List<String> l = sf.getCandidates(); String form = sf.getSurfaceForm().toLowerCase(); if (!hash.contains(form)) { for (String s : l) { s = s.replaceAll("http://dbpedia.org/resource/", "") .toLowerCase(); if (s.contains("_"+str)) { occ++; break; } } hash.add(form); } } float perc = (float) occ / (float) rep.size(); return perc; } private void disambiguateTerms(String str, List<SurfaceForm> rep) { for (SurfaceForm sf : rep) { if (rep.size() > 1) { List<String> l = sf.getCandidates(); List<String> candidates = new ArrayList<String>(); for (String s : l) { String st = s .replaceAll("http://dbpedia.org/resource/", "") .toLowerCase(); if (st.contains("_" + str)) { candidates.add(s); } } if (candidates.size() == 1 && !candidates.get(0).matches(".*\\d+.*")) { sf.setDisambiguatedEntity(candidates.get(0)); sf.setInitial(true); } } } } public static void main(String[] args) { List<String> l1 = new ArrayList<String>(); l1.add("http://dbpedia.org/resource/Leicestershire"); l1.add("http://dbpedia.org/resource/Leicestershire_Cricket_Country_Club"); l1.add("http://dbpedia.org/resource/Leicestershire_Testing_F.C."); List<String> l2 = new ArrayList<String>(); l2.add("http://dbpedia.org/resource/Derbyshire"); l2.add("http://dbpedia.org/resource/Derbyshire_Cricket_Country_Club"); l2.add("http://dbpedia.org/resource/Derbyshire_Testing1_F.C."); List<String> l3 = new ArrayList<String>(); l3.add("http://dbpedia.org/resource/Essex"); l3.add("http://dbpedia.org/resource/Essex_Cricket_Country_Club"); l3.add("http://dbpedia.org/resource/Essex_Testing2_F.C."); List<String> l4 = new ArrayList<String>(); l4.add("London"); l4.add("London_Theatre"); l4.add("London_Theatre_Test_F.C."); SurfaceForm sf1 = new SurfaceForm("Leicestershire", "", l1, 0, 1); SurfaceForm sf2 = new SurfaceForm("Derbyshire", "", l2, 0, 1); SurfaceForm sf3 = new SurfaceForm("London", "", l3, 0, 1); SurfaceForm sf4 = new SurfaceForm("London", "", l4, 0, 1); List<SurfaceForm> sf = new LinkedList<SurfaceForm>(); sf.add(sf1); sf.add(sf2); sf.add(sf3); sf.add(sf4); PatternRule pattern = new PatternRule(null); pattern.applyRule(sf); } }