package com.zilbo.flamingSailor.TE.model; import org.apache.log4j.Logger; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.util.*; import java.util.regex.Pattern; /* * Copyright 2012 Zilbo.com * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ public class NamesMerge { private static final Logger logger = Logger.getLogger(NamesMerge.class); Map<String, Integer> nameMap; Map<String, String> nameRC; Set<String> ignored; Set<String> otherMap; Map<String, Set<String>> surnames; Pattern titleMatch = Pattern.compile("(mr|mrs|miss|dr|ms|messrs)\\.?", Pattern.CASE_INSENSITIVE); static public void main(String args[]) throws IOException { /* HashSet<String> ignored = new HashSet<String>(); ignored.add("toronto"); ignored.add("guelph"); ignored.add("calgary"); ignored.add("jupiter"); ignored.add("chair"); HashSet<String> s = new HashSet<String>(); s.add("charles sirois"); s.add("richard nesbitt"); */ NamesMerge nm = NamesMerge.readinTestNames("TE/test/names_2.txt"); // NamesMerge nm2 = NamesMerge.dedup(nm); for (Map.Entry<String, Integer> e : nm.getNames().entrySet()) { logger.info(e.getKey() + "\t" + e.getValue()); } logger.info("**\nSurnames\n**"); for (Map.Entry<String, Set<String>> e : nm.surnames.entrySet()) { logger.info(e.getKey() + "\t" + e.getValue()); } } public static NamesMerge readinTestNames(String fileName) throws IOException { return readinTestNames(fileName, null); } public static NamesMerge readinTestNames(String fileName, Set<String> ignored) throws IOException { BufferedReader r = new BufferedReader(new FileReader(fileName)); NamesMerge nm = new NamesMerge(ignored); Map<String, Integer> x = new HashMap<String, Integer>(); String line = r.readLine(); while (line != null) { String[] p = line.split("\t"); x.put(p[0], Integer.decode(p[1])); line = r.readLine(); } nm.addAll(x); return nm; } public NamesMerge() { nameMap = new HashMap<String, Integer>(); nameRC = new HashMap<String, String>(); ignored = new HashSet<String>(); otherMap = new HashSet<String>(); surnames = new HashMap<String, Set<String>>(); } public NamesMerge(Set<String> toIgnore) { nameMap = new HashMap<String, Integer>(); nameRC = new HashMap<String, String>(); ignored = new HashSet<String>(); otherMap = new HashSet<String>(); surnames = new HashMap<String, Set<String>>(); if (toIgnore != null) { for (String s : toIgnore) { ignored.add(s.toLowerCase()); } } } public NamesMerge(Set<String> toIgnore, Set<String> ignore2) { nameMap = new HashMap<String, Integer>(); ignored = new HashSet<String>(); surnames = new HashMap<String, Set<String>>(); for (String s : toIgnore) { ignored.add(s.toLowerCase()); } otherMap = new HashSet<String>(); for (String s : ignore2) { otherMap.add(s.toLowerCase()); } } public void add(String name) { this.add(name, 1); } public static NamesMerge dedup(NamesMerge orig) { SortedMap<String, Integer> names = new TreeMap<String, Integer>(); for (Map.Entry<String, Integer> n : orig.nameMap.entrySet()) { names.put(n.getKey(), n.getValue()); } NamesMerge nm2 = new NamesMerge(orig.ignored); nm2.nameRC = orig.nameRC; nm2.surnames = new HashMap<String, Set<String>>(orig.surnames); nm2.setOtherMap(orig.otherMap); for (Map.Entry<String, Integer> n : names.entrySet()) { nm2.add(n.getKey(), n.getValue()); } return nm2; } public boolean contains(String name) { return (this.nameMap.containsKey(name.trim().toLowerCase())); } public void addAll(Map<String, Integer> list) { if (list == null) { return; } for (Map.Entry<String, Integer> n : list.entrySet()) { nameRC.put(n.getKey().toLowerCase(), n.getKey()); this.add(n.getKey().toLowerCase(), n.getValue()); } NamesMerge n = dedup(this); this.nameMap = n.nameMap; } public void add(String name, Integer c) { if (ignored.contains(name)) { return; } // ignore strings with just titles if (titleMatch.matcher(name.trim()).matches()) { return; } String nPart[] = name.split(" ", 2); if (nPart.length > 1) { if (titleMatch.matcher(nPart[0]).matches()) { if ( !surnames.containsKey(nPart[1])) { surnames.put(nPart[1], new HashSet<String>()); } } else { String xnPart[] = name.split(" "); String potentialSurname = xnPart[xnPart.length-1]; Set<String> x = surnames.get(potentialSurname); if (x != null) { // if (x.isEmpty()) { x.add(name); surnames.put(potentialSurname, x); /* } else { if (x.length() > name.length()) { surnames.put(potentialSurname, name); } } */ } else { // don't just put a surname in here from a regular name, just use 100% mr. FOO ones // surnames.put(potentialSurname,new HashSet<String>()); } } } Integer count = nameMap.get(name); if (count == null || count == 0) { String parts[] = name.split(" "); if (parts.length < 2) { nameMap.put(name, c); } else { int i = 0; int len = parts.length; if (parts.length > 2) { if (ignored.contains(parts[0])) { i = 1; } if (ignored.contains(parts[len - 1])) { len = len - 1; } } if (len > 3) { // find XY XY Z... pattern, adding a 'XY' and leaving the 'Z' if (parts[i].equals(parts[i + 2]) && parts[i + 1].equals(parts[i + 3])) { this.add(parts[i] + " " + parts[i + 1]); i = i + 4; } } if (len > 5 && i+3<len) { String x = parts[i + 2] + " " + parts[i + 3]; if (nameMap.containsKey(x)) { this.add(parts[i] + " " + parts[i + 1]); this.add(parts[i + 2] + " " + parts[i + 3]); i += 4; } } if (i >= len) { return; } StringBuilder nameBuilder = new StringBuilder(parts[i]); i += 1; int startI = i; int startLen = len; while (i < len) { /* if ( titleMatch.matcher(parts[i]).matches()) { logger.info(nameBuilder); } */ if (nameBuilder == null) { if (ignored.contains(parts[i])) { i += 1; continue; } else { nameBuilder = new StringBuilder(parts[i]); } } else { nameBuilder.append(" ").append(parts[i]); } String potentialName = nameBuilder.toString(); if (!this.otherMap.contains(potentialName)) { count = nameMap.get(potentialName); if (count != null) { nameMap.put(potentialName, count + c); nameBuilder = null; } } else { nameBuilder = null; } i += 1; } if (nameBuilder != null) { i = startI; len = startLen; if (len - i > 1) { nameBuilder = new StringBuilder(parts[len - 1]); len -= 1; while (i <= len) { if (nameBuilder == null) { if (ignored.contains(parts[len - 1])) { len -= 1; continue; } else { nameBuilder = new StringBuilder(parts[len - 1]); } } else { StringBuilder sb = new StringBuilder(parts[len - 1]); sb.append(" ").append(nameBuilder); nameBuilder = sb; } String potentialName = nameBuilder.toString(); if (!this.otherMap.contains(potentialName)) { count = nameMap.get(potentialName); if (count != null) { nameMap.put(potentialName, count + c); nameBuilder = null; } } else { nameBuilder = null; } len -= 1; } } if ( nameBuilder!= null) { String potentialName = nameBuilder.toString(); if (!ignored.contains(potentialName)) { nameMap.put(potentialName, c); } } } } } else { nameMap.put(name, count + c); } } public Map<String, Integer> getNames() { Map<String, Integer> ret = new HashMap<String, Integer>(); for (Map.Entry<String, Integer> e : nameMap.entrySet()) { if (nameRC.containsKey(e.getKey())) { ret.put(nameRC.get(e.getKey()), e.getValue()); } else { ret.put(e.getKey(), e.getValue()); } } return ret; } public void setOtherMap(Set<String> otherMap) { this.otherMap = new HashSet<String>(); for (String s : otherMap) { this.otherMap.add(s.toLowerCase()); } } /** * returns true if we have seent the surname before * @param name the namestring * @return true if we have seen it * TODO * only returns true/false .. could possibly return the full name, but have to deal with us * having multiple different people with same surname (eg jason & phillip dezwirek ) */ public boolean surnameMatches(String name) { // Claudio Mannarino; String nPart[] = name.split(" ", 2); if (nPart.length > 1) { if (surnames.containsKey(nPart[1].toLowerCase().trim())) { return true; } } return false; } }