NamesMerge.java example

Explorer
flaming-sailor-master
- src
  - main
    - java
      - com
        scottlogic
        util
        Alteration.java
        AlterationList.java
        ArrayLocation.java
        NaturalSortedList.java
        SortedList.java
        UnsortedList.java
        zilbo
        flamingSailor
        TE
        PDFParser.java
        model
        Component.java
        GeomUtil.java
        MultiPartBlock.java
        NamesMerge.java
        PDLink.java
        TextLine.java
        TextLink.java
        TextPage.java
        TextPiece.java
        TextType
        Empty.java
        Nil.java
        Number.java
        Percent.java
        Text.java
        TextType.java
        Unknown.java
        Year.java
  - test
    - java
      - com
        zilbo
        flamingSailor
        TE
        TestSort.java
        TestStats.java
        model
        TestBoundingBox.java
        TestBoundingBox2.java
        TestConstructLines.java
        TestSplit.java
      - pdf2Text.java
package com.zilbo.flamingSailor.TE.model;

import org.apache.log4j.Logger;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;

/*
 * Copyright 2012 Zilbo.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Accumulates occurrence counts for name strings and tries to fold longer
 * strings that are concatenations of already-known names into those names.
 *
 * <p>Casing: {@link #addAll(Map)} lower-cases every key before storing it and
 * remembers the original spelling so that {@link #getNames()} can restore it.
 * {@link #add(String, Integer)} stores the name exactly as given, while
 * {@link #contains(String)} and {@link #surnameMatches(String)} lower-case
 * their argument; callers that use {@code add} directly should therefore pass
 * lower-cased names if they want those lookups to match.
 */
public class NamesMerge {
    private static final Logger logger = Logger.getLogger(NamesMerge.class);

    /** Name (as stored) -> accumulated count. */
    Map<String, Integer> nameMap;
    /** Lower-cased name -> original spelling, filled by {@link #addAll(Map)}. */
    Map<String, String> nameRC;
    /** Lower-cased words/names that are never counted. */
    Set<String> ignored;
    /** Lower-cased candidate strings that are discarded while splitting a long name. */
    Set<String> otherMap;
    /** Surname (text following a title) -> full names seen that end in that surname. */
    Map<String, Set<String>> surnames;
    /** Matches a bare title such as "mr", "Dr." or "messrs"; compiled once for all instances. */
    static final Pattern titleMatch = Pattern.compile("(mr|mrs|miss|dr|ms|messrs)\\.?", Pattern.CASE_INSENSITIVE);

    /**
     * Loads {@code TE/test/names_2.txt} and logs the merged names and the
     * collected surnames.
     *
     * @param args unused
     * @throws IOException if the names file cannot be read
     */
    public static void main(String[] args) throws IOException {
        NamesMerge nm = NamesMerge.readinTestNames("TE/test/names_2.txt");

        for (Map.Entry<String, Integer> e : nm.getNames().entrySet()) {
            logger.info(e.getKey() + "\t" + e.getValue());
        }
        logger.info("**\nSurnames\n**");
        for (Map.Entry<String, Set<String>> e : nm.surnames.entrySet()) {
            logger.info(e.getKey() + "\t" + e.getValue());
        }
    }

    /**
     * Reads a names file with no ignore list.
     *
     * @param fileName path of a file containing {@code name<TAB>count} lines
     * @return a populated instance
     * @throws IOException if the file cannot be read
     * @see #readinTestNames(String, Set)
     */
    public static NamesMerge readinTestNames(String fileName) throws IOException {
        return readinTestNames(fileName, null);
    }

    /**
     * Reads a file of {@code name<TAB>count} lines and feeds it through
     * {@link #addAll(Map)}.
     *
     * <p>Blank lines are skipped. Lines without a tab-separated count are
     * skipped with a warning instead of aborting the whole load.
     *
     * @param fileName path of the file to read
     * @param ignored  names to ignore; may be {@code null}
     * @return a populated instance
     * @throws IOException           if the file cannot be read
     * @throws NumberFormatException if a count field is not a valid integer
     */
    public static NamesMerge readinTestNames(String fileName, Set<String> ignored) throws IOException {
        NamesMerge nm = new NamesMerge(ignored);
        Map<String, Integer> counts = new HashMap<String, Integer>();

        BufferedReader r = new BufferedReader(new FileReader(fileName));
        try {
            String line;
            while ((line = r.readLine()) != null) {
                if (line.trim().length() == 0) {
                    continue;
                }
                String[] p = line.split("\t");
                if (p.length < 2) {
                    // split() drops trailing empty fields, so "name\t" also ends up here
                    logger.warn("Skipping malformed line (expected name<TAB>count): " + line);
                    continue;
                }
                counts.put(p[0], Integer.decode(p[1].trim()));
            }
        } finally {
            // always release the file handle, even when parsing fails
            r.close();
        }
        nm.addAll(counts);
        return nm;
    }

    /** Creates an empty instance with nothing ignored. */
    public NamesMerge() {
        this(null, null);
    }

    /**
     * Creates an empty instance.
     *
     * @param toIgnore names to ignore (stored lower-cased); may be {@code null}
     */
    public NamesMerge(Set<String> toIgnore) {
        this(toIgnore, null);
    }

    /**
     * Creates an empty instance.
     *
     * @param toIgnore names to ignore (stored lower-cased); may be {@code null}
     * @param ignore2  candidate strings to discard while splitting long names
     *                 (stored lower-cased); may be {@code null}
     */
    public NamesMerge(Set<String> toIgnore, Set<String> ignore2) {
        nameMap = new HashMap<String, Integer>();
        nameRC = new HashMap<String, String>();
        ignored = new HashSet<String>();
        otherMap = new HashSet<String>();
        surnames = new HashMap<String, Set<String>>();

        addLowerCased(toIgnore, ignored);
        addLowerCased(ignore2, otherMap);
    }

    /** Copies every element of {@code source} (if any) into {@code target}, lower-cased. */
    private static void addLowerCased(Set<String> source, Set<String> target) {
        if (source == null) {
            return;
        }
        for (String s : source) {
            target.add(s.toLowerCase());
        }
    }

    /**
     * Adds one occurrence of {@code name}.
     *
     * @param name the name to count
     */
    public void add(String name) {
        this.add(name, 1);
    }

    /**
     * Builds a new instance by re-adding every name of {@code orig} in sorted
     * key order.
     *
     * <p>The result shares {@code orig}'s {@code nameRC} map and the surname
     * sets (the surname map itself is copied shallowly).
     *
     * @param orig the instance to re-process
     * @return a new instance holding the re-merged names
     */
    public static NamesMerge dedup(NamesMerge orig) {
        SortedMap<String, Integer> names = new TreeMap<String, Integer>(orig.nameMap);

        NamesMerge nm2 = new NamesMerge(orig.ignored);
        nm2.nameRC = orig.nameRC;
        nm2.surnames = new HashMap<String, Set<String>>(orig.surnames);
        nm2.setOtherMap(orig.otherMap);

        for (Map.Entry<String, Integer> n : names.entrySet()) {
            nm2.add(n.getKey(), n.getValue());
        }

        return nm2;
    }

    /**
     * @param name the name to look up; trimmed and lower-cased before lookup
     * @return {@code true} if the normalised name is a key of the name map
     */
    public boolean contains(String name) {
        return this.nameMap.containsKey(name.trim().toLowerCase());
    }

    /**
     * Adds every entry of {@code list} under its lower-cased key, remembering
     * the original spelling, then replaces the name map with the result of
     * {@link #dedup(NamesMerge)}.
     *
     * @param list name -> count; {@code null} is a no-op
     */
    public void addAll(Map<String, Integer> list) {
        if (list == null) {
            return;
        }
        for (Map.Entry<String, Integer> n : list.entrySet()) {
            String lower = n.getKey().toLowerCase();
            nameRC.put(lower, n.getKey());
            this.add(lower, n.getValue());
        }
        NamesMerge n = dedup(this);
        this.nameMap = n.nameMap;
    }

    /**
     * Adds {@code c} occurrences of {@code name}.
     *
     * <p>Ignored names and bare titles are dropped. A name already present
     * with a non-zero count is simply incremented. Otherwise a multi-word
     * name is scanned for already-known names: first left to right, then (if
     * something is left over) right to left; each known name found receives
     * {@code c}, and whatever candidate remains at the end is stored as a new
     * name unless it is ignored.
     *
     * @param name the name to add, stored exactly as given
     * @param c    the count to add
     */
    public void add(String name, Integer c) {
        if (ignored.contains(name)) {
            return;
        }
        // ignore strings with just titles
        if (titleMatch.matcher(name.trim()).matches()) {
            return;
        }
        recordSurname(name);

        Integer count = nameMap.get(name);
        if (count != null && count != 0) {
            nameMap.put(name, count + c);
            return;
        }

        String parts[] = name.split(" ");
        if (parts.length < 2) {
            nameMap.put(name, c);
            return;
        }

        // [i, len) is the window of words still to be processed
        int i = 0;
        int len = parts.length;
        if (parts.length > 2) {
            if (ignored.contains(parts[0])) {
                i = 1;
            }
            if (ignored.contains(parts[len - 1])) {
                len = len - 1;
            }
        }
        // Both pattern checks read parts[i + 3], so at least four words must
        // remain inside the window.
        if (i + 3 < len) {
            // find XY XY Z... pattern, adding a 'XY' and leaving the 'Z'
            if (parts[i].equals(parts[i + 2]) && parts[i + 1].equals(parts[i + 3])) {
                this.add(parts[i] + " " + parts[i + 1]);
                i = i + 4;
            }
        }
        if (len > 5 && i + 3 < len) {
            // two two-word names back to back, the second one already known
            String second = parts[i + 2] + " " + parts[i + 3];
            if (nameMap.containsKey(second)) {
                this.add(parts[i] + " " + parts[i + 1]);
                this.add(second);
                i += 4;
            }
        }

        if (i >= len) {
            return;
        }

        // Forward pass: grow a candidate word by word; a candidate that is
        // credited or discarded is reset and a new one starts at the next
        // non-ignored word.
        StringBuilder nameBuilder = new StringBuilder(parts[i]);
        i += 1;
        int startI = i;
        int startLen = len;
        while (i < len) {
            if (nameBuilder == null) {
                if (ignored.contains(parts[i])) {
                    i += 1;
                    continue;
                }
                nameBuilder = new StringBuilder(parts[i]);
            } else {
                nameBuilder.append(" ").append(parts[i]);
            }
            if (creditKnownName(nameBuilder.toString(), c)) {
                nameBuilder = null;
            }
            i += 1;
        }

        if (nameBuilder == null) {
            return;
        }

        // Backward pass: something was left over, so retry from the last word
        // towards the first, prepending one word at a time.
        i = startI;
        len = startLen;
        if (len - i > 1) {
            nameBuilder = new StringBuilder(parts[len - 1]);
            len -= 1;
            while (i <= len) {
                if (nameBuilder == null) {
                    if (ignored.contains(parts[len - 1])) {
                        len -= 1;
                        continue;
                    }
                    nameBuilder = new StringBuilder(parts[len - 1]);
                } else {
                    StringBuilder sb = new StringBuilder(parts[len - 1]);
                    sb.append(" ").append(nameBuilder);
                    nameBuilder = sb;
                }
                if (creditKnownName(nameBuilder.toString(), c)) {
                    nameBuilder = null;
                }
                len -= 1;
            }
        }
        if (nameBuilder != null) {
            String potentialName = nameBuilder.toString();
            if (!ignored.contains(potentialName)) {
                nameMap.put(potentialName, c);
            }
        }
    }

    /**
     * Maintains the surname index for {@code name}: "title surname" registers
     * the surname; any other multi-word name is attached to its last word if
     * that word is already a registered surname. Surnames are only ever
     * registered from titled names.
     */
    private void recordSurname(String name) {
        String nPart[] = name.split(" ", 2);
        if (nPart.length < 2) {
            return;
        }
        if (titleMatch.matcher(nPart[0]).matches()) {
            if (!surnames.containsKey(nPart[1])) {
                surnames.put(nPart[1], new HashSet<String>());
            }
            return;
        }
        String words[] = name.split(" ");
        if (words.length == 0) {
            // a name consisting only of spaces splits into an empty array
            return;
        }
        Set<String> fullNames = surnames.get(words[words.length - 1]);
        if (fullNames != null) {
            fullNames.add(name);
        }
    }

    /**
     * Decides what to do with a candidate built while scanning a long name.
     *
     * @return {@code true} if the candidate is finished (it is listed in
     *         {@code otherMap} and discarded, or it is a known name and
     *         received {@code c}); {@code false} if it should keep growing
     */
    private boolean creditKnownName(String potentialName, Integer c) {
        if (this.otherMap.contains(potentialName)) {
            return true;
        }
        Integer known = nameMap.get(potentialName);
        if (known == null) {
            return false;
        }
        nameMap.put(potentialName, known + c);
        return true;
    }

    /**
     * @return a fresh map of name -> count in which every key that has a
     *         remembered original spelling is replaced by that spelling
     */
    public Map<String, Integer> getNames() {
        Map<String, Integer> ret = new HashMap<String, Integer>();
        for (Map.Entry<String, Integer> e : nameMap.entrySet()) {
            String key = e.getKey();
            if (nameRC.containsKey(key)) {
                ret.put(nameRC.get(key), e.getValue());
            } else {
                ret.put(key, e.getValue());
            }
        }
        return ret;
    }

    /**
     * Replaces the set of candidate strings to discard.
     *
     * @param otherMap the new entries, stored lower-cased; {@code null} clears the set
     */
    public void setOtherMap(Set<String> otherMap) {
        Set<String> lowered = new HashSet<String>();
        addLowerCased(otherMap, lowered);
        this.otherMap = lowered;
    }

    /**
     * Returns true if we have seen the surname before.
     *
     * <p>Everything after the first space of {@code name} is treated as the
     * surname and compared, trimmed and lower-cased, against the registered
     * surnames.
     *
     * @param name the name string, e.g. "Claudio Mannarino"
     * @return     true if we have seen it; false for {@code null} or a single word
     * TODO
     * only returns true/false .. could possibly return the full name, but have to deal with us
     * having multiple different people with same surname (eg jason & phillip dezwirek )
     */
    public boolean surnameMatches(String name) {
        if (name == null) {
            return false;
        }
        String nPart[] = name.split(" ", 2);
        return nPart.length > 1 && surnames.containsKey(nPart[1].toLowerCase().trim());
    }
}