NameListNormalizer.java example

Explorer
jabref-2.9.2-master
- src
/*  Copyright (C) 2003-2011 JabRef contributors.
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package net.sf.jabref.util;

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Class containing method(s) for normalizing author lists to BibTeX format.
 *
 * <p>The normalized form of a list is the individual names joined by
 * {@code " and "}; each name is rewritten to "Last, First" form where one of
 * the heuristics in {@link #normalizeName(String)} recognizes it. For example,
 * {@code "Staci D. Bilbo and Smith SH and Jaclyn M Schwarz"} becomes
 * {@code "Staci D. Bilbo and Smith, S. H. and Schwarz, Jaclyn M."} (the first
 * name is left alone because none of the heuristics recognize "D.").
 */
public class NameListNormalizer {

    /** Capitalized last name followed by undotted initials, e.g. "Smith SH". */
    static final Pattern lastFF = Pattern.compile("(\\p{javaUpperCase}[\\p{javaLowerCase}]+) (\\p{javaUpperCase}+)");
    /** Capitalized last name followed by initials with dots and/or spaces, e.g. "Olsen Y. Y.". */
    static final Pattern lastFdotF = Pattern.compile("(\\p{javaUpperCase}[\\p{javaLowerCase}]+) ([\\. \\p{javaUpperCase}]+)");
    /** Undotted initials followed by a capitalized last name, e.g. "SH Smith". */
    static final Pattern FFlast = Pattern.compile("(\\p{javaUpperCase}+) (\\p{javaUpperCase}[\\p{javaLowerCase}]+)");
    /** Initials with dots and/or spaces followed by a capitalized last name, e.g. "J. A. Smith". */
    static final Pattern FdotFlast = Pattern.compile("([\\. \\p{javaUpperCase}]+) (\\p{javaUpperCase}[\\p{javaLowerCase}]+)");
    /** One uppercase letter followed by zero or more lowercase letters, e.g. "Morten" or "M". */
    static final Pattern singleName = Pattern.compile("(\\p{javaUpperCase}[\\p{javaLowerCase}]*)");

    /**
     * Splits an author list into individual names, normalizes each one with
     * {@link #normalizeName(String)} and joins the results with {@code " and "}.
     *
     * <p>The separator is guessed as follows: "and" (preceded by a space or a
     * comma) wins; otherwise "; "; otherwise ", " if that yields more than three
     * parts, or exactly three parts whose middle part is longer than three
     * characters. If exactly one "and" is present, the part before it is
     * additionally checked for "; " or ", " separation.
     *
     * @param in the author list to normalize
     * @return the normalized author list
     * @throws NullPointerException if {@code in} is {@code null}
     */
    public static String normalizeAuthorList(String in) {
        String[] authors = in.split("( |,)and ", -1);
        boolean andSep = authors.length > 1;
        if (!andSep) {
            // No "and" separators: the list is either semicolon separated, comma
            // separated, or holds only a single name. A semicolon decides it;
            // otherwise the number of comma separated parts does.
            String[] bySemicolon = in.split("; ");
            if (bySemicolon.length > 1) {
                authors = bySemicolon;
            } else {
                String[] byComma = in.split(", ");
                // Exactly three parts could be a BibTeX name with a Jr particle,
                // e.g. "Smith, Jr., Peter". Only a middle part longer than three
                // characters is taken to mean three separate authors.
                boolean threeAuthors = (byComma.length == 3) && (byComma[1].length() > 3);
                if ((byComma.length > 3) || threeAuthors) {
                    authors = byComma;
                }
            }
        }

        // Remove leading and trailing whitespace from each name:
        for (int i = 0; i < authors.length; i++) {
            authors[i] = authors[i].trim();
        }

        // With a single "and", the names before it may still be separated by
        // semicolons or commas ("A, B and C"). With two or more "and"s this
        // possibility is dismissed.
        if (andSep && (authors.length == 2)) {
            String[] leading = authors[0].split("; ");
            if (leading.length <= 1) {
                // A comma inside the last name means commas belong to the names
                // themselves ("Last, First"), so they are not list separators.
                boolean lastHasComma = authors[1].indexOf(',') > 0;
                leading = lastHasComma ? null : authors[0].split(", ");
            }
            if ((leading != null) && (leading.length > 1)) {
                String[] expanded = new String[leading.length + 1];
                for (int i = 0; i < leading.length; i++) {
                    expanded[i] = leading[i].trim();
                }
                expanded[leading.length] = authors[1];
                authors = expanded;
            }
        }

        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < authors.length; i++) {
            sb.append(normalizeName(authors[i]));
            if (i < authors.length - 1) {
                sb.append(" and ");
            }
        }
        return sb.toString();
    }


    /**
     * Rewrites a single name into "Last, First" form where it is recognized.
     *
     * <p>The name is tried, in order, against: last name plus initials
     * ("Smith SH", "Olsen Y.Y."), initials plus last name ("SH Smith",
     * "J. A. Smith"), a name containing a comma ("Alver, Morten O"), and a
     * full name made up only of capitalized words ("Jo Alfredsen"). Initials
     * are written as "S. H.". A trailing comma is dropped before the rest of
     * the name is normalized. A name matching none of the forms is returned
     * unchanged.
     *
     * @param name the name to normalize
     * @return the normalized name, or {@code name} itself if it was not recognized
     * @throws NullPointerException if {@code name} is {@code null}
     */
    public static String normalizeName(String name) {
        Matcher m = lastFF.matcher(name);
        if (m.matches()) {
            return lastNameWithInitials(m.group(1), m.group(2));
        }
        m = lastFdotF.matcher(name);
        if (m.matches()) {
            return lastNameWithInitials(m.group(1), m.group(2));
        }
        m = FFlast.matcher(name);
        if (m.matches()) {
            return lastNameWithInitials(m.group(2), m.group(1));
        }
        m = FdotFlast.matcher(name);
        if (m.matches()) {
            return lastNameWithInitials(m.group(2), m.group(1));
        }

        int lastComma = name.lastIndexOf(',');
        if (lastComma >= 0) {
            return normalizeCommaName(name, lastComma);
        }
        return normalizeFullName(name);
    }

    /** Builds "Last, A. B." from a last name and a run of initials that may contain dots and spaces. */
    private static String lastNameWithInitials(String lastName, String rawInitials) {
        StringBuilder sb = new StringBuilder(lastName);
        sb.append(", ");
        appendInitials(sb, rawInitials.replaceAll("[\\. ]+", ""));
        return sb.toString();
    }

    /** Appends each character of {@code initials} followed by a dot, separated by single spaces. */
    private static void appendInitials(StringBuilder sb, String initials) {
        for (int i = 0; i < initials.length(); i++) {
            sb.append(initials.charAt(i));
            sb.append('.');
            if (i < initials.length() - 1) {
                sb.append(' ');
            }
        }
    }

    /** Normalizes a name whose last comma is at {@code index}; text after that comma is the first-name part. */
    private static String normalizeCommaName(String name, int index) {
        if (index == name.length() - 1) {
            // Trailing comma: nothing follows it, so drop it and normalize what
            // is left. (Continuing with the old index on the shortened string
            // would read past its end.)
            return normalizeName(name.substring(0, index).trim());
        }

        StringBuilder sb = new StringBuilder(name.substring(0, index));
        sb.append(", ");
        String fName = name.substring(index + 1).trim();
        String[] fParts = fName.split(" ");
        if (fParts.length > 1) {
            // Multiple parts. Add all of them, dotting the single letter ones:
            for (int i = 0; i < fParts.length; i++) {
                sb.append(fParts[i]);
                if (fParts[i].length() == 1) {
                    sb.append('.');
                }
                if (i < fParts.length - 1) {
                    sb.append(' ');
                }
            }
        } else if (singleName.matcher(fParts[0]).matches()) {
            // A single part that looks like a name: keep it as it is.
            sb.append(fParts[0]);
        } else {
            // A single part that looks like initials.
            appendInitials(sb, fParts[0].replaceAll("[\\.]+", ""));
        }
        return sb.toString();
    }

    /** Turns "First Middle Last" into "Last, First Middle" if every part looks like a name; otherwise returns the input. */
    private static String normalizeFullName(String name) {
        String[] parts = name.split(" +");
        if (parts.length == 0) {
            // The name consists of spaces only; there is nothing to reorder.
            return name;
        }
        for (String part : parts) {
            if (!singleName.matcher(part).matches()) {
                return name;
            }
        }
        // Looks like a name written in full with first name first.
        // Change into last name first format:
        StringBuilder sb = new StringBuilder(parts[parts.length - 1]);
        if (parts.length > 1) {
            sb.append(',');
            for (int i = 0; i < parts.length - 1; i++) {
                sb.append(' ').append(parts[i]);
                if (parts[i].length() == 1) {
                    sb.append('.');
                }
            }
        }
        return sb.toString();
    }
}