/*  Copyright (C) 2003-2011 JabRef contributors.
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package net.sf.jabref.util;

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Class containing method(s) for normalizing author lists to BibTeX format.
 */
public class NameListNormalizer {

    /** "Lastname FF": a capitalized word followed by a run of upper-case initials. */
    static final Pattern lastFF = Pattern.compile("(\\p{javaUpperCase}[\\p{javaLowerCase}]+) (\\p{javaUpperCase}+)");

    /** "Lastname F. F." / "Lastname F.F.": like {@link #lastFF}, but the initials may contain dots and spaces. */
    static final Pattern lastFdotF = Pattern.compile("(\\p{javaUpperCase}[\\p{javaLowerCase}]+) ([\\. \\p{javaUpperCase}]+)");

    /** "FF Lastname": a run of upper-case initials followed by a capitalized word. */
    static final Pattern FFlast = Pattern.compile("(\\p{javaUpperCase}+) (\\p{javaUpperCase}[\\p{javaLowerCase}]+)");

    /** "F. F. Lastname": like {@link #FFlast}, but the initials may contain dots and spaces. */
    static final Pattern FdotFlast = Pattern.compile("([\\. \\p{javaUpperCase}]+) (\\p{javaUpperCase}[\\p{javaLowerCase}]+)");

    /** A single upper-case letter followed by zero or more lower-case letters. */
    static final Pattern singleName = Pattern.compile("(\\p{javaUpperCase}[\\p{javaLowerCase}]*)");

    /**
     * Normalizes a list of author names into the form
     * {@code "Last, First and Last, First and ..."}.
     * <p>
     * The separator between names is guessed from the input:
     * <ul>
     * <li>If the text contains {@code " and "} (or {@code ",and "}), names are split on it.
     *     When there is exactly one such separator, the part before it is additionally
     *     split on {@code "; "} or, if the final name contains no comma, on {@code ", "}.</li>
     * <li>Otherwise, if the text contains {@code "; "}, names are split on it.</li>
     * <li>Otherwise the text is split on {@code ", "} when that gives more than three parts,
     *     or exactly three parts whose middle part is longer than three characters (a shorter
     *     middle part is taken to be a "Jr" particle, as in {@code "Smith, Jr., Peter"}).</li>
     * <li>Otherwise the whole text is treated as a single name.</li>
     * </ul>
     * Each resulting name is trimmed and passed through {@link #normalizeName(String)}.
     * <p>
     * Examples: {@code "Smith SH and Jaclyn M Schwarz"} becomes
     * {@code "Smith, S. H. and Schwarz, Jaclyn M."};
     * {@code "Alver, MA; Alfredsen, JA; Olsen Y.Y."} becomes
     * {@code "Alver, M. A. and Alfredsen, J. A. and Olsen, Y. Y."}.
     *
     * @param in the author list to normalize; must not be {@code null}
     * @return the normalized names joined by {@code " and "}
     */
    public static String normalizeAuthorList(String in) {
        String[] authors = in.split("( |,)and ", -1);
        boolean andSep = authors.length > 1;

        if (!andSep) {
            // No "and" separators: prefer semicolons if present, otherwise count the
            // comma separated parts to decide between several authors and a single name.
            String[] a2 = in.split("; ");
            if (a2.length > 1) {
                authors = a2;
            } else {
                a2 = in.split(", ");
                // Exactly three parts could be a BibTeX name with a Jr particle
                // (e.g. "Smith, Jr., Peter"); a middle part of more than three
                // characters is taken to mean three separate authors instead.
                boolean threeAuthors = (a2.length == 3) && (a2[1].length() > 3);
                if ((a2.length > 3) || threeAuthors) {
                    authors = a2;
                }
            }
        }

        // Remove leading and trailing whitespace from each name:
        for (int i = 0; i < authors.length; i++) {
            authors[i] = authors[i].trim();
        }

        // With exactly one "and" separator, the names before it may still be
        // separated by semicolons or commas ("A, B and C"). With two or more
        // "and" separators that possibility is dismissed.
        if (andSep && (authors.length == 2)) {
            String[] semiSep = authors[0].split("; ");
            if (semiSep.length > 1) {
                authors = appendLast(semiSep, authors[1]);
            } else {
                // A comma inside the final name indicates "Last, First" notation,
                // in which case commas are not used to separate the names.
                boolean lnfn = authors[1].indexOf(",") > 0;
                if (!lnfn) {
                    String[] cmSep = authors[0].split(", ");
                    if (cmSep.length > 1) {
                        authors = appendLast(cmSep, authors[1]);
                    }
                }
            }
        }

        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < authors.length; i++) {
            if (i > 0) {
                sb.append(" and ");
            }
            sb.append(normalizeName(authors[i]));
        }
        return sb.toString();
    }

    /**
     * Normalizes a single name to {@code "Last, First"} form.
     * <p>
     * The following shapes are recognized, in this order:
     * <ol>
     * <li>{@code "Smith SH"}, {@code "Smith S.H."}, {@code "SH Smith"}, {@code "S. H. Smith"}:
     *     rewritten as {@code "Smith, S. H."}.</li>
     * <li>A name containing a comma: the text before the last comma is kept as it is, and
     *     the text after it is treated as first name(s) or initials. If nothing but
     *     whitespace follows the last comma, the text before that comma is returned.</li>
     * <li>A name without comma in which every space separated part is a capitalized word
     *     or a single upper-case letter: the final part is moved to the front, e.g.
     *     {@code "Jaclyn M Schwarz"} becomes {@code "Schwarz, Jaclyn M."}.</li>
     * </ol>
     * Anything else is returned unchanged.
     *
     * @param name the name to normalize; must not be {@code null}
     * @return the normalized name
     */
    public static String normalizeName(String name) {
        Matcher m = lastFF.matcher(name);
        if (m.matches()) {
            return lastNameWithInitials(m.group(1), m.group(2));
        }

        m = lastFdotF.matcher(name);
        if (m.matches()) {
            return lastNameWithInitials(m.group(1), m.group(2).replaceAll("[\\. ]+", ""));
        }

        m = FFlast.matcher(name);
        if (m.matches()) {
            return lastNameWithInitials(m.group(2), m.group(1));
        }

        m = FdotFlast.matcher(name);
        if (m.matches()) {
            return lastNameWithInitials(m.group(2), m.group(1).replaceAll("[\\. ]+", ""));
        }

        if (name.indexOf(',') >= 0) {
            return normalizeNameWithComma(name);
        }
        return normalizeNameWithoutComma(name);
    }

    /** Returns a copy of {@code leading} with every element trimmed and {@code last} appended. */
    private static String[] appendLast(String[] leading, String last) {
        String[] result = new String[leading.length + 1];
        for (int i = 0; i < leading.length; i++) {
            result[i] = leading[i].trim();
        }
        result[leading.length] = last;
        return result;
    }

    /** Builds {@code "Last, A. B."} from a last name and a string of initial letters. */
    private static String lastNameWithInitials(String lastName, String initials) {
        StringBuilder sb = new StringBuilder(lastName);
        sb.append(", ");
        appendInitials(sb, initials);
        return sb.toString();
    }

    /** Appends each character of {@code initials} followed by a dot, separated by single spaces. */
    private static void appendInitials(StringBuilder sb, String initials) {
        for (int i = 0; i < initials.length(); i++) {
            if (i > 0) {
                sb.append(' ');
            }
            sb.append(initials.charAt(i));
            sb.append('.');
        }
    }

    /** Normalizes a name that contains at least one comma; the last comma splits last and first names. */
    private static String normalizeNameWithComma(String name) {
        int index = name.lastIndexOf(',');
        String lastPart = name.substring(0, index);
        String fName = name.substring(index + 1).trim();

        // Nothing follows the last comma (e.g. "Smith,"): drop the dangling comma.
        // Both substrings are taken from the unmodified name, so the comma position
        // is always a valid index.
        if (fName.length() == 0) {
            return lastPart;
        }

        StringBuilder sb = new StringBuilder(lastPart);
        sb.append(", ");

        String[] fParts = fName.split(" ");
        if (fParts.length > 1) {
            // Multiple parts. Add all of them, and add a dot to single letter parts:
            for (int i = 0; i < fParts.length; i++) {
                if (i > 0) {
                    sb.append(' ');
                }
                sb.append(fParts[i]);
                if (fParts[i].length() == 1) {
                    sb.append('.');
                }
            }
        } else if (singleName.matcher(fParts[0]).matches()) {
            // Only a single part, and it looks like a name:
            sb.append(fParts[0]);
        } else {
            // Only a single part, and it looks like initials:
            appendInitials(sb, fParts[0].replaceAll("[\\.]+", ""));
        }
        return sb.toString();
    }

    /** Normalizes a comma-free name written in full with the first name first; other input is returned as is. */
    private static String normalizeNameWithoutComma(String name) {
        String[] parts = name.split(" +");
        // A name consisting only of spaces splits into an empty array.
        if (parts.length == 0) {
            return name;
        }
        for (int i = 0; i < parts.length; i++) {
            if (!singleName.matcher(parts[i]).matches()) {
                return name;
            }
        }

        // Looks like a name written in full with first name first.
        // Change into last name first format:
        StringBuilder sb = new StringBuilder(parts[parts.length - 1]);
        if (parts.length > 1) {
            sb.append(',');
            for (int i = 0; i < parts.length - 1; i++) {
                sb.append(' ');
                sb.append(parts[i]);
                if (parts[i].length() == 1) {
                    sb.append('.');
                }
            }
        }
        return sb.toString();
    }
}