package org.jabref.model.entry;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.WeakHashMap;
import java.util.stream.Collectors;
/**
* This is an immutable class representing information of either <CODE>author</CODE>
* or <CODE>editor</CODE> field in bibtex record.
* <p>
* Constructor performs parsing of raw field text and stores preformatted data.
* Various accessor methods return author/editor field in different formats.
* <p>
* Parsing algorithm is designed to satisfy two requirements: (a) when author's
* name is typed correctly, the result should coincide with the one of BiBTeX;
* (b) for erroneous names, output should be reasonable (but may differ from
* BiBTeX output). The following rules are used:
* <ol>
* <li> 'author field' is a sequence of tokens;
* <ul>
* <li> tokens are separated by sequences of whitespaces (<CODE>Character.isWhitespace(c)==true</CODE>),
* commas (,), dashes (-), and tildes (~);
* <li> every comma separates tokens, while sequences of other separators are
* equivalent to a single separator; for example: "a - b" consists of 2 tokens
* ("a" and "b"), while "a,-,b" consists of 3 tokens ("a", "", and "b")
* <li> anything enclosed in braces belongs to a single token; for example:
* "abc x{a,b,-~ c}x" consists of 2 tokens, while "abc xa,b,-~ cx" consists of 4
* tokens ("abc", "xa","b", and "cx");
* <li> a token followed immediately by a dash is "dash-terminated" token, and
* all other tokens are "space-terminated" tokens; for example: in "a-b- c - d"
* tokens "a" and "b" are dash-terminated and "c" and "d" are space-terminated;
* <li> for the purposes of splitting of 'author name' into parts and
* construction of abbreviation of first name, one needs definitions of first
* letter of a token, case of a token, and abbreviation of a token:
* <ul>
* <li> 'first letter' of a token is the first letter character (<CODE>Character.isLetter(c)==true</CODE>)
* that does not belong to a sequence of letters that immediately follows "\"
* character, with one exception: if "\" is followed by "aa", "AA", "ae", "AE",
* "l", "L", "o", "O", "oe", "OE", "i", or "j" followed by non-letter, the
* 'first letter' of a token is a letter that follows "\"; for example: in
* "a{x}b" 'first letter' is "a", in "{\"{U}}bel" 'first letter' is "U", in
* "{\noopsort{\"o}}xyz" 'first letter' is "o", in "{\AE}x" 'first letter' is
* "A", in "\aex\ijk\Oe\j" 'first letter' is "j"; if there is no letter
* satisfying the above rule, 'first letter' is undefined;
* <li> token is "lower-case" token if its first letter is defined and is
* lower-case (<CODE>Character.isLowerCase(c)==true</CODE>), and token is
* "upper-case" token otherwise;
* <li> 'abbreviation' of a token is the shortest prefix of the token that (a)
* contains 'first letter' and (b) is braces-balanced; if 'first letter' is
* undefined, 'abbreviation' is the token itself; in the above examples,
* 'abbreviation's are "a", "{\"{U}}", "{\noopsort{\"o}}", "{\AE}",
* "\aex\ijk\Oe\j";
* </ul>
* <li> the behavior based on the above definitions will be erroneous only in
* one case: if the first-name-token is "{\noopsort{A}}john", we abbreviate it
* as "{\noopsort{A}}.", while BiBTeX produces "j."; fixing this problem,
* however, requires processing of the preamble;
* </ul>
* <li> 'author names' in 'author field' are subsequences of tokens separated by
* token "and" ("and" is case-insensitive); if 'author name' is an empty
* sequence of tokens, it is ignored; for example, both "John Smith and Peter
* Black" and "and and John Smith and and Peter Black" consist of 2 'author
* name's "John Smith" and "Peter Black" (in erroneous situations, this is a bit
* different from BiBTeX behavior);
* <li> 'author name' consists of 'first-part', 'von-part', 'last-part', and
* 'junior-part', each of which is a sequence of tokens; how a sequence of
* tokens has to be split into these parts depends on the number of commas:
* <ul>
* <li> no commas, all tokens are upper-case: 'junior-part' and 'von-part' are
* empty, 'last-part' consists of the last token, 'first-part' consists of all
* other tokens ('first-part' is empty if 'author name' consists of a single
* token); for example, in "John James Smith", 'last-part'="Smith" and
* 'first-part'="John James";
* <li> no commas, there exists lower-case token: 'junior-part' is empty,
* 'first-part' consists of all upper-case tokens before the first lower-case
* token, 'von-part' consists of lower-case tokens starting the first lower-case
* token and ending the lower-case token that is followed by upper-case token,
* 'last-part' consists of the rest of tokens; note that both 'first-part' and
* 'last-part' may be empty and 'last-part' may contain lower-case tokens; for
* example: in "von der", 'first-part'='last-part'="", 'von-part'="von der"; in
* "Charles Louis Xavier Joseph de la Vall{\'e}e la Poussin",
* 'first-part'="Charles Louis Xavier Joseph", 'von-part'="de la",
* 'last-part'="Vall{\'e}e la Poussin";
* <li> one comma: 'junior-part' is empty, 'first-part' consists of all tokens
* after comma, 'von-part' consists of the longest sequence of lower-case tokens
* in the very beginning, 'last-part' consists of all tokens after 'von-part'
* and before comma; note that any part can be empty; for example: in "de la
* Vall{\'e}e la Poussin, Charles Louis Xavier Joseph", 'first-part'="Charles
* Louis Xavier Joseph", 'von-part'="de la", 'last-part'="Vall{\'e}e la
* Poussin"; in "Joseph de la Vall{\'e}e la Poussin, Charles Louis Xavier",
* 'first-part'="Charles Louis Xavier", 'von-part'="", 'last-part'="Joseph de la
* Vall{\'e}e la Poussin";
* <li> two or more commas (any comma after the second one is ignored; it merely
* separates tokens): 'junior-part' consists of all tokens between first and
* second commas, 'first-part' consists of all tokens after the second comma,
* tokens before the first comma are split into 'von-part' and 'last-part'
* similarly to the case of one comma; for example: in "de la Vall{\'e}e
* Poussin, Jr., Charles Louis Xavier Joseph", 'first-part'="Charles Louis
* Xavier Joseph", 'von-part'="de la", 'last-part'="Vall{\'e}e Poussin", and
* 'junior-part'="Jr.";
* </ul>
* <li> when 'first-part', 'last-part', 'von-part', or 'junior-part' is
* reconstructed from tokens, tokens in a part are separated either by space or
* by dash, depending on whether the token before the separator was
* space-terminated or dash-terminated; for the last token in a part it does not
* matter whether it was dash- or space-terminated;
* <li> when 'first-part' is abbreviated, each token is replaced by its
* abbreviation followed by a period; separators are the same as in the case of
* non-abbreviated name; for example: in "Heinrich-{\"{U}}bel Kurt von Minich",
* 'first-part'="Heinrich-{\"{U}}bel Kurt", and its abbreviation is "H.-{\"{U}}.
* K."
* </ol>
*/
public class AuthorList {
private static final WeakHashMap<String, AuthorList> AUTHOR_CACHE = new WeakHashMap<>();
// Avoid partition where these values are contained
private final static Collection<String> avoidTermsInLowerCase = Arrays.asList("jr", "sr", "jnr", "snr", "von", "zu", "van", "der");
private final List<Author> authors;
private final String[] authorsFirstFirst = new String[4];
private final String[] authorsLastOnly = new String[2];
private final String[] authorLastFirstAnds = new String[2];
private final String[] authorsLastFirst = new String[4];
private final String[] authorsLastFirstFirstLast = new String[2];
// Variables for storing computed strings, so they only need to be created once:
private String authorsNatbib;
private String authorsFirstFirstAnds;
private String authorsAlph;
/**
* Creates a new list of authors.
* <p>
* Don't call this constructor directly but rather use the getAuthorList()
* method which caches its results.
*
* @param authors the list of authors which should underlie this instance
*/
protected AuthorList(List<Author> authors) {
this.authors = Objects.requireNonNull(authors);
}
protected AuthorList(Author author) {
this(Collections.singletonList(author));
}
public AuthorList() {
this(new ArrayList<Author>());
}
/**
* Retrieve an AuthorList for the given string of authors or editors.
* <p>
* This function tries to cache the parsed AuthorLists by the string passed in.
*
* @param authors The string of authors or editors in bibtex format to parse.
* @return An AuthorList object representing the given authors.
*/
public static AuthorList parse(String authors) {
Objects.requireNonNull(authors);
// Handle case names in order lastname, firstname and separated by ","
// E.g., Ali Babar, M., Dingsøyr, T., Lago, P., van der Vliet, H.
final boolean authorsContainAND = authors.toUpperCase(Locale.ENGLISH).contains(" AND ");
final boolean authorsContainOpeningBrace = authors.contains("{");
final boolean authorsContainSemicolon = authors.contains(";");
final boolean authorsContainTwoOrMoreCommas = (authors.length() - authors.replace(",", "").length()) >= 2;
if (!authorsContainAND && !authorsContainOpeningBrace && !authorsContainSemicolon && authorsContainTwoOrMoreCommas) {
List<String> arrayNameList = Arrays.asList(authors.split(","));
// Delete spaces for correct case identification
arrayNameList.replaceAll(String::trim);
// Looking for space between pre- and lastname
boolean spaceInAllParts = arrayNameList.stream().filter(name -> name.contains(" ")).collect(Collectors
.toList()).size() == arrayNameList.size();
// We hit the comma name separator case
// Usually the getAsLastFirstNamesWithAnd method would separate them if pre- and lastname are separated with "and"
// If not, we check if spaces separate pre- and lastname
if (spaceInAllParts) {
authors = authors.replaceAll(",", " and");
} else {
// Looking for name affixes to avoid
// arrayNameList needs to reduce by the count off avoiding terms
// valuePartsCount holds the count of name parts without the avoided terms
int valuePartsCount = arrayNameList.size();
// Holds the index of each term which needs to be avoided
Collection<Integer> avoidIndex = new HashSet<>();
for (int i = 0; i < arrayNameList.size(); i++) {
if (avoidTermsInLowerCase.contains(arrayNameList.get(i).toLowerCase(Locale.ROOT))) {
avoidIndex.add(i);
valuePartsCount--;
}
}
if ((valuePartsCount % 2) == 0) {
// We hit the described special case with name affix like Jr
authors = buildWithAffix(avoidIndex, arrayNameList).toString();
}
}
}
AuthorList authorList = AUTHOR_CACHE.get(authors);
if (authorList == null) {
AuthorListParser parser = new AuthorListParser();
authorList = parser.parse(authors);
AUTHOR_CACHE.put(authors, authorList);
}
return authorList;
}
/**
* This is a convenience method for getAuthorsFirstFirst()
*
* @see AuthorList#getAsFirstLastNames
*/
public static String fixAuthorFirstNameFirstCommas(String authors, boolean abbr, boolean oxfordComma) {
return AuthorList.parse(authors).getAsFirstLastNames(abbr, oxfordComma);
}
/**
* This is a convenience method for getAuthorsFirstFirstAnds()
*
* @see AuthorList#getAsFirstLastNamesWithAnd
*/
public static String fixAuthorFirstNameFirst(String authors) {
return AuthorList.parse(authors).getAsFirstLastNamesWithAnd();
}
/**
* This is a convenience method for getAuthorsLastFirst()
*
* @see AuthorList#getAsLastFirstNames
*/
public static String fixAuthorLastNameFirstCommas(String authors, boolean abbr, boolean oxfordComma) {
return AuthorList.parse(authors).getAsLastFirstNames(abbr, oxfordComma);
}
/**
* This is a convenience method for getAuthorsLastFirstAnds(true)
*
* @see AuthorList#getAsLastFirstNamesWithAnd
*/
public static String fixAuthorLastNameFirst(String authors) {
return AuthorList.parse(authors).getAsLastFirstNamesWithAnd(false);
}
/**
* This is a convenience method for getAuthorsLastFirstAnds()
*
* @see AuthorList#getAsLastFirstNamesWithAnd
*/
public static String fixAuthorLastNameFirst(String authors, boolean abbreviate) {
return AuthorList.parse(authors).getAsLastFirstNamesWithAnd(abbreviate);
}
/**
* This is a convenience method for getAuthorsLastOnly()
*
* @see AuthorList#getAsLastNames
*/
public static String fixAuthorLastNameOnlyCommas(String authors, boolean oxfordComma) {
return AuthorList.parse(authors).getAsLastNames(oxfordComma);
}
/**
* This is a convenience method for getAuthorsForAlphabetization()
*
* @see AuthorList#getForAlphabetization
*/
public static String fixAuthorForAlphabetization(String authors) {
return AuthorList.parse(authors).getForAlphabetization();
}
/**
* This is a convenience method for getAuthorsNatbib()
*
* @see AuthorList#getAsNatbib
*/
public static String fixAuthorNatbib(String authors) {
return AuthorList.parse(authors).getAsNatbib();
}
/**
* Builds a new array of strings with stringbuilder.
* Regarding to the name affixes.
*
* @return New string with correct seperation
*/
private static StringBuilder buildWithAffix(Collection<Integer> indexArray, List<String> nameList) {
StringBuilder stringBuilder = new StringBuilder();
// avoidedTimes needs to be increased by the count of avoided terms for correct odd/even calculation
int avoidedTimes = 0;
for (int i = 0; i < nameList.size(); i++) {
if (indexArray.contains(i)) {
// We hit a name affix
stringBuilder.append(nameList.get(i));
stringBuilder.append(',');
avoidedTimes++;
} else {
stringBuilder.append(nameList.get(i));
if (((i + avoidedTimes) % 2) == 0) {
// Hit separation between last name and firstname --> comma has to be kept
stringBuilder.append(',');
} else {
// Hit separation between full names (e.g., Ali Babar, M. and Dingsøyr, T.) --> semicolon has to be used
// Will be treated correctly by AuthorList.parse(authors);
stringBuilder.append(';');
}
}
}
return stringBuilder;
}
/**
* Returns the number of author names in this object.
*
* @return the number of author names in this object.
*/
public int getNumberOfAuthors() {
return authors.size();
}
/**
* Returns true if there are no authors in the list.
*
* @return true if there are no authors in the list.
*/
public boolean isEmpty() {
return authors.isEmpty();
}
/**
* Returns the <CODE>Author</CODE> object for the i-th author.
*
* @param i Index of the author (from 0 to <CODE>size()-1</CODE>).
* @return the <CODE>Author</CODE> object.
*/
public Author getAuthor(int i) {
return authors.get(i);
}
/**
* Returns the a list of <CODE>Author</CODE> objects.
*
* @return the <CODE>List<Author></CODE> object.
*/
public List<Author> getAuthors() {
return authors;
}
/**
* Returns the list of authors in "natbib" format.
* <p>
* <ul>
* <li>"John Smith" -> "Smith"</li>
* <li>"John Smith and Black Brown, Peter" ==> "Smith and Black Brown"</li>
* <li>"John von Neumann and John Smith and Black Brown, Peter" ==> "von
* Neumann et al." </li>
* </ul>
*
* @return formatted list of authors.
*/
public String getAsNatbib() {
// Check if we've computed this before:
if (authorsNatbib != null) {
return authorsNatbib;
}
StringBuilder res = new StringBuilder();
if (!isEmpty()) {
res.append(getAuthor(0).getLastOnly());
if (getNumberOfAuthors() == 2) {
res.append(" and ");
res.append(getAuthor(1).getLastOnly());
} else if (getNumberOfAuthors() > 2) {
res.append(" et al.");
}
}
authorsNatbib = res.toString();
return authorsNatbib;
}
/**
* Returns the list of authors separated by commas with last name only; If
* the list consists of two or more authors, "and" is inserted before the
* last author's name.
* <p>
* <p>
* <ul>
* <li> "John Smith" ==> "Smith"</li>
* <li> "John Smith and Black Brown, Peter" ==> "Smith and Black Brown"</li>
* <li> "John von Neumann and John Smith and Black Brown, Peter" ==> "von
* Neumann, Smith and Black Brown".</li>
* </ul>
*
* @param oxfordComma Whether to put a comma before the and at the end.
* @return formatted list of authors.
* @see <a href="http://en.wikipedia.org/wiki/Serial_comma">serial comma for an detailed explaination about the Oxford comma.</a>
*/
public String getAsLastNames(boolean oxfordComma) {
int abbrInt = oxfordComma ? 0 : 1;
// Check if we've computed this before:
if (authorsLastOnly[abbrInt] != null) {
return authorsLastOnly[abbrInt];
}
StringBuilder result = new StringBuilder();
if (!isEmpty()) {
result.append(getAuthor(0).getLastOnly());
int i = 1;
while (i < (getNumberOfAuthors() - 1)) {
result.append(", ");
result.append(getAuthor(i).getLastOnly());
i++;
}
if ((getNumberOfAuthors() > 2) && oxfordComma) {
result.append(',');
}
if (getNumberOfAuthors() > 1) {
result.append(" and ");
result.append(getAuthor(i).getLastOnly());
}
}
authorsLastOnly[abbrInt] = result.toString();
return authorsLastOnly[abbrInt];
}
/**
* Returns the list of authors separated by commas with first names after
* last name; first names are abbreviated or not depending on parameter. If
* the list consists of three or more authors, "and" is inserted before the
* last author's name.
* <p>
* <p>
* <ul>
* <li> "John Smith" ==> "Smith, John" or "Smith, J."</li>
* <li> "John Smith and Black Brown, Peter" ==> "Smith, John and Black
* Brown, Peter" or "Smith, J. and Black Brown, P."</li>
* <li> "John von Neumann and John Smith and Black Brown, Peter" ==> "von
* Neumann, John, Smith, John and Black Brown, Peter" or "von Neumann, J.,
* Smith, J. and Black Brown, P.".</li>
* </ul>
*
* @param abbreviate whether to abbreivate first names.
* @param oxfordComma Whether to put a comma before the and at the end.
* @return formatted list of authors.
* @see <a href="http://en.wikipedia.org/wiki/Serial_comma">serial comma for an detailed explaination about the Oxford comma.</a>
*/
public String getAsLastFirstNames(boolean abbreviate, boolean oxfordComma) {
int abbrInt = abbreviate ? 0 : 1;
abbrInt += oxfordComma ? 0 : 2;
// Check if we've computed this before:
if (authorsLastFirst[abbrInt] != null) {
return authorsLastFirst[abbrInt];
}
StringBuilder result = new StringBuilder();
if (!isEmpty()) {
result.append(getAuthor(0).getLastFirst(abbreviate));
int i = 1;
while (i < (getNumberOfAuthors() - 1)) {
result.append(", ");
result.append(getAuthor(i).getLastFirst(abbreviate));
i++;
}
if ((getNumberOfAuthors() > 2) && oxfordComma) {
result.append(',');
}
if (getNumberOfAuthors() > 1) {
result.append(" and ");
result.append(getAuthor(i).getLastFirst(abbreviate));
}
}
authorsLastFirst[abbrInt] = result.toString();
return authorsLastFirst[abbrInt];
}
@Override
public String toString() {
return authors.toString();
}
/**
* Returns the list of authors separated by "and"s with first names after
* last name; first names are not abbreviated.
* <p>
* <ul>
* <li>"John Smith" ==> "Smith, John"</li>
* <li>"John Smith and Black Brown, Peter" ==> "Smith, John and Black
* Brown, Peter"</li>
* <li>"John von Neumann and John Smith and Black Brown, Peter" ==> "von
* Neumann, John and Smith, John and Black Brown, Peter".</li>
* </ul>
*
* @return formatted list of authors.
*/
public String getAsLastFirstNamesWithAnd(boolean abbreviate) {
int abbrInt = abbreviate ? 0 : 1;
// Check if we've computed this before:
if (authorLastFirstAnds[abbrInt] != null) {
return authorLastFirstAnds[abbrInt];
}
authorLastFirstAnds[abbrInt] = getAuthors().stream().map(author -> author.getLastFirst(abbreviate))
.collect(Collectors.joining(" and "));
return authorLastFirstAnds[abbrInt];
}
public String getAsLastFirstFirstLastNamesWithAnd(boolean abbreviate) {
int abbrInt = abbreviate ? 0 : 1;
// Check if we've computed this before:
if (authorsLastFirstFirstLast[abbrInt] != null) {
return authorsLastFirstFirstLast[abbrInt];
}
StringBuilder result = new StringBuilder();
if (!isEmpty()) {
result.append(getAuthor(0).getLastFirst(abbreviate));
for (int i = 1; i < getNumberOfAuthors(); i++) {
result.append(" and ");
result.append(getAuthor(i).getFirstLast(abbreviate));
}
}
authorsLastFirstFirstLast[abbrInt] = result.toString();
return authorsLastFirstFirstLast[abbrInt];
}
/**
* Returns the list of authors separated by commas with first names before
* last name; first names are abbreviated or not depending on parameter. If
* the list consists of three or more authors, "and" is inserted before the
* last author's name.
* <p>
* <ul>
* <li>"John Smith" ==> "John Smith" or "J. Smith"</li>
* <li>"John Smith and Black Brown, Peter" ==> "John Smith and Peter Black
* Brown" or "J. Smith and P. Black Brown"</li>
* <li> "John von Neumann and John Smith and Black Brown, Peter" ==> "John
* von Neumann, John Smith and Peter Black Brown" or "J. von Neumann, J.
* Smith and P. Black Brown" </li>
* </ul>
*
* @param abbr whether to abbreivate first names.
* @param oxfordComma Whether to put a comma before the and at the end.
* @return formatted list of authors.
* @see <a href="http://en.wikipedia.org/wiki/Serial_comma">serial comma for an detailed explaination about the Oxford comma.</a>
*/
public String getAsFirstLastNames(boolean abbr, boolean oxfordComma) {
int abbrInt = abbr ? 0 : 1;
abbrInt += oxfordComma ? 0 : 2;
// Check if we've computed this before:
if (authorsFirstFirst[abbrInt] != null) {
return authorsFirstFirst[abbrInt];
}
StringBuilder result = new StringBuilder();
if (!isEmpty()) {
result.append(getAuthor(0).getFirstLast(abbr));
int i = 1;
while (i < (getNumberOfAuthors() - 1)) {
result.append(", ");
result.append(getAuthor(i).getFirstLast(abbr));
i++;
}
if ((getNumberOfAuthors() > 2) && oxfordComma) {
result.append(',');
}
if (getNumberOfAuthors() > 1) {
result.append(" and ");
result.append(getAuthor(i).getFirstLast(abbr));
}
}
authorsFirstFirst[abbrInt] = result.toString();
return authorsFirstFirst[abbrInt];
}
/**
* Compare this object with the given one.
* <p>
* Will return true iff the other object is an Author and all fields are identical on a string comparison.
*/
@Override
public boolean equals(Object o) {
if (!(o instanceof AuthorList)) {
return false;
}
AuthorList a = (AuthorList) o;
return this.authors.equals(a.authors);
}
@Override
public int hashCode() {
return Objects.hash(authors);
}
/**
* Returns the list of authors separated by "and"s with first names before
* last name; first names are not abbreviated.
* <p>
* <ul>
* <li>"John Smith" ==> "John Smith"</li>
* <li>"John Smith and Black Brown, Peter" ==> "John Smith and Peter Black
* Brown"</li>
* <li>"John von Neumann and John Smith and Black Brown, Peter" ==> "John
* von Neumann and John Smith and Peter Black Brown" </li>
* </li>
*
* @return formatted list of authors.
*/
public String getAsFirstLastNamesWithAnd() {
// Check if we've computed this before:
if (authorsFirstFirstAnds != null) {
return authorsFirstFirstAnds;
}
authorsFirstFirstAnds = getAuthors().stream().map(author -> author.getFirstLast(false))
.collect(Collectors.joining(" and "));
return authorsFirstFirstAnds;
}
/**
* Returns the list of authors in a form suitable for alphabetization. This
* means that last names come first, never preceded by "von" particles, and
* that any braces are removed. First names are abbreviated so the same name
* is treated similarly if abbreviated in one case and not in another. This
* form is not intended to be suitable for presentation, only for sorting.
* <p>
* <p>
* <ul>
* <li>"John Smith" ==> "Smith, J.";</li>
*
* @return formatted list of authors
*/
public String getForAlphabetization() {
if (authorsAlph != null) {
return authorsAlph;
}
authorsAlph = getAuthors().stream().map(Author::getNameForAlphabetization)
.collect(Collectors.joining(" and "));
return authorsAlph;
}
public void addAuthor(String first, String firstabbr, String von, String last, String jr) {
authors.add(new Author(first, firstabbr, von, last, jr));
}
}