package org.jabref.model.entry;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
/**
 * Parser for the raw content of a BibTeX author/editor field.
 * <p>
 * The input is split into names at the word "and" (any capitalisation) or at a semicolon; every
 * name is then split into its first, von, last and junior parts.
 * <p>
 * The current parsing position is kept in instance fields that are modified while parsing, so a
 * single instance should not be shared between concurrently running {@link #parse(String)} calls.
 */
public class AuthorListParser {

    /**
     * Names of TeX special-character commands (without the leading backslash). When such a command
     * appears before the first letter of a token, the case of the command name decides the case of
     * the token.
     */
    private static final Set<String> TEX_NAMES = new HashSet<>();

    static {
        TEX_NAMES.add("aa");
        TEX_NAMES.add("ae");
        TEX_NAMES.add("l");
        TEX_NAMES.add("o");
        TEX_NAMES.add("oe");
        TEX_NAMES.add("i");
        TEX_NAMES.add("AA");
        TEX_NAMES.add("AE");
        TEX_NAMES.add("L");
        TEX_NAMES.add("O");
        TEX_NAMES.add("OE");
        TEX_NAMES.add("j");
    }

    /** Kinds of tokens produced by {@link #getToken()}. */
    private enum TokenType {
        /** The end of the input has been reached. */
        EOF,
        /** A name separator: the word "and" (case-insensitive) or a semicolon. */
        AND,
        /** A comma. */
        COMMA,
        /** A word that is part of a name. */
        WORD
    }

    /** One word of a name together with the information needed to re-assemble name parts. */
    private static final class Token {

        /** The complete text of the word. */
        private final String text;

        /** The abbreviated form of the word (without trailing period). */
        private final String abbreviation;

        /** The character used to join this word with the following one: {@code ' '} or {@code '-'}. */
        private final char terminator;

        Token(String text, String abbreviation, char terminator) {
            this.text = text;
            this.abbreviation = abbreviation;
            this.terminator = terminator;
        }
    }

    /** The raw BibTeX author/editor field. */
    private String original;

    /**
     * Index in {@link #original} of the first character of the current token; for example, for the
     * token 'abc' in 'abc xyz' this is 0.
     */
    private int tokenStart;

    /**
     * Index in {@link #original} just past the last character of the current token; for example,
     * for the token 'abc' in 'abc xyz' this is 3. The next call of {@link #getToken()} starts
     * scanning here.
     */
    private int tokenEnd;

    /**
     * Exclusive end index of the token abbreviation (always: tokenStart &lt; tokenAbbr &lt;= tokenEnd);
     * only valid if {@link #getToken()} returned {@link TokenType#WORD}.
     */
    private int tokenAbbr;

    /** {@code '-'} if the current token is directly followed by a dash, otherwise a space. */
    private char tokenTerm;

    /** {@code true} if the current token counts as upper-case, {@code false} if it is lower-case. */
    private boolean tokenCase;

    /**
     * Parses the String containing person names and returns a list of person information.
     *
     * @param listOfNames the String containing the person names to be parsed; must not be null
     * @return a parsed list of persons
     * @throws NullPointerException if {@code listOfNames} is null
     */
    public AuthorList parse(String listOfNames) {
        Objects.requireNonNull(listOfNames);

        // initialization of parser
        original = listOfNames;
        tokenStart = 0;
        tokenEnd = 0;

        // Parse author by author; getAuthor() advances tokenStart until the input is exhausted
        List<Author> authors = new ArrayList<>(5); // 5 seems to be reasonable initial size
        while (tokenStart < original.length()) {
            getAuthor().ifPresent(authors::add);
        }
        return new AuthorList(authors);
    }

    /**
     * Parses one author name, i.e. consumes tokens up to and including the next name separator
     * (or the end of the input), and returns the preformatted information.
     *
     * @return Preformatted author name; <CODE>Optional.empty()</CODE> if author name is
     * empty.
     */
    private Optional<Author> getAuthor() {
        List<Token> tokens = new ArrayList<>();
        // All of the following are indices into 'tokens'; a negative value means "not seen"
        int vonStart = -1;     // first lower-case token (start of the 'von part')
        int lastStart = -1;    // first upper-case token after the 'von part' has started
        int commaFirst = -1;   // number of tokens read before the first comma
        int commaSecond = -1;  // number of tokens read before the second comma

        // First step: collect tokens in 'tokens' and calculate indices
        boolean continueLoop = true;
        while (continueLoop) {
            switch (getToken()) {
                case EOF:
                case AND:
                    continueLoop = false;
                    break;
                case COMMA:
                    if (commaFirst < 0) {
                        commaFirst = tokens.size();
                    } else if (commaSecond < 0) {
                        commaSecond = tokens.size();
                    }
                    break;
                case WORD: {
                    tokens.add(new Token(original.substring(tokenStart, tokenEnd),
                            original.substring(tokenStart, tokenAbbr), tokenTerm));
                    // The von/last boundaries are only determined before the first comma and
                    // only until the start of the last name has been found
                    if ((commaFirst >= 0) || (lastStart >= 0)) {
                        break;
                    }
                    int current = tokens.size() - 1;
                    if (vonStart < 0) {
                        if (!tokenCase) {
                            boolean afterHyphen = (current > 0) && (tokens.get(current - 1).terminator == '-');
                            // A lower-case token directly after a hyphen belongs to a first name
                            // which contained a hyphen; it does not start the 'von part'
                            if (!afterHyphen) {
                                vonStart = current;
                            }
                        }
                    } else if (tokenCase) {
                        lastStart = current;
                    }
                    break;
                }
                default:
                    break;
            }
        }

        // Second step: split name into parts (here: calculate indices of parts in 'tokens')
        if (tokens.isEmpty()) {
            return Optional.empty(); // no author information
        }

        int tokenCount = tokens.size();
        // the following negatives indicate absence of the corresponding part
        int firstPartStart = -1;
        int vonPartStart = -1;
        int lastPartStart = -1;
        int jrPartStart = -1;
        int firstPartEnd;
        int vonPartEnd = 0;
        int lastPartEnd = 0;
        int jrPartEnd = 0;

        if (commaFirst < 0) { // no commas
            if (vonStart < 0) { // no 'von part'
                // The last token is the last name ...
                lastPartEnd = tokenCount;
                lastPartStart = tokenCount - 1;
                // ... together with the token before it, if both are joined by a dash
                if ((tokenCount >= 2) && (tokens.get(tokenCount - 2).terminator == '-')) {
                    lastPartStart--;
                }
                firstPartEnd = lastPartStart;
                if (firstPartEnd > 0) {
                    firstPartStart = 0;
                }
            } else { // 'von part' is present
                if (lastStart >= 0) {
                    lastPartEnd = tokenCount;
                    lastPartStart = lastStart;
                    vonPartEnd = lastPartStart;
                } else {
                    vonPartEnd = tokenCount;
                }
                vonPartStart = vonStart;
                firstPartEnd = vonPartStart;
                if (firstPartEnd > 0) {
                    firstPartStart = 0;
                }
            }
        } else { // commas are present: it affects only 'first part' and 'junior part'
            firstPartEnd = tokenCount;
            if (commaSecond < 0) { // one comma
                if (commaFirst < firstPartEnd) {
                    firstPartStart = commaFirst;
                }
            } else { // two or more commas
                if (commaSecond < firstPartEnd) {
                    firstPartStart = commaSecond;
                }
                jrPartEnd = commaSecond;
                if (commaFirst < jrPartEnd) {
                    jrPartStart = commaFirst;
                }
            }
            if (vonStart == 0) { // 'von part' is present
                if (lastStart < 0) {
                    vonPartEnd = commaFirst;
                } else {
                    lastPartEnd = commaFirst;
                    lastPartStart = lastStart;
                    vonPartEnd = lastPartStart;
                }
                vonPartStart = 0;
            } else { // no 'von part'
                lastPartEnd = commaFirst;
                if (lastPartEnd > 0) {
                    lastPartStart = 0;
                }
            }
        }

        if ((firstPartStart == -1) && (lastPartStart == -1) && (vonPartStart != -1)) {
            // There is no first or last name, but we have a von part. This is likely
            // to indicate a single-entry name without an initial capital letter, such
            // as "unknown".
            // We make the von part the last name, to facilitate handling by last-name formatters:
            lastPartStart = vonPartStart;
            lastPartEnd = vonPartEnd;
            vonPartStart = -1;
            vonPartEnd = -1;
        }

        // Third step: do actual splitting, construct Author object
        String firstPart = firstPartStart < 0 ? null : concatTokens(tokens, firstPartStart, firstPartEnd, false);
        String firstAbbr = firstPartStart < 0 ? null : concatTokens(tokens, firstPartStart, firstPartEnd, true);
        String vonPart = vonPartStart < 0 ? null : concatTokens(tokens, vonPartStart, vonPartEnd, false);
        String lastPart = lastPartStart < 0 ? null : concatTokens(tokens, lastPartStart, lastPartEnd, false);
        String jrPart = jrPartStart < 0 ? null : concatTokens(tokens, jrPartStart, jrPartEnd, false);

        if ((firstPart != null) && (lastPart != null) && lastPart.equals(lastPart.toUpperCase(Locale.ROOT)) && (lastPart.length() < 5)) {
            // The last part is a small string in complete upper case, so interpret it as initial of the first name
            // This is the case for example in "Smith SH" which we think of as lastname=Smith and firstname=SH
            // The length < 5 constraint should allow for "Smith S.H." as input
            return Optional.of(new Author(lastPart, lastPart, vonPart, firstPart, jrPart));
        } else {
            return Optional.of(new Author(firstPart, firstAbbr, vonPart, lastPart, jrPart));
        }
    }

    /**
     * Concatenates a range of tokens. Consecutive tokens are joined by the terminator (space or
     * dash) stored with the preceding token. Callers always ensure that start &lt; end; thus,
     * there exists at least one token to be concatenated.
     *
     * @param tokens      the tokens of the current name
     * @param start       index of the first token to be concatenated
     * @param end         index of the first token not to be concatenated
     * @param abbreviated <CODE>true</CODE> -- concatenate the token abbreviations and add a period
     *                    after each of them, <CODE>false</CODE> -- concatenate the full tokens
     * @return the result of concatenation.
     */
    private static String concatTokens(List<Token> tokens, int start, int end, boolean abbreviated) {
        StringBuilder result = new StringBuilder();
        // Here we always have start < end, so the first token is appended unconditionally
        Token first = tokens.get(start);
        result.append(abbreviated ? first.abbreviation : first.text);
        if (abbreviated) {
            result.append('.');
        }
        for (int i = start + 1; i < end; i++) {
            result.append(tokens.get(i - 1).terminator);
            Token token = tokens.get(i);
            result.append(abbreviated ? token.abbreviation : token.text);
            if (abbreviated) {
                result.append('.');
            }
        }
        return result.toString();
    }

    /**
     * Parses the next token.
     * <p>
     * The string being parsed is stored in the field <CODE>original</CODE>, and the position
     * which parsing has to start from is stored in the field <CODE>tokenEnd</CODE>; thus,
     * <CODE>tokenEnd</CODE> has to be set to 0 before the first invocation. The method updates
     * <CODE>tokenEnd</CODE>; thus, subsequent invocations do not require any additional variable
     * settings.
     * <p>
     * Leading whitespace, tildes and dashes are skipped. A word ends at a comma, semicolon, tilde,
     * dash or whitespace character that is not enclosed in braces.
     * <p>
     * The type of the token is returned; if it is <CODE>TokenType.WORD</CODE>, additional
     * information is given in the fields <CODE>tokenStart</CODE>, <CODE>tokenEnd</CODE>,
     * <CODE>tokenAbbr</CODE>, <CODE>tokenTerm</CODE>, and <CODE>tokenCase</CODE>; namely:
     * <CODE>original.substring(tokenStart, tokenEnd)</CODE> is the text of the token,
     * <CODE>original.substring(tokenStart, tokenAbbr)</CODE> is the token abbreviation,
     * <CODE>tokenTerm</CODE> contains the token terminator (space or dash), and
     * <CODE>tokenCase</CODE> is <CODE>true</CODE> if the token is upper-case and
     * <CODE>false</CODE> if the token is lower-case.
     *
     * @return <CODE>TokenType.EOF</CODE> -- no more tokens, <CODE>TokenType.COMMA</CODE> --
     * token is comma, <CODE>TokenType.AND</CODE> -- token is the word
     * "and" (or "And", or "aND", etc.) or a semicolon, <CODE>TokenType.WORD</CODE> --
     * token is a word; additional information is given in the fields described above.
     */
    private TokenType getToken() {
        // Skip separators in front of the token
        tokenStart = tokenEnd;
        while (tokenStart < original.length()) {
            char c = original.charAt(tokenStart);
            if (!((c == '~') || (c == '-') || Character.isWhitespace(c))) {
                break;
            }
            tokenStart++;
        }
        tokenEnd = tokenStart;
        if (tokenStart >= original.length()) {
            return TokenType.EOF;
        }
        if (original.charAt(tokenStart) == ',') {
            tokenEnd++;
            return TokenType.COMMA;
        }
        // Semicolon is considered to separate names like "and"
        if (original.charAt(tokenStart) == ';') {
            tokenEnd++;
            return TokenType.AND;
        }

        tokenAbbr = -1;
        tokenTerm = ' ';
        tokenCase = true;
        int bracesLevel = 0;
        int currentBackslash = -1; // index of the backslash of a TeX command being read, or -1
        boolean firstLetterIsFound = false;
        while (tokenEnd < original.length()) {
            char c = original.charAt(tokenEnd);
            if (c == '{') {
                bracesLevel++;
            }
            // The abbreviation ends at the first position after the first letter that is outside
            // of braces or that opens a new brace group
            if (firstLetterIsFound && (tokenAbbr < 0) && ((bracesLevel == 0) || (c == '{'))) {
                tokenAbbr = tokenEnd;
            }
            if ((c == '}') && (bracesLevel > 0)) {
                bracesLevel--;
            }
            if (!firstLetterIsFound && (currentBackslash < 0) && Character.isLetter(c)) {
                if (bracesLevel == 0) {
                    tokenCase = Character.isUpperCase(c);
                } else {
                    // If this is a particle in braces, always treat it as if it starts with
                    // an upper case letter. Otherwise a name such as "{van den Bergen}, Hans"
                    // will not yield a proper last name:
                    tokenCase = true;
                }
                firstLetterIsFound = true;
            }
            if ((currentBackslash >= 0) && !Character.isLetter(c)) {
                // A TeX command name has just ended; a known special character counts as the
                // first letter of the token
                if (!firstLetterIsFound) {
                    String texCmdName = original.substring(currentBackslash + 1, tokenEnd);
                    if (TEX_NAMES.contains(texCmdName)) {
                        tokenCase = Character.isUpperCase(texCmdName.charAt(0));
                        firstLetterIsFound = true;
                    }
                }
                currentBackslash = -1;
            }
            if (c == '\\') {
                currentBackslash = tokenEnd;
            }
            if ((bracesLevel == 0) && ((",;~-".indexOf(c) != -1) || Character.isWhitespace(c))) {
                break;
            }
            tokenEnd++;
        }
        if (tokenAbbr < 0) {
            tokenAbbr = tokenEnd;
        }
        if ((tokenEnd < original.length()) && (original.charAt(tokenEnd) == '-')) {
            tokenTerm = '-';
        }
        if ("and".equalsIgnoreCase(original.substring(tokenStart, tokenEnd))) {
            return TokenType.AND;
        } else {
            return TokenType.WORD;
        }
    }
}