AuthorListParser.java example

Explorer
jabref-master
- src
package org.jabref.model.entry;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;

public class AuthorListParser {

    private static final int TOKEN_GROUP_LENGTH = 4; // number of entries for a token

    // the following are offsets of an entry in a group of entries for one token
    private static final int OFFSET_TOKEN = 0; // String -- token itself;

    private static final int OFFSET_TOKEN_ABBR = 1; // String -- token abbreviation;

    private static final int OFFSET_TOKEN_TERM = 2; // Character -- token terminator (either " " or
    // "-") comma)

    // Token types (returned by getToken procedure)
    private static final int TOKEN_EOF = 0;

    private static final int TOKEN_AND = 1;

    private static final int TOKEN_COMMA = 2;

    private static final int TOKEN_WORD = 3;

    // Constant HashSet containing names of TeX special characters
    private static final Set<String> TEX_NAMES = new HashSet<>();

    /** the raw bibtex author/editor field */
    private String original;

    /** index of the start in original, for example to point to 'abc' in 'abc xyz', tokenStart=2 */
    private int tokenStart;

    /** index of the end in original, for example to point to 'abc' in 'abc xyz', tokenEnd=5 */
    private int tokenEnd;

    /** end of token abbreviation (always: tokenStart < tokenAbbr <= tokenEnd), only valid if getToken returns TOKEN_WORD */
    private int tokenAbbr;


    /** either space of dash */
    private char tokenTerm;

    /** true if upper-case token, false if lower-case */
    private boolean tokenCase;

    static {
        TEX_NAMES.add("aa");
        TEX_NAMES.add("ae");
        TEX_NAMES.add("l");
        TEX_NAMES.add("o");
        TEX_NAMES.add("oe");
        TEX_NAMES.add("i");
        TEX_NAMES.add("AA");
        TEX_NAMES.add("AE");
        TEX_NAMES.add("L");
        TEX_NAMES.add("O");
        TEX_NAMES.add("OE");
        TEX_NAMES.add("j");
    }

    /**
     * Parses the String containing person names and returns a list of person information.
     *
     * @param listOfNames the String containing the person names to be parsed
     * @return a parsed list of persons
     */
    public AuthorList parse(String listOfNames) {

        Objects.requireNonNull(listOfNames);

        // initialization of parser
        original = listOfNames;
        tokenStart = 0;
        tokenEnd = 0;

        // Parse author by author
        List<Author> authors = new ArrayList<>(5); // 5 seems to be reasonable initial size
        while (tokenStart < original.length()) {
            getAuthor().ifPresent(authors::add);
        }
        return new AuthorList(authors);
    }

    /**
     * Parses one author name and returns preformatted information.
     *
     * @return Preformatted author name; <CODE>Optional.empty()</CODE> if author name is
     * empty.
     */
    private Optional<Author> getAuthor() {

        List<Object> tokens = new ArrayList<>(); // initialization
        int vonStart = -1;
        int lastStart = -1;
        int commaFirst = -1;
        int commaSecond = -1;

        // First step: collect tokens in 'tokens' Vector and calculate indices
        boolean continueLoop = true;
        while (continueLoop) {
            int token = getToken();
            switch (token) {
            case TOKEN_EOF:
            case TOKEN_AND:
                continueLoop = false;
                break;
            case TOKEN_COMMA:
                if (commaFirst < 0) {
                    commaFirst = tokens.size();
                } else if (commaSecond < 0) {
                    commaSecond = tokens.size();
                }
                break;
            case TOKEN_WORD:
                tokens.add(original.substring(tokenStart, tokenEnd));
                tokens.add(original.substring(tokenStart, tokenAbbr));
                tokens.add(tokenTerm);
                tokens.add(tokenCase);
                if (commaFirst >= 0) {
                    break;
                }
                if (lastStart >= 0) {
                    break;
                }
                if (vonStart < 0) {
                    if (!tokenCase) {
                        int previousTermToken = (tokens.size() - TOKEN_GROUP_LENGTH - TOKEN_GROUP_LENGTH) + OFFSET_TOKEN_TERM;
                        if ((previousTermToken >= 0) && tokens.get(previousTermToken).equals('-')) {
                            // We are in a first name which contained a hyphen
                            break;
                        }
                        vonStart = tokens.size() - TOKEN_GROUP_LENGTH;
                        break;
                    }
                } else if ((lastStart < 0) && tokenCase) {
                    lastStart = tokens.size() - TOKEN_GROUP_LENGTH;
                    break;
                }
                break;
            default:
                break;
            }
        }

        // Second step: split name into parts (here: calculate indices
        // of parts in 'tokens' Vector)
        if (tokens.isEmpty()) {
            return Optional.empty(); // no author information
        }

        // the following negatives indicate absence of the corresponding part
        int firstPartStart = -1;
        int vonPartStart = -1;
        int lastPartStart = -1;
        int jrPartStart = -1;
        int firstPartEnd;
        int vonPartEnd = 0;
        int lastPartEnd = 0;
        int jrPartEnd = 0;
        if (commaFirst < 0) { // no commas
            if (vonStart < 0) { // no 'von part'
                lastPartEnd = tokens.size();
                lastPartStart = tokens.size() - TOKEN_GROUP_LENGTH;
                int index = (tokens.size() - (2 * TOKEN_GROUP_LENGTH)) + OFFSET_TOKEN_TERM;
                if (index > 0) {
                    Character ch = (Character) tokens.get(index);
                    if (ch == '-') {
                        lastPartStart -= TOKEN_GROUP_LENGTH;
                    }
                }
                firstPartEnd = lastPartStart;
                if (firstPartEnd > 0) {
                    firstPartStart = 0;
                }
            } else { // 'von part' is present
                if (lastStart >= 0) {
                    lastPartEnd = tokens.size();
                    lastPartStart = lastStart;
                    vonPartEnd = lastPartStart;
                } else {
                    vonPartEnd = tokens.size();
                }
                vonPartStart = vonStart;
                firstPartEnd = vonPartStart;
                if (firstPartEnd > 0) {
                    firstPartStart = 0;
                }
            }
        } else { // commas are present: it affects only 'first part' and
            // 'junior part'
            firstPartEnd = tokens.size();
            if (commaSecond < 0) { // one comma
                if (commaFirst < firstPartEnd) {
                    firstPartStart = commaFirst;
                }
            } else { // two or more commas
                if (commaSecond < firstPartEnd) {
                    firstPartStart = commaSecond;
                }
                jrPartEnd = commaSecond;
                if (commaFirst < jrPartEnd) {
                    jrPartStart = commaFirst;
                }
            }
            if (vonStart == 0) { // 'von part' is present
                if (lastStart < 0) {
                    vonPartEnd = commaFirst;
                } else {
                    lastPartEnd = commaFirst;
                    lastPartStart = lastStart;
                    vonPartEnd = lastPartStart;
                }
                vonPartStart = 0;
            } else { // no 'von part'
                lastPartEnd = commaFirst;
                if (lastPartEnd > 0) {
                    lastPartStart = 0;
                }
            }
        }

        if ((firstPartStart == -1) && (lastPartStart == -1) && (vonPartStart != -1)) {
            // There is no first or last name, but we have a von part. This is likely
            // to indicate a single-entry name without an initial capital letter, such
            // as "unknown".
            // We make the von part the last name, to facilitate handling by last-name formatters:
            lastPartStart = vonPartStart;
            lastPartEnd = vonPartEnd;
            vonPartStart = -1;
            vonPartEnd = -1;
        }

        // Third step: do actual splitting, construct Author object
        String firstPart = firstPartStart < 0 ? null : concatTokens(tokens, firstPartStart, firstPartEnd, OFFSET_TOKEN,
                false);
        String firstAbbr = firstPartStart < 0 ? null : concatTokens(tokens, firstPartStart, firstPartEnd,
                OFFSET_TOKEN_ABBR, true);
        String vonPart = vonPartStart < 0 ? null : concatTokens(tokens, vonPartStart, vonPartEnd, OFFSET_TOKEN, false);
        String lastPart = lastPartStart < 0 ? null : concatTokens(tokens, lastPartStart, lastPartEnd, OFFSET_TOKEN,
                false);
        String jrPart = jrPartStart < 0 ? null : concatTokens(tokens, jrPartStart, jrPartEnd, OFFSET_TOKEN, false);

        if ((firstPart != null) && (lastPart != null) && lastPart.equals(lastPart.toUpperCase(Locale.ROOT)) && (lastPart.length() < 5)) {
            // The last part is a small string in complete upper case, so interpret it as initial of the first name
            // This is the case for example in "Smith SH" which we think of as lastname=Smith and firstname=SH
            // The length < 5 constraint should allow for "Smith S.H." as input
            return Optional.of(new Author(lastPart, lastPart, vonPart, firstPart, jrPart));
        } else {
            return Optional.of(new Author(firstPart, firstAbbr, vonPart, lastPart, jrPart));
        }
    }

    /**
     * Concatenates list of tokens from 'tokens' Vector. Tokens are separated by
     * spaces or dashes, depending on stored in 'tokens'. Callers always ensure
     * that start < end; thus, there exists at least one token to be
     * concatenated.
     *
     * @param start     index of the first token to be concatenated in 'tokens' Vector
     *                  (always divisible by TOKEN_GROUP_LENGTH).
     * @param end       index of the first token not to be concatenated in 'tokens'
     *                  Vector (always divisible by TOKEN_GROUP_LENGTH).
     * @param offset    offset within token group (used to request concatenation of
     *                  either full tokens or abbreviation).
     * @param dotAfter <CODE>true</CODE> -- add period after each token, <CODE>false</CODE> --
     *                  do not add.
     * @return the result of concatenation.
     */
    private String concatTokens(List<Object> tokens, int start, int end, int offset, boolean dotAfter) {
        StringBuilder result = new StringBuilder();
        // Here we always have start < end
        result.append((String) tokens.get(start + offset));
        if (dotAfter) {
            result.append('.');
        }
        int updatedStart = start + TOKEN_GROUP_LENGTH;
        while (updatedStart < end) {
            result.append(tokens.get((updatedStart - TOKEN_GROUP_LENGTH) + OFFSET_TOKEN_TERM));
            result.append((String) tokens.get(updatedStart + offset));
            if (dotAfter) {
                result.append('.');
            }
            updatedStart += TOKEN_GROUP_LENGTH;
        }
        return result.toString();
    }

    /**
     * Parses the next token.
     * <p>
     * The string being parsed is stored in global variable <CODE>orig</CODE>,
     * and position which parsing has to start from is stored in global variable
     * <CODE>token_end</CODE>; thus, <CODE>token_end</CODE> has to be set
     * to 0 before the first invocation. Procedure updates <CODE>token_end</CODE>;
     * thus, subsequent invocations do not require any additional variable
     * settings.
     * <p>
     * The type of the token is returned; if it is <CODE>TOKEN_WORD</CODE>,
     * additional information is given in global variables <CODE>token_start</CODE>,
     * <CODE>token_end</CODE>, <CODE>token_abbr</CODE>, <CODE>token_term</CODE>,
     * and <CODE>token_case</CODE>; namely: <CODE>orig.substring(token_start,token_end)</CODE>
     * is the text of the token, <CODE>orig.substring(token_start,token_abbr)</CODE>
     * is the token abbreviation, <CODE>token_term</CODE> contains token
     * terminator (space or dash), and <CODE>token_case</CODE> is <CODE>true</CODE>,
     * if token is upper-case and <CODE>false</CODE> if token is lower-case.
     *
     * @return <CODE>TOKEN_EOF</CODE> -- no more tokens, <CODE>TOKEN_COMMA</CODE> --
     * token is comma, <CODE>TOKEN_AND</CODE> -- token is the word
     * "and" (or "And", or "aND", etc.) or a semicolon, <CODE>TOKEN_WORD</CODE> --
     * token is a word; additional information is given in global
     * variables <CODE>token_start</CODE>, <CODE>token_end</CODE>,
     * <CODE>token_abbr</CODE>, <CODE>token_term</CODE>, and
     * <CODE>token_case</CODE>.
     */
    private int getToken() {
        tokenStart = tokenEnd;
        while (tokenStart < original.length()) {
            char c = original.charAt(tokenStart);
            if (!((c == '~') || (c == '-') || Character.isWhitespace(c))) {
                break;
            }
            tokenStart++;
        }
        tokenEnd = tokenStart;
        if (tokenStart >= original.length()) {
            return TOKEN_EOF;
        }
        if (original.charAt(tokenStart) == ',') {
            tokenEnd++;
            return TOKEN_COMMA;
        }
        // Semicolon is considered to separate names like "and"
        if (original.charAt(tokenStart) == ';') {
            tokenEnd++;
            return TOKEN_AND;
        }
        tokenAbbr = -1;
        tokenTerm = ' ';
        tokenCase = true;
        int bracesLevel = 0;
        int currentBackslash = -1;
        boolean firstLetterIsFound = false;
        while (tokenEnd < original.length()) {
            char c = original.charAt(tokenEnd);
            if (c == '{') {
                bracesLevel++;
            }
            if (firstLetterIsFound && (tokenAbbr < 0) && ((bracesLevel == 0) || (c == '{'))) {
                tokenAbbr = tokenEnd;
            }
            if ((c == '}') && (bracesLevel > 0)) {
                bracesLevel--;
            }
            if (!firstLetterIsFound && (currentBackslash < 0) && Character.isLetter(c)) {
                if (bracesLevel == 0) {
                    tokenCase = Character.isUpperCase(c);
                } else {
                    // If this is a particle in braces, always treat it as if it starts with
                    // an upper case letter. Otherwise a name such as "{van den Bergen}, Hans"
                    // will not yield a proper last name:
                    tokenCase = true;
                }
                firstLetterIsFound = true;
            }
            if ((currentBackslash >= 0) && !Character.isLetter(c)) {
                if (!firstLetterIsFound) {
                    String texCmdName = original.substring(currentBackslash + 1, tokenEnd);
                    if (TEX_NAMES.contains(texCmdName)) {
                        tokenCase = Character.isUpperCase(texCmdName.charAt(0));
                        firstLetterIsFound = true;
                    }
                }
                currentBackslash = -1;
            }
            if (c == '\\') {
                currentBackslash = tokenEnd;
            }
            if ((bracesLevel == 0) && ((",;~-".indexOf(c) != -1) || Character.isWhitespace(c))) {
                break;
            }
            tokenEnd++;
        }
        if (tokenAbbr < 0) {
            tokenAbbr = tokenEnd;
        }
        if ((tokenEnd < original.length()) && (original.charAt(tokenEnd) == '-')) {
            tokenTerm = '-';
        }
        if ("and".equalsIgnoreCase(original.substring(tokenStart, tokenEnd))) {
            return TOKEN_AND;
        } else {
            return TOKEN_WORD;
        }
    }

}