package org.jabref.logic.importer.fileformat;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.Importer;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.importer.fetcher.DoiFetcher;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.FileExtensions;
import org.jabref.logic.xmp.EncryptedPdfsNotSupportedException;
import org.jabref.logic.xmp.XMPUtil;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.BibtexEntryTypes;
import org.jabref.model.entry.EntryType;
import org.jabref.model.entry.FieldName;
import org.jabref.model.entry.identifier.DOI;

import com.google.common.base.Strings;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * PdfContentImporter parses data of the first page of the PDF and creates a BibTeX entry.
 * <p>
 * Currently, Springer and IEEE formats are supported.
 * <p>
 * Integrating XMP support is future work.
 */
public class PdfContentImporter extends Importer {

    private static final Pattern YEAR_EXTRACT_PATTERN = Pattern.compile("\\d{4}");

    private final ImportFormatPreferences importFormatPreferences;

    // the input, split into several lines
    private String[] lines;
    // current index in lines
    private int i;

    private String curString;
    private String year;

    public PdfContentImporter(ImportFormatPreferences importFormatPreferences) {
        this.importFormatPreferences = importFormatPreferences;
    }

    /**
     * Removes all non-letter characters at the end.
     * <p>
     * EXCEPTION: a closing bracket is NOT removed.
     * </p>
     * <p>
     * TODO: Additionally replace multiple subsequent spaces by one space, which will cause a rename of this method
     * </p>
     */
    private static String removeNonLettersAtEnd(String input) {
        String result = input.trim();
        if (result.isEmpty()) {
            return result;
        }
        char lastC = result.charAt(result.length() - 1);
        while (!Character.isLetter(lastC) && (lastC != ')')) {
            // if there is an asterisk, a dot or something else at the end: remove it
            result = result.substring(0, result.length() - 1);
            if (result.isEmpty()) {
                break;
            } else {
                lastC = result.charAt(result.length() - 1);
            }
        }
        return result;
    }
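
    /**
     * Tries to convert a line of author names into the BibTeX form {@code "A and B and ..."}.
     * Both comma-separated and space-separated name lists are handled; "et al." is mapped to "others".
     * E.g. "Matthias Schrepfer1, Johannes Wolf1, Jan Mendling1, and Hajo A. Reijers2" becomes
     * "Matthias Schrepfer and Johannes Wolf and Jan Mendling and Hajo A. Reijers".
     */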
    private static String streamlineNames(String names) {
        // TODO: replace with NormalizeNamesFormatter?!
        String res;
        // supported formats:
        //   Matthias Schrepfer1, Johannes Wolf1, Jan Mendling1, and Hajo A. Reijers2
        if (names.contains(",")) {
            String[] splitNames = names.split(",");
            res = "";
            boolean isFirst = true;
            for (String splitName : splitNames) {
                String curName = removeNonLettersAtEnd(splitName);
                if (curName.indexOf("and") == 0) {
                    // skip a possible "and" between names
                    curName = curName.substring(3).trim();
                } else {
                    int posAnd = curName.indexOf(" and ");
                    if (posAnd >= 0) {
                        String nameBefore = curName.substring(0, posAnd);
                        // cannot be the first name as "," is contained in the string
                        res = res.concat(" and ").concat(removeNonLettersAtEnd(nameBefore));
                        curName = curName.substring(posAnd + 5);
                    }
                }
                if (!"".equals(curName)) {
                    if ("et al.".equalsIgnoreCase(curName)) {
                        curName = "others";
                    }
                    if (isFirst) {
                        isFirst = false;
                    } else {
                        res = res.concat(" and ");
                    }
                    res = res.concat(curName);
                }
            }
        } else {
            // assumption: names separated by space
            String[] splitNames = names.split(" ");
            if (splitNames.length == 0) {
                // empty names... something was really wrong...
                return "";
            }
            boolean workedOnFirstOrMiddle = false;
            boolean isFirst = true;
            int i = 0;
            res = "";
            do {
                if (workedOnFirstOrMiddle) {
                    // last item was a first or a middle name
                    // we have to check whether we are on a middle name
                    // if not, just add the item as last name and add an "and"
                    if (splitNames[i].contains(".")) {
                        // we found a middle name
                        res = res.concat(splitNames[i]).concat(" ");
                    } else {
                        // last name found
                        res = res.concat(removeNonLettersAtEnd(splitNames[i]));
                        if (!splitNames[i].isEmpty() && Character.isLowerCase(splitNames[i].charAt(0))) {
                            // it is probably "van", "vom", ...
                            // we just rely on the fact that these things are written in lower case letters
                            // do NOT finish the name
                            res = res.concat(" ");
                        } else {
                            // finish this name
                            workedOnFirstOrMiddle = false;
                        }
                    }
                } else {
                    if ("and".equalsIgnoreCase(splitNames[i])) {
                        // do nothing, just increment i at the end of this iteration
                    } else {
                        if (isFirst) {
                            isFirst = false;
                        } else {
                            res = res.concat(" and ");
                        }
                        if ("et".equalsIgnoreCase(splitNames[i]) && (splitNames.length > (i + 1))
                                && "al.".equalsIgnoreCase(splitNames[i + 1])) {
                            res = res.concat("others");
                            break;
                        } else {
                            res = res.concat(splitNames[i]).concat(" ");
                            workedOnFirstOrMiddle = true;
                        }
                    }
                }
                i++;
            } while (i < splitNames.length);
        }
        return res;
    }

    private static String streamlineTitle(String title) {
        return removeNonLettersAtEnd(title);
    }

    @Override
    public boolean isRecognizedFormat(BufferedReader reader) throws IOException {
        Objects.requireNonNull(reader);
        return false;
    }

    @Override
    public ParserResult importDatabase(BufferedReader reader) throws IOException {
        Objects.requireNonNull(reader);
        throw new UnsupportedOperationException(
                "PdfContentImporter does not support importDatabase(BufferedReader reader). "
                        + "Instead use importDatabase(Path filePath, Charset defaultEncoding).");
    }
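
    /**
     * Parses the first page of the given PDF. If a DOI is found in the page text, the entry is
     * fetched via {@link DoiFetcher}; otherwise the layout-based heuristics below are used to
     * extract title, authors, abstract, keywords, and venue information.
     */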
    @Override
    public ParserResult importDatabase(Path filePath, Charset defaultEncoding) {
        final ArrayList<BibEntry> result = new ArrayList<>(1);

        try (FileInputStream fileStream = new FileInputStream(filePath.toFile());
                PDDocument document = XMPUtil.loadWithAutomaticDecryption(fileStream)) {
            String firstPageContents = getFirstPageContents(document);

            Optional<DOI> doi = DOI.findInText(firstPageContents);
            if (doi.isPresent()) {
                ParserResult parserResult = new ParserResult(result);
                Optional<BibEntry> entry = new DoiFetcher(importFormatPreferences).performSearchById(doi.get().getDOI());
                entry.ifPresent(parserResult.getDatabase()::insertEntry);
                return parserResult;
            }

            // idea: split[] contains the different lines
            // blocks are separated by empty lines
            // treat each block
            //   or do special treatment at authors (which are not broken)
            //   therefore, we do a line-based and not a block-based splitting
            // i points to the current line
            // curString (mostly) contains the current block
            //   the different lines are joined into one and thereby separated by " "

            lines = firstPageContents.split(System.lineSeparator());

            proceedToNextNonEmptyLine();
            if (i >= lines.length) {
                // PDF could not be parsed or is empty
                // return empty list
                return new ParserResult();
            }

            // we start at the current line
            curString = lines[i];
            // i might get incremented later and curString modified, too
            i = i + 1;

            String author;
            String editor = null;
            String abstractT = null;
            String keywords = null;
            String title;
            String conference = null;
            String DOI = null;
            String series = null;
            String volume = null;
            String number = null;
            String pages = null;
            // year is a class variable as the method extractYear() uses it
            String publisher = null;

            EntryType type = BibtexEntryTypes.INPROCEEDINGS;

            if (curString.length() > 4) {
                // special case: possibly conference as first line on the page
                extractYear();
                if (curString.contains("Conference")) {
                    fillCurStringWithNonEmptyLines();
                    conference = curString;
                    curString = "";
                } else {
                    // e.g. Copyright (c) 1998 by the Genetics Society of America
                    // future work: get year using RegEx
                    String lower = curString.toLowerCase(Locale.ROOT);
                    if (lower.contains("copyright")) {
                        fillCurStringWithNonEmptyLines();
                        publisher = curString;
                        curString = "";
                    }
                }
            }
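
            // from here on, the heuristics assume the following page layout:
            // title block, author block, optional abstract/keywords blocks,
            // and publication data (DOI, venue, year) in the last block(s)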
            // start: title
            fillCurStringWithNonEmptyLines();
            title = streamlineTitle(curString);
            curString = "";
            // i points to the next non-empty line

            // after title: authors
            author = null;
            while ((i < lines.length) && !"".equals(lines[i])) {
                // author names are unlikely to be split across different lines
                // treat them line by line
                curString = streamlineNames(lines[i]);
                if (author == null) {
                    author = curString;
                } else {
                    if ("".equals(curString)) {
                        // if lines[i] is "and" then "" is returned by streamlineNames -> do nothing
                    } else {
                        author = author.concat(" and ").concat(curString);
                    }
                }
                i++;
            }
            curString = "";
            i++;

            // then, abstract and keywords follow
            while (i < lines.length) {
                curString = lines[i];
                if ((curString.length() >= "Abstract".length()) && "Abstract".equalsIgnoreCase(curString.substring(0, "Abstract".length()))) {
                    if (curString.length() == "Abstract".length()) {
                        // only the word "abstract" found -- skip the line
                        curString = "";
                    } else {
                        curString = curString.substring("Abstract".length() + 1).trim().concat(System.lineSeparator());
                    }
                    i++;
                    // fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line separator
                    // whereas we need a line break as separator
                    while ((i < lines.length) && !"".equals(lines[i])) {
                        curString = curString.concat(lines[i]).concat(System.lineSeparator());
                        i++;
                    }
                    abstractT = curString.trim();
                    i++;
                } else if ((curString.length() >= "Keywords".length()) && "Keywords".equalsIgnoreCase(curString.substring(0, "Keywords".length()))) {
                    if (curString.length() == "Keywords".length()) {
                        // only the word "Keywords" found -- skip the line
                        curString = "";
                    } else {
                        curString = curString.substring("Keywords".length() + 1).trim();
                    }
                    i++;
                    fillCurStringWithNonEmptyLines();
                    keywords = removeNonLettersAtEnd(curString);
                } else {
                    String lower = curString.toLowerCase(Locale.ROOT);

                    int pos = lower.indexOf("technical");
                    if (pos >= 0) {
                        type = BibtexEntryTypes.TECHREPORT;
                        pos = curString.trim().lastIndexOf(' ');
                        if (pos >= 0) {
                            // assumption: the last character of curString is NOT ' '
                            // otherwise pos+1 leads to an out-of-bounds exception
                            number = curString.substring(pos + 1);
                        }
                    }

                    i++;
                    proceedToNextNonEmptyLine();
                }
            }

            i = lines.length - 1;

            // last block: DOI, detailed information
            // sometimes, this information is in the third-last block etc...
            // therefore, read until the beginning of the file
            while (i >= 0) {
                readLastBlock();
                // i now points to the block before or is -1
                // curString contains the last block, separated by " "

                extractYear();

                int pos = curString.indexOf("(Eds.)");
                if ((pos >= 0) && (publisher == null)) {
                    // looks like a Springer last line
                    // e.g.: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
                    publisher = "Springer";
                    editor = streamlineNames(curString.substring(0, pos - 1));
                    // +2 because of the ":" after "(Eds.)" and the subsequent space
                    curString = curString.substring(pos + "(Eds.)".length() + 2);
                    String[] springerSplit = curString.split(", ");
                    if (springerSplit.length >= 4) {
                        conference = springerSplit[0];

                        String seriesData = springerSplit[1];
                        int lastSpace = seriesData.lastIndexOf(' ');
                        series = seriesData.substring(0, lastSpace);
                        volume = seriesData.substring(lastSpace + 1);

                        pages = springerSplit[2].substring(4);

                        if (springerSplit[3].length() >= 4) {
                            year = springerSplit[3].substring(0, 4);
                        }
                    }
                } else {
                    if (DOI == null) {
                        pos = curString.indexOf("DOI");
                        if (pos < 0) {
                            pos = curString.indexOf(FieldName.DOI);
                        }
                        if (pos >= 0) {
                            pos += 3;
                            char delimiter = curString.charAt(pos);
                            if ((delimiter == ':') || (delimiter == ' ')) {
                                pos++;
                            }
                            int nextSpace = curString.indexOf(' ', pos);
                            if (nextSpace > 0) {
                                DOI = curString.substring(pos, nextSpace);
                            } else {
                                DOI = curString.substring(pos);
                            }
                        }
                    }

                    if ((publisher == null) && curString.contains("IEEE")) {
                        // IEEE has the conference things at the end
                        publisher = "IEEE";

                        // year is extracted by extractYear
                        // otherwise, we could determine it as follows:
                        // String yearStr = curString.substring(curString.length() - 4);
                        // if (isYear(yearStr)) {
                        //     year = yearStr;
                        // }

                        if (conference == null) {
                            pos = curString.indexOf('$');
                            if (pos > 0) {
                                // we found the price
                                // before the price, the ISSN is stated
                                // skip that
                                pos -= 2;
                                while ((pos >= 0) && (curString.charAt(pos) != ' ')) {
                                    pos--;
                                }
                                if (pos > 0) {
                                    conference = curString.substring(0, pos);
                                }
                            }
                        }
                    }
                }
            }

            BibEntry entry = new BibEntry();
            entry.setType(type);

            // TODO: institution parsing missing

            if (author != null) {
                entry.setField(FieldName.AUTHOR, author);
            }
            if (editor != null) {
                entry.setField(FieldName.EDITOR, editor);
            }
            if (abstractT != null) {
                entry.setField(FieldName.ABSTRACT, abstractT);
            }
            if (!Strings.isNullOrEmpty(keywords)) {
                entry.setField(FieldName.KEYWORDS, keywords);
            }
            if (title != null) {
                entry.setField(FieldName.TITLE, title);
            }
            if (conference != null) {
                entry.setField(FieldName.BOOKTITLE, conference);
            }
            if (DOI != null) {
                entry.setField(FieldName.DOI, DOI);
            }
            if (series != null) {
                entry.setField(FieldName.SERIES, series);
            }
            if (volume != null) {
                entry.setField(FieldName.VOLUME, volume);
            }
            if (number != null) {
                entry.setField(FieldName.NUMBER, number);
            }
            if (pages != null) {
                entry.setField(FieldName.PAGES, pages);
            }
            if (year != null) {
                entry.setField(FieldName.YEAR, year);
            }
            if (publisher != null) {
                entry.setField(FieldName.PUBLISHER, publisher);
            }

            result.add(entry);
        } catch (EncryptedPdfsNotSupportedException e) {
            return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported."));
        } catch (IOException exception) {
            return ParserResult.fromError(exception);
        } catch (FetcherException e) {
            return ParserResult.fromErrorMessage(e.getMessage());
        }

        return new ParserResult(result);
    }

    private String getFirstPageContents(PDDocument document) throws IOException {
        PDFTextStripper stripper = new PDFTextStripper();

        stripper.setStartPage(1);
        stripper.setEndPage(1);
        stripper.setSortByPosition(true);
        stripper.setParagraphEnd(System.lineSeparator());
        StringWriter writer = new StringWriter();
        stripper.writeText(document, writer);

        return writer.toString();
    }

    /**
     * Extract the year out of curString (if it is not yet defined)
     */
    private void extractYear() {
        if (year != null) {
            return;
        }

        Matcher m = YEAR_EXTRACT_PATTERN.matcher(curString);
        if (m.find()) {
            year = curString.substring(m.start(), m.end());
        }
    }
    /**
     * PDFTextStripper normally does NOT produce multiple empty lines
     * (except for some strange PDFs). These strange PDFs are handled here:
     * proceed to the next non-empty line.
     */
    private void proceedToNextNonEmptyLine() {
        while ((i < lines.length) && "".equals(lines[i].trim())) {
            i++;
        }
    }

    /**
     * Fill curString with lines until "" is found.
     * No trailing space is added.
     * i is advanced to the next non-empty line (ignoring white space).
     * <p>
     * Lines containing only white space are ignored,
     * but NOT considered as "".
     * <p>
     * Uses GLOBAL variables lines, curString, i.
     */
    private void fillCurStringWithNonEmptyLines() {
        // ensure that curString does not end with " "
        curString = curString.trim();
        while ((i < lines.length) && !"".equals(lines[i])) {
            String curLine = lines[i].trim();
            if (!"".equals(curLine)) {
                if (!curString.isEmpty()) {
                    // insert a separating space if necessary
                    curString = curString.concat(" ");
                }
                curString = curString.concat(lines[i]);
            }
            i++;
        }
        proceedToNextNonEmptyLine();
    }

    /**
     * Resets curString.
     * curString now contains the last block (until "" is reached).
     * No trailing space is added.
     * <p>
     * Invariant before/after: i points to the line before the last handled block.
     */
    private void readLastBlock() {
        while ((i >= 0) && "".equals(lines[i].trim())) {
            i--;
        }
        // i is now at the end of a block

        int end = i;

        // find the beginning
        while ((i >= 0) && !"".equals(lines[i])) {
            i--;
        }
        // i is now the line before the beginning of the block
        // this fulfills the invariant

        curString = "";
        for (int j = i + 1; j <= end; j++) {
            curString = curString.concat(lines[j].trim());
            if (j != end) {
                curString = curString.concat(" ");
            }
        }
    }

    @Override
    public String getName() {
        return "PDFcontent";
    }

    @Override
    public FileExtensions getExtensions() {
        return FileExtensions.PDF_CONTENT;
    }

    @Override
    public String getDescription() {
        return "PdfContentImporter parses data of the first page of the PDF and creates a BibTeX entry. Currently, Springer and IEEE formats are supported.";
    }
}