package org.jabref.logic.importer.fileformat;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.Importer;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.importer.fetcher.DoiFetcher;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.FileExtensions;
import org.jabref.logic.xmp.EncryptedPdfsNotSupportedException;
import org.jabref.logic.xmp.XMPUtil;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.BibtexEntryTypes;
import org.jabref.model.entry.EntryType;
import org.jabref.model.entry.FieldName;
import org.jabref.model.entry.identifier.DOI;
import com.google.common.base.Strings;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
/**
* PdfContentImporter parses data of the first page of the PDF and creates a BibTeX entry.
* <p>
* Currently, Springer and IEEE formats are supported.
* <p>
* Integrating XMP support is future work
*/
public class PdfContentImporter extends Importer {
// four consecutive digits; extractYear() takes the first match in curString as the year
private static final Pattern YEAR_EXTRACT_PATTERN = Pattern.compile("\\d{4}");
// preferences handed to the DoiFetcher when a DOI is found on the first page
private final ImportFormatPreferences importFormatPreferences;
// the text of the first page, split into lines
private String[] lines;
// current index in lines
private int i;
// text of the line/block currently being processed; read and written by the helper methods
private String curString;
// year found so far; extractYear() only assigns it while it is still null
private String year;
/**
* Creates an importer that stores the given preferences for later use.
*
* @param importFormatPreferences preferences passed to the DoiFetcher when a DOI is found in the PDF text
*/
public PdfContentImporter(ImportFormatPreferences importFormatPreferences) {
this.importFormatPreferences = importFormatPreferences;
}
/**
 * Trims the input and then cuts off every trailing character that is neither a letter
 * nor a closing parenthesis (asterisks, dots, digits, inner whitespace before them, ...).
 * <p>
 * TODO: Additionally replace multiple subsequent spaces by one space, which will cause a rename of this method
 * </p>
 *
 * @param input the text to clean up
 * @return the trimmed text ending in a letter or ')', or the empty string if no such character exists
 */
private static String removeNonLettersAtEnd(String input) {
    String trimmed = input.trim();
    // walk backwards to the position just behind the last character worth keeping
    int keep = trimmed.length();
    while (keep > 0) {
        char candidate = trimmed.charAt(keep - 1);
        if (Character.isLetter(candidate) || (candidate == ')')) {
            break;
        }
        keep--;
    }
    return trimmed.substring(0, keep);
}
/**
 * Converts a line of author/editor names as printed in a PDF into the BibTeX form
 * {@code "First Last and First Last"}.
 * <p>
 * Two input shapes are handled:
 * <ul>
 * <li>comma separated, e.g. {@code "Matthias Schrepfer1, Johannes Wolf1, Jan Mendling1, and Hajo A. Reijers2"}</li>
 * <li>space separated, where a token without a dot that follows a given name ends the name,
 * unless it starts with a lower-case letter ("van", "vom", ...)</li>
 * </ul>
 * "et al." is turned into {@code others}.
 *
 * @param names the raw names line
 * @return the names joined by " and "; the empty string if no name was found
 */
private static String streamlineNames(String names) {
    // TODO: replace with NormalizeNamesFormatter?!
    StringBuilder res = new StringBuilder();
    if (names.contains(",")) {
        for (String splitName : names.split(",")) {
            String curName = removeNonLettersAtEnd(splitName);
            if ("and".equals(curName) || curName.startsWith("and ")) {
                // skip a leading "and" between names; only a whole word counts,
                // so that a name merely beginning with "and" is left intact
                curName = curName.substring(3).trim();
            } else {
                int posAnd = curName.indexOf(" and ");
                if (posAnd >= 0) {
                    // two names in one chunk: the first one still carries trailing markers
                    appendName(res, removeNonLettersAtEnd(curName.substring(0, posAnd)));
                    curName = curName.substring(posAnd + 5).trim();
                }
            }
            // removeNonLettersAtEnd has already stripped the trailing dot of "et al."
            if ("et al".equalsIgnoreCase(curName) || "et al.".equalsIgnoreCase(curName)) {
                curName = "others";
            }
            appendName(res, curName);
        }
    } else {
        // assumption: names separated by space
        String[] tokens = names.split(" ");
        // true while the previous token was a first or a middle name
        boolean workedOnFirstOrMiddle = false;
        for (int idx = 0; idx < tokens.length; idx++) {
            String token = tokens[idx];
            if (token.isEmpty()) {
                // produced by repeated or leading spaces; not a name part
                continue;
            }
            if (workedOnFirstOrMiddle) {
                if (token.contains(".")) {
                    // we found a middle name
                    res.append(token).append(' ');
                } else {
                    // last name found
                    res.append(removeNonLettersAtEnd(token));
                    if (Character.isLowerCase(token.charAt(0))) {
                        // probably "van", "vom", ...: rely on these being written in lower case
                        // and do NOT finish the name
                        res.append(' ');
                    } else {
                        // finish this name
                        workedOnFirstOrMiddle = false;
                    }
                }
            } else if (!"and".equalsIgnoreCase(token)) {
                if (res.length() > 0) {
                    res.append(" and ");
                }
                boolean followedByAl = ((idx + 1) < tokens.length)
                        && ("al.".equalsIgnoreCase(tokens[idx + 1]) || "al".equalsIgnoreCase(tokens[idx + 1]));
                if ("et".equalsIgnoreCase(token) && followedByAl) {
                    res.append("others");
                    break;
                }
                res.append(token).append(' ');
                workedOnFirstOrMiddle = true;
            }
        }
    }
    return res.toString();
}

/**
 * Appends a name to the list built so far, inserting the " and " separator when needed.
 * Empty names are ignored.
 */
private static void appendName(StringBuilder target, String name) {
    if (name.isEmpty()) {
        return;
    }
    if (target.length() > 0) {
        target.append(" and ");
    }
    target.append(name);
}
/**
* Cleans up a title candidate by delegating to removeNonLettersAtEnd(String).
*
* @param title the raw title text
* @return the result of removeNonLettersAtEnd for the given title
*/
private static String streamlineTitle(String title) {
return removeNonLettersAtEnd(title);
}
/**
* Always answers false: this importer never claims a format based on reader content.
*
* @param reader must not be null
* @return always false
* @throws NullPointerException if reader is null
*/
@Override
public boolean isRecognizedFormat(BufferedReader reader) throws IOException {
Objects.requireNonNull(reader);
return false;
}
/**
 * Unsupported entry point: this importer only works on a file path,
 * see {@code importDatabase(Path filePath, Charset defaultEncoding)}.
 *
 * @param reader must not be null
 * @return never returns normally
 * @throws NullPointerException          if reader is null
 * @throws UnsupportedOperationException always, for any non-null reader
 */
@Override
public ParserResult importDatabase(BufferedReader reader) throws IOException {
    Objects.requireNonNull(reader);
    // note the separating space between the two sentences of the message
    throw new UnsupportedOperationException(
            "PdfContentImporter does not support importDatabase(BufferedReader reader). "
                    + "Instead use importDatabase(Path filePath, Charset defaultEncoding).");
}
/**
 * Parses the first page of the given PDF and derives a single BibTeX entry from it.
 * <p>
 * If a DOI is found in the text of the first page, the entry is fetched through
 * {@link DoiFetcher} and no further parsing takes place. Otherwise title, authors,
 * abstract, keywords and publication details are guessed from the line/block
 * structure of the page (Springer and IEEE layouts get special treatment).
 * <p>
 * The parsing state is kept in instance fields shared with the helper methods, so a
 * single instance must not run two imports at the same time.
 *
 * @param filePath        the PDF file to import
 * @param defaultEncoding not used by this importer
 * @return a result holding at most one entry; an error result if the PDF is encrypted,
 *         cannot be read, or the DOI lookup fails
 */
@Override
public ParserResult importDatabase(Path filePath, Charset defaultEncoding) {
    final ArrayList<BibEntry> result = new ArrayList<>(1);
    try (FileInputStream fileStream = new FileInputStream(filePath.toFile());
         PDDocument document = XMPUtil.loadWithAutomaticDecryption(fileStream)) {
        String firstPageContents = getFirstPageContents(document);

        Optional<DOI> doi = DOI.findInText(firstPageContents);
        if (doi.isPresent()) {
            ParserResult parserResult = new ParserResult(result);
            Optional<BibEntry> entry = new DoiFetcher(importFormatPreferences).performSearchById(doi.get().getDOI());
            entry.ifPresent(parserResult.getDatabase()::insertEntry);
            return parserResult;
        }

        // idea: split[] contains the different lines
        // blocks are separated by empty lines
        // treat each block
        // or do special treatment at authors (which are not broken)
        // therefore, we do a line-based and not a block-based splitting
        // i points to the current line
        // curString (mostly) contains the current block
        // the different lines are joined into one and thereby separated by " "
        lines = firstPageContents.split(System.lineSeparator());
        // Reset the shared parsing state: a previous import on this instance leaves
        // i at -1 (end of the backwards scan) and may have set year.
        i = 0;
        curString = "";
        year = null;

        proceedToNextNonEmptyLine();
        if (i >= lines.length) {
            // PDF could not be parsed or is empty
            // return empty list
            return new ParserResult();
        }

        // we start at the current line
        curString = lines[i];
        // i might get incremented later and curString modified, too
        i = i + 1;

        String author;
        String editor = null;
        String abstractT = null;
        String keywords = null;
        String title;
        String conference = null;
        String doiValue = null;
        String series = null;
        String volume = null;
        String number = null;
        String pages = null;
        // year is a class variable as the method extractYear() uses it;
        String publisher = null;

        EntryType type = BibtexEntryTypes.INPROCEEDINGS;
        if (curString.length() > 4) {
            // special case: possibly conference as first line on the page
            extractYear();
            if (curString.contains("Conference")) {
                fillCurStringWithNonEmptyLines();
                conference = curString;
                curString = "";
            } else {
                // e.g. Copyright (c) 1998 by the Genetics Society of America
                // future work: get year using RegEx
                String lower = curString.toLowerCase(Locale.ROOT);
                if (lower.contains("copyright")) {
                    fillCurStringWithNonEmptyLines();
                    publisher = curString;
                    curString = "";
                }
            }
        }

        // start: title
        fillCurStringWithNonEmptyLines();
        title = streamlineTitle(curString);
        curString = "";
        // i points to the next non-empty line

        // after title: authors
        author = null;
        while ((i < lines.length) && !"".equals(lines[i])) {
            // author names are unlikely to be lines among different lines
            // treat them line by line
            curString = streamlineNames(lines[i]);
            if (author == null) {
                author = curString;
            } else if (!"".equals(curString)) {
                // an "and"-only line yields "" from streamlineNames and is skipped
                author = author.concat(" and ").concat(curString);
            }
            i++;
        }
        curString = "";
        i++;

        // then, abstract and keywords follow
        while (i < lines.length) {
            curString = lines[i];
            if ((curString.length() >= "Abstract".length()) && "Abstract".equalsIgnoreCase(curString.substring(0, "Abstract".length()))) {
                if (curString.length() == "Abstract".length()) {
                    // only word "abstract" found -- skip line
                    curString = "";
                } else {
                    // +1 skips the character following the heading (e.g. ":" or " ")
                    curString = curString.substring("Abstract".length() + 1).trim().concat(System.lineSeparator());
                }
                i++;
                // fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line separator
                // whereas we need linebreak as separator
                while ((i < lines.length) && !"".equals(lines[i])) {
                    curString = curString.concat(lines[i]).concat(System.lineSeparator());
                    i++;
                }
                abstractT = curString.trim();
                i++;
            } else if ((curString.length() >= "Keywords".length()) && "Keywords".equalsIgnoreCase(curString.substring(0, "Keywords".length()))) {
                if (curString.length() == "Keywords".length()) {
                    // only word "Keywords" found -- skip line
                    curString = "";
                } else {
                    curString = curString.substring("Keywords".length() + 1).trim();
                }
                i++;
                fillCurStringWithNonEmptyLines();
                keywords = removeNonLettersAtEnd(curString);
            } else {
                String lower = curString.toLowerCase(Locale.ROOT);
                if (lower.contains("technical")) {
                    type = BibtexEntryTypes.TECHREPORT;
                    // search and cut on the same (trimmed) string so that surrounding
                    // white space can neither shift the index nor end up in the number
                    String trimmedLine = curString.trim();
                    int lastSpace = trimmedLine.lastIndexOf(' ');
                    if (lastSpace >= 0) {
                        number = trimmedLine.substring(lastSpace + 1);
                    }
                }
                i++;
                proceedToNextNonEmptyLine();
            }
        }

        i = lines.length - 1;

        // last block: DOI, detailed information
        // sometimes, this information is in the third last block etc...
        // therefore, read until the beginning of the file
        while (i >= 0) {
            readLastBlock();
            // i now points to the block before or is -1
            // curString contains the last block, separated by " "

            extractYear();

            int pos = curString.indexOf("(Eds.)");
            if ((pos >= 0) && (publisher == null)) {
                // looks like a Springer last line
                // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
                publisher = "Springer";
                if (pos > 0) {
                    // editors precede "(Eds.)", separated from it by one space
                    editor = streamlineNames(curString.substring(0, pos - 1));
                }
                // +2 because of ":" after (Eds.) and the subsequent space;
                // clamp in case the block ends right after "(Eds.)"
                int detailsStart = Math.min(pos + "(Eds.)".length() + 2, curString.length());
                curString = curString.substring(detailsStart);
                String[] springerSplit = curString.split(", ");
                if (springerSplit.length >= 4) {
                    conference = springerSplit[0];

                    String seriesData = springerSplit[1];
                    int lastSpace = seriesData.lastIndexOf(' ');
                    if (lastSpace >= 0) {
                        series = seriesData.substring(0, lastSpace);
                        volume = seriesData.substring(lastSpace + 1);
                    } else {
                        // no volume given, the whole token is the series
                        series = seriesData;
                    }

                    // strip the leading "pp. "
                    if (springerSplit[2].length() >= 4) {
                        pages = springerSplit[2].substring(4);
                    }

                    if (springerSplit[3].length() >= 4) {
                        year = springerSplit[3].substring(0, 4);
                    }
                }
            } else {
                if (doiValue == null) {
                    pos = curString.indexOf("DOI");
                    if (pos < 0) {
                        pos = curString.indexOf(FieldName.DOI);
                    }
                    if (pos >= 0) {
                        pos += 3;
                        // nothing to read if the marker is the very end of the block
                        if (pos < curString.length()) {
                            char delimiter = curString.charAt(pos);
                            if ((delimiter == ':') || (delimiter == ' ')) {
                                pos++;
                            }
                            int nextSpace = curString.indexOf(' ', pos);
                            if (nextSpace > 0) {
                                doiValue = curString.substring(pos, nextSpace);
                            } else {
                                doiValue = curString.substring(pos);
                            }
                        }
                    }
                }

                if ((publisher == null) && curString.contains("IEEE")) {
                    // IEEE has the conference things at the end
                    publisher = "IEEE";

                    // year is extracted by extractYear

                    if (conference == null) {
                        pos = curString.indexOf('$');
                        if (pos > 0) {
                            // we found the price
                            // before the price, the ISSN is stated
                            // skip that
                            pos -= 2;
                            while ((pos >= 0) && (curString.charAt(pos) != ' ')) {
                                pos--;
                            }
                            if (pos > 0) {
                                conference = curString.substring(0, pos);
                            }
                        }
                    }
                }
            }
        }

        BibEntry entry = new BibEntry();
        entry.setType(type);

        // TODO: institution parsing missing

        if (author != null) {
            entry.setField(FieldName.AUTHOR, author);
        }
        if (editor != null) {
            entry.setField(FieldName.EDITOR, editor);
        }
        if (abstractT != null) {
            entry.setField(FieldName.ABSTRACT, abstractT);
        }
        if (!Strings.isNullOrEmpty(keywords)) {
            entry.setField(FieldName.KEYWORDS, keywords);
        }
        if (title != null) {
            entry.setField(FieldName.TITLE, title);
        }
        if (conference != null) {
            entry.setField(FieldName.BOOKTITLE, conference);
        }
        if (doiValue != null) {
            entry.setField(FieldName.DOI, doiValue);
        }
        if (series != null) {
            entry.setField(FieldName.SERIES, series);
        }
        if (volume != null) {
            entry.setField(FieldName.VOLUME, volume);
        }
        if (number != null) {
            entry.setField(FieldName.NUMBER, number);
        }
        if (pages != null) {
            entry.setField(FieldName.PAGES, pages);
        }
        if (year != null) {
            entry.setField(FieldName.YEAR, year);
        }
        if (publisher != null) {
            entry.setField(FieldName.PUBLISHER, publisher);
        }

        result.add(entry);
    } catch (EncryptedPdfsNotSupportedException e) {
        return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported."));
    } catch (IOException exception) {
        return ParserResult.fromError(exception);
    } catch (FetcherException e) {
        return ParserResult.fromErrorMessage(e.getMessage());
    }
    return new ParserResult(result);
}
/**
 * Extracts the text of page 1 of the given document, sorted by position, using the
 * system line separator as paragraph end.
 *
 * @param document the opened PDF document
 * @return the text written by the stripper for the first page
 * @throws IOException if the text stripper fails
 */
private String getFirstPageContents(PDDocument document) throws IOException {
    PDFTextStripper textStripper = new PDFTextStripper();
    // configuration: position-sorted text, explicit paragraph end, page 1 only
    textStripper.setSortByPosition(true);
    textStripper.setParagraphEnd(System.lineSeparator());
    textStripper.setStartPage(1);
    textStripper.setEndPage(1);

    StringWriter firstPageText = new StringWriter();
    textStripper.writeText(document, firstPageText);
    return firstPageText.toString();
}
/**
 * Sets year to the first four-digit sequence in curString,
 * unless a year has already been found; without a match year stays unset.
 */
private void extractYear() {
    if (year == null) {
        Matcher yearMatcher = YEAR_EXTRACT_PATTERN.matcher(curString);
        if (yearMatcher.find()) {
            year = yearMatcher.group();
        }
    }
}
/**
 * Advances i past all lines that are empty or contain only white space.
 * <p>
 * PDFTextStripper normally does NOT produce multiple empty lines;
 * this covers the strange PDFs where it does.
 * Afterwards i either points to a non-blank line or equals lines.length.
 */
private void proceedToNextNonEmptyLine() {
    while (i < lines.length) {
        if (!lines[i].trim().isEmpty()) {
            return;
        }
        i++;
    }
}
/**
 * Appends the following lines to curString until an empty line ("") or the end of the
 * input is reached; i is then advanced to the next non-blank line.
 * <p>
 * curString is trimmed first. Lines are joined by a single space. Lines consisting only
 * of white space are skipped, but do NOT end the block. Note that a line is appended as
 * it is (not trimmed).
 * <p>
 * Uses the instance fields lines, curString and i
 */
private void fillCurStringWithNonEmptyLines() {
    // ensure that curString does not end with " "
    curString = curString.trim();
    for (; (i < lines.length) && !lines[i].isEmpty(); i++) {
        if (lines[i].trim().isEmpty()) {
            // white-space-only line: nothing to append
            continue;
        }
        // separating space is only needed when something precedes the line
        String separator = curString.isEmpty() ? "" : " ";
        curString = curString + separator + lines[i];
    }
    proceedToNextNonEmptyLine();
}
/**
 * Reads the block that ends at or before line i, scanning backwards.
 * <p>
 * curString is reset and then holds the trimmed lines of that block joined by a single
 * space (no trailing space). Blank lines at the end are skipped first; the block
 * itself extends back to the previous empty line ("").
 * <p>
 * invariant before/after: i points to the line before the last handled block (or is -1)
 */
private void readLastBlock() {
    // skip blank lines to find the last line of the block
    for (; (i >= 0) && lines[i].trim().isEmpty(); i--) {
        // nothing to do, only moving i
    }
    final int lastLine = i;

    // move i to the line before the first line of the block; this fulfills the invariant
    for (; (i >= 0) && !lines[i].isEmpty(); i--) {
        // nothing to do, only moving i
    }

    StringBuilder block = new StringBuilder();
    for (int j = i + 1; j <= lastLine; j++) {
        block.append(lines[j].trim());
        if (j < lastLine) {
            block.append(' ');
        }
    }
    curString = block.toString();
}
/**
* @return the name of this importer, "PDFcontent"
*/
@Override
public String getName() {
return "PDFcontent";
}
/**
* @return the file extensions constant for this importer, FileExtensions.PDF_CONTENT
*/
@Override
public FileExtensions getExtensions() {
return FileExtensions.PDF_CONTENT;
}
/**
* @return a one-paragraph English description of what this importer does
*/
@Override
public String getDescription() {
return "PdfContentImporter parses data of the first page of the PDF and creates a BibTeX entry. Currently, Springer and IEEE formats are supported.";
}
}