/*  PdfContentImporter is part of JabRef.
    Copyright (C) 2011 Oliver Kopp

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
    or see http://www.gnu.org/licenses/gpl-2.0.html
*/
package net.sf.jabref.imports;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

import net.sf.jabref.BibtexEntry;
import net.sf.jabref.BibtexEntryType;
import net.sf.jabref.Globals;
import net.sf.jabref.OutputPrinter;
import net.sf.jabref.Util;

/**
 * PdfContentImporter parses data of the first page of the PDF and creates a BibTeX entry.
 *
 * Currently, Springer and IEEE formats are supported.
 *
 * Integrating XMP support is future work
 *
 * The state of a parse run is kept in instance fields, so a single instance
 * must not run several imports concurrently. The state is reset at the
 * beginning of each {@link #importEntries(InputStream, OutputPrinter)} call.
 *
 * @author koppor
 *
 */
public class PdfContentImporter extends ImportFormat {

    private static final Logger logger = Logger.getLogger(PdfContentImporter.class.getName());

    // we can store the DOItoBibTeXFetcher as single reference as the fetcher doesn't hold internal state
    private static final DOItoBibTeXFetcher doiToBibTeXFetcher = new DOItoBibTeXFetcher();

    // four consecutive digits are treated as a year; compiled once instead of on each call
    private static final Pattern YEAR_PATTERN = Pattern.compile("\\d\\d\\d\\d");

    private static final String ABSTRACT = "Abstract";
    private static final String KEYWORDS = "Keywords";
    private static final String EDS = "(Eds.)";

    /* global variables holding the state of the current parse run
     * needed to be able to generate methods such as "fillCurStringWithNonEmptyLines"
     */

    // input split into several lines
    private String[] split;

    // current index in split
    private int i;

    // current "line" in split.
    // sometimes, a "line" is several lines in split
    private String curString;

    private String year = null;


    @Override
    public boolean isRecognizedFormat(InputStream in) throws IOException {
        return false;
    }

    /**
     * Removes all non-letter characters at the end
     *
     * EXCEPTION: a closing bracket is NOT removed
     *
     * @param input the string to clean; it is trimmed first
     * @return the trimmed input without trailing non-letter characters
     *         (possibly the empty string)
     * TODO Additionally replace multiple subsequent spaces by one space
     */
    private String removeNonLettersAtEnd(String input) {
        input = input.trim();
        int end = input.length();
        while (end > 0) {
            char lastC = input.charAt(end - 1);
            if (Character.isLetter(lastC) || (lastC == ')')) {
                break;
            }
            // if there is an asterisk, a dot or something else at the end: remove it
            end--;
        }
        return input.substring(0, end);
    }

    /**
     * Appends a name to a BibTeX name list, inserting " and " as separator
     * if the list already contains something. Empty names are skipped.
     */
    private static void appendName(StringBuilder nameList, String name) {
        if (name.length() == 0) {
            return;
        }
        if (nameList.length() > 0) {
            nameList.append(" and ");
        }
        nameList.append(name);
    }

    /**
     * Converts a line of author/editor names into a BibTeX name list
     * (names separated by " and ").
     *
     * Two input formats are handled: names separated by "," (optionally with
     * "and" before the last name) and names separated by spaces only.
     *
     * @param names the raw line containing the names
     * @return the names joined by " and "; "et al." is turned into "others"
     */
    private String streamlineNames(String names) {
        // supported formats:
        //   Matthias Schrepfer1, Johannes Wolf1, Jan Mendling1, and Hajo A. Reijers2
        if (names.contains(",")) {
            StringBuilder joined = new StringBuilder();
            String[] splitNames = names.split(",");
            for (int n = 0; n < splitNames.length; n++) {
                String curName = removeNonLettersAtEnd(splitNames[n]);
                if (curName.startsWith("and")) {
                    // skip possible ands between names
                    curName = curName.substring(3).trim();
                } else {
                    int posAnd = curName.indexOf(" and ");
                    if (posAnd >= 0) {
                        // two names in one segment: add the first one now.
                        // appendName only inserts the separator if a name precedes,
                        // so this also works if this is the very first segment
                        String nameBefore = curName.substring(0, posAnd);
                        appendName(joined, removeNonLettersAtEnd(nameBefore));
                        curName = curName.substring(posAnd + 5);
                    }
                }

                if (!curName.equals("")) {
                    // removeNonLettersAtEnd strips the trailing dot,
                    // therefore "et al" has to be recognized, too
                    if (curName.equalsIgnoreCase("et al.") || curName.equalsIgnoreCase("et al")) {
                        curName = "others";
                    }
                    appendName(joined, curName);
                }
            }
            return joined.toString();
        }

        // assumption: names separated by space
        String[] splitNames = names.split(" ");
        if (splitNames.length == 0) {
            // empty names... something was really wrong...
            return "";
        }

        StringBuilder res = new StringBuilder();
        boolean workedOnFirstOrMiddle = false;
        boolean isFirst = true;
        int n = 0;
        do {
            String token = splitNames[n];
            if (!workedOnFirstOrMiddle) {
                if (token.equalsIgnoreCase("and")) {
                    // do nothing, just increment n at the end of this iteration
                } else {
                    if (isFirst) {
                        isFirst = false;
                    } else {
                        res.append(" and ");
                    }
                    if (token.equalsIgnoreCase("et")
                            && (splitNames.length > n + 1)
                            && splitNames[n + 1].equalsIgnoreCase("al.")) {
                        res.append("others");
                        break;
                    } else {
                        res.append(token).append(" ");
                        workedOnFirstOrMiddle = true;
                    }
                }
            } else {
                // last item was a first or a middle name
                // we have to check whether we are on a middle name
                // if not, just add the item as last name and add an "and"
                if (token.contains(".")) {
                    // we found a middle name
                    res.append(token).append(" ");
                } else {
                    // last name found
                    res.append(removeNonLettersAtEnd(token));

                    if ((token.length() > 0) && Character.isLowerCase(token.charAt(0))) {
                        // it is probably "van", "vom", ...
                        // we just rely on the fact that these things are written in lower case letters
                        // do NOT finish name
                        res.append(" ");
                    } else {
                        // finish this name
                        workedOnFirstOrMiddle = false;
                    }
                }
            }
            n++;
        } while (n < splitNames.length);
        return res.toString();
    }

    /**
     * Cleans up a title by removing trailing non-letter characters.
     *
     * @param title the raw title
     * @return the cleaned title
     */
    private String streamlineTitle(String title) {
        return removeNonLettersAtEnd(title);
    }

    /**
     * @param yearStr the string to check
     * @return true if yearStr can be parsed as an integer
     */
    private boolean isYear(String yearStr) {
        try {
            Integer.parseInt(yearStr);
            return true;
        } catch (NumberFormatException e) {
            return false;
        }
    }

    /**
     * @return true if line starts with prefix, ignoring the case
     */
    private static boolean startsWithIgnoreCase(String line, String prefix) {
        return line.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /**
     * Reads the first page of the PDF and creates one BibTeX entry from it.
     *
     * If a DOI is found in the text, the DOI fetcher is asked first. Only if
     * it does not deliver an entry, the text itself is parsed.
     *
     * @param in the PDF to read
     * @param status used to report problems to the user
     * @return a list with the created entry; the list is empty if the PDF
     *         could not be loaded or does not contain text on the first page
     */
    @Override
    public List<BibtexEntry> importEntries(InputStream in, OutputPrinter status) throws IOException {
        final ArrayList<BibtexEntry> res = new ArrayList<BibtexEntry>(1);

        // reset the state of a possible previous run: otherwise i is still -1
        // and year still contains the year of the previously imported PDF
        split = new String[0];
        i = 0;
        curString = "";
        year = null;

        PDDocument document = null;
        try {
            document = PDDocument.load(in);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Could not load document", e);
            return res;
        }

        try {
            if (document.isEncrypted()) {
                logger.log(Level.INFO, Globals.lang("Encrypted documents are not supported"));
                //return res;
            }

            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setStartPage(1);
            stripper.setEndPage(1);
            stripper.setSortByPosition(true);
            stripper.setParagraphEnd(System.getProperty("line.separator"));
            StringWriter writer = new StringWriter();
            stripper.writeText(document, writer);
            String textResult = writer.toString();

            String doi = Util.getDOI(textResult);
            if (doi.length() < textResult.length()) {
                // A DOI was found in the text
                // We do NO parsing of the text, but use the DOI fetcher

                ImportInspector inspector = new ImportInspector() {
                    @Override
                    public void toFront() {
                    }

                    @Override
                    public void setProgress(int current, int max) {
                    }

                    @Override
                    public void addEntry(BibtexEntry entry) {
                        // add the entry to the result object
                        res.add(entry);
                    }
                };

                doiToBibTeXFetcher.processQuery(doi, inspector, status);
                if (res.size() != 0) {
                    // if something has been found, return the result
                    return res;
                }
                // otherwise, we just parse the PDF
            }

            String author = null;
            String editor = null;
            // the institution is not extracted yet
            String institution = null;
            String abstractT = null;
            String keywords = null;
            String title = null;
            String conference = null;
            String DOI = null;
            String series = null;
            String volume = null;
            String number = null;
            String pages = null;
            // year is a class variable as the method extractYear() uses it;
            String publisher = null;
            BibtexEntryType type = BibtexEntryType.INPROCEEDINGS;

            final String lineBreak = System.getProperty("line.separator");

            split = textResult.split(lineBreak);

            // idea: split[] contains the different lines
            // blocks are separated by empty lines
            // treat each block
            //   or do special treatment at authors (which are not broken)
            //   therefore, we do a line-based and not a block-based splitting
            // i points to the current line
            // curString (mostly) contains the current block
            //   the different lines are joined into one and thereby separated by " "

            proceedToNextNonEmptyLine();
            if (i >= split.length) {
                // PDF could not be parsed or is empty
                // return empty list
                return res;
            }
            curString = split[i];
            i = i + 1;

            if (curString.length() > 4) {
                // special case: possibly conference as first line on the page
                extractYear();
                if (curString.contains("Conference")) {
                    fillCurStringWithNonEmptyLines();
                    conference = curString;
                    curString = "";
                } else {
                    // e.g. Copyright (c) 1998 by the Genetics Society of America
                    // future work: get year using RegEx
                    String lower = curString.toLowerCase(Locale.ENGLISH);
                    if (lower.contains("copyright")) {
                        fillCurStringWithNonEmptyLines();
                        publisher = curString;
                        curString = "";
                    }
                }
            }

            // start: title
            fillCurStringWithNonEmptyLines();
            title = streamlineTitle(curString);
            curString = "";
            //i points to the next non-empty line

            // after title: authors
            author = null;
            while ((i < split.length) && (!split[i].equals(""))) {
                // author names are unlikely to be split among different lines
                // treat them line by line
                curString = streamlineNames(split[i]);
                if (author == null) {
                    author = curString;
                } else {
                    if (curString.equals("")) {
                        // if split[i] is "and" then "" is returned by streamlineNames -> do nothing
                    } else {
                        author = author.concat(" and ").concat(curString);
                    }
                }
                i++;
            }
            curString = "";
            i++;

            // then, abstract and keywords follow
            while (i < split.length) {
                curString = split[i];
                if (startsWithIgnoreCase(curString, ABSTRACT)) {
                    if (curString.length() == ABSTRACT.length()) {
                        // only word "abstract" found -- skip line
                        curString = "";
                    } else {
                        // +1 skips the character following "Abstract" (e.g., "." or ":")
                        curString = curString.substring(ABSTRACT.length() + 1).trim().concat(lineBreak);
                    }
                    i++;
                    // fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line separator
                    // whereas we need linebreak as separator
                    while ((i < split.length) && (!split[i].equals(""))) {
                        curString = curString.concat(split[i]).concat(lineBreak);
                        i++;
                    }
                    abstractT = curString;
                    i++;
                } else if (startsWithIgnoreCase(curString, KEYWORDS)) {
                    if (curString.length() == KEYWORDS.length()) {
                        // only word "Keywords" found -- skip line
                        curString = "";
                    } else {
                        // +1 skips the character following "Keywords" (e.g., ":")
                        curString = curString.substring(KEYWORDS.length() + 1).trim();
                    }
                    i++;
                    fillCurStringWithNonEmptyLines();
                    keywords = removeNonLettersAtEnd(curString);
                } else {
                    String lower = curString.toLowerCase(Locale.ENGLISH);

                    if (lower.indexOf("technical") >= 0) {
                        type = BibtexEntryType.TECHREPORT;
                        // the last word of the line is taken as report number.
                        // position and substring both refer to the trimmed line
                        String trimmed = curString.trim();
                        int pos = trimmed.lastIndexOf(' ');
                        if (pos >= 0) {
                            number = trimmed.substring(pos + 1);
                        }
                    }

                    i++;
                    proceedToNextNonEmptyLine();
                }
            }

            i = split.length - 1;

            // last block: DOI, detailed information
            // sometimes, this information is in the third last block etc...
            // therefore, read until the beginning of the file

            while (i >= 0) {
                readLastBlock();
                // i now points to the block before or is -1
                // curString contains the last block, separated by " "

                extractYear();

                int pos = curString.indexOf(EDS);
                if ((pos >= 0) && (publisher == null)) {
                    // looks like a Springer last line
                    // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
                    publisher = "Springer";
                    // -1 because of the space before "(Eds.)"; guarded for "(Eds.)" at the very beginning
                    editor = streamlineNames(curString.substring(0, Math.max(0, pos - 1)));
                    // +2 because of ":" after (Eds.) and the subsequent space;
                    // guarded for a block ending directly after "(Eds.)"
                    int restStart = Math.min(curString.length(), pos + EDS.length() + 2);
                    curString = curString.substring(restStart);
                    String[] springerSplit = curString.split(", ");
                    if (springerSplit.length >= 4) {
                        conference = springerSplit[0];

                        String seriesData = springerSplit[1];
                        int lastSpace = seriesData.lastIndexOf(' ');
                        if (lastSpace >= 0) {
                            series = seriesData.substring(0, lastSpace);
                            volume = seriesData.substring(lastSpace + 1);
                        } else {
                            // no volume recognizable
                            series = seriesData;
                        }

                        // skip the leading "pp. "
                        String pagesData = springerSplit[2];
                        if (pagesData.length() >= 4) {
                            pages = pagesData.substring(4);
                        }

                        if (springerSplit[3].length() >= 4) {
                            year = springerSplit[3].substring(0, 4);
                        }
                    }
                } else {
                    if (DOI == null) {
                        pos = curString.indexOf("DOI");
                        if (pos < 0) {
                            pos = curString.indexOf("doi");
                        }
                        if (pos >= 0) {
                            pos += 3;
                            final int len = curString.length();
                            // skip an optional ":" and the spaces before the DOI itself
                            if ((pos < len) && (curString.charAt(pos) == ':')) {
                                pos++;
                            }
                            while ((pos < len) && (curString.charAt(pos) == ' ')) {
                                pos++;
                            }
                            int nextSpace = curString.indexOf(' ', pos);
                            String candidate;
                            if (nextSpace > 0) {
                                candidate = curString.substring(pos, nextSpace);
                            } else {
                                candidate = curString.substring(pos);
                            }
                            if (candidate.length() > 0) {
                                // an empty candidate is ignored so that another block may provide the DOI
                                DOI = candidate;
                            }
                        }
                    }

                    if ((publisher == null) && (curString.indexOf("IEEE") >= 0)) {
                        // IEEE has the conference things at the end
                        publisher = "IEEE";

                        // year is extracted by extractYear

                        if (conference == null) {
                            pos = curString.indexOf('$');
                            if (pos > 0) {
                                // we found the price
                                // before the price, the ISSN is stated
                                // skip that
                                pos -= 2;
                                while ((pos >= 0) && (curString.charAt(pos) != ' ')) {
                                    pos--;
                                }
                                if (pos > 0) {
                                    conference = curString.substring(0, pos);
                                }
                            }
                        }
                    }
                }
            }

            BibtexEntry entry = new BibtexEntry();
            entry.setType(type);

            if (author != null) entry.setField("author", author);
            if (editor != null) entry.setField("editor", editor);
            if (institution != null) entry.setField("institution", institution);
            if (abstractT != null) entry.setField("abstract", abstractT);
            if (keywords != null) entry.setField("keywords", keywords);
            if (title != null) entry.setField("title", title);
            if (conference != null) entry.setField("booktitle", conference);
            if (DOI != null) entry.setField("doi", DOI);
            if (series != null) entry.setField("series", series);
            if (volume != null) entry.setField("volume", volume);
            if (number != null) entry.setField("number", number);
            if (pages != null) entry.setField("pages", pages);
            if (year != null) entry.setField("year", year);
            if (publisher != null) entry.setField("publisher", publisher);

            entry.setField("review", textResult);

            res.add(entry);
        } catch (NoClassDefFoundError e) {
            // the constant is used as receiver as getMessage() may return null
            if ("org/bouncycastle/jce/provider/BouncyCastleProvider".equals(e.getMessage())) {
                status.showMessage(Globals.lang("Java Bouncy Castle library not found. Please download and install it. For more information see http://www.bouncycastle.org/."));
            } else {
                logger.log(Level.SEVERE, e.getLocalizedMessage(), e);
            }
        } finally {
            document.close();
        }

        return res;
    }

    /**
     * Extract the year out of curString (if it is not yet defined)
     *
     * The first four consecutive digits found in curString are used.
     */
    private void extractYear() {
        if (year != null) {
            return;
        }

        Matcher m = YEAR_PATTERN.matcher(curString);
        if (m.find()) {
            year = m.group();
        }
    }

    /**
     * PDFTextStripper normally does NOT produce multiple empty lines
     * (besides at strange PDFs). These strange PDFs are handled here:
     * proceed to next non-empty line
     */
    private void proceedToNextNonEmptyLine() {
        while ((i < split.length) && (split[i].trim().equals(""))) {
            i++;
        }
    }

    /**
     * Fill curString with lines until "" is found
     * No trailing space is added
     * i is advanced to the next non-empty line (ignoring white space)
     *
     * Lines containing only white spaces are ignored,
     * but NOT considered as ""
     *
     * Uses GLOBAL variables split, curString, i
     */
    private void fillCurStringWithNonEmptyLines() {
        // ensure that curString does not end with " "
        StringBuilder block = new StringBuilder(curString.trim());
        while ((i < split.length) && (!split[i].equals(""))) {
            String curLine = split[i].trim();
            if (!curLine.equals("")) {
                if (block.length() > 0) {
                    // insert separating space if necessary
                    block.append(" ");
                }
                // the trimmed line is added to ensure that no trailing space is added
                block.append(curLine);
            }
            i++;
        }
        curString = block.toString();

        proceedToNextNonEmptyLine();
    }

    /**
     * resets curString
     * curString now contains the last block (until "" reached)
     * The lines of the block are trimmed and joined by " "
     *
     * invariant before/after: i points to line before the last handled block
     */
    private void readLastBlock() {
        while ((i >= 0) && (split[i].trim().equals(""))) {
            i--;
        }
        // i is now at the end of a block

        int end = i;

        // find beginning
        while ((i >= 0) && (!split[i].equals(""))) {
            i--;
        }
        // i is now the line before the beginning of the block
        // this fulfills the invariant

        StringBuilder block = new StringBuilder();
        for (int j = i + 1; j <= end; j++) {
            block.append(split[j].trim());
            if (j != end) {
                block.append(" ");
            }
        }
        curString = block.toString();
    }

    @Override
    public String getFormatName() {
        return "PDFcontent";
    }

}