PdfContentImporter.java example

Explorer
jabref-2.9.2-master
- src
/*
PdfContentImporter is part of JabRef. 
Copyright (C) 2011 Oliver Kopp

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
or see http://www.gnu.org/licenses/gpl-2.0.html
*/

package net.sf.jabref.imports;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

import net.sf.jabref.BibtexEntry;
import net.sf.jabref.BibtexEntryType;
import net.sf.jabref.Globals;
import net.sf.jabref.OutputPrinter;
import net.sf.jabref.Util;

/**
 * PdfContentImporter parses data of the first page of the PDF and creates a BibTeX entry.
 * 
 * Currently, Springer and IEEE formats are supported.
 * 
 * Integrating XMP support is future work
 * 
 * The state of a parse run is kept in instance fields (split, i, curString, year).
 * Therefore, one instance must not run several imports at the same time.
 * 
 * @author koppor
 *
 */
public class PdfContentImporter extends ImportFormat {
	
	private static final Logger logger = Logger.getLogger(PdfContentImporter.class.getName());
	
	// we can store the DOItoBibTeXFetcher as single reference as the fetcher doesn't hold internal state
	private static final DOItoBibTeXFetcher doiToBibTeXFetcher = new DOItoBibTeXFetcher();

	// four subsequent digits are regarded as a year; compiled once instead of at each call
	private static final Pattern YEAR_PATTERN = Pattern.compile("\\d\\d\\d\\d");

	// marker following the editor names in the last line of Springer papers
	private static final String EDITORS_MARKER = "(Eds.)";

	/* global variables holding the state of the current parse run
	 * needed to be able to generate methods such as "fillCurStringWithNonEmptyLines"
	 */
	
	// input split into several lines
	private String[] split; 
	
	// current index in split
	private int i;
	
	// current "line" in split.
	// sometimes, a "line" is several lines in split
	private String curString;
	
	// year found so far; null as long as no year has been found
	private String year = null;

	/**
	 * This importer never claims a stream as its own format.
	 * 
	 * @param in the stream to check (not read)
	 * @return always false
	 */
	@Override
	public boolean isRecognizedFormat(InputStream in) throws IOException {
		return false;
	}

	/**
	 * Removes all non-letter characters at the end
	 * 
	 * EXCEPTION: a closing bracket is NOT removed
	 * 
	 * @param input the string to clean; it is trimmed first
	 * @return the trimmed input without trailing non-letter characters;
	 *         "" if nothing remains
	 * TODO Additionally replace multiple subsequent spaces by one space
	 */
	private String removeNonLettersAtEnd(String input) {
		input = input.trim();
		int end = input.length();
		while (end > 0) {
			char lastC = input.charAt(end - 1);
			if (Character.isLetter(lastC) || (lastC == ')')) {
				break;
			}
			// if there is an asterisk, a dot or something else at the end: remove it
			end--;
		}
		return input.substring(0, end);
	}

	/**
	 * Appends name to names. " and " is inserted as separator if names already
	 * contains a name. An empty name is ignored.
	 */
	private static void appendName(StringBuilder names, String name) {
		if (name.length() == 0) {
			return;
		}
		if (names.length() > 0) {
			names.append(" and ");
		}
		names.append(name);
	}
	
	/**
	 * Converts a line of names into the BibTeX form "name and name and ...".
	 * 
	 * If the line contains a ",", the names are assumed to be separated by ","
	 * (and possibly by "and"). Otherwise, they are assumed to be separated by
	 * spaces. "et al." is converted to "others".
	 * 
	 * @param names the line containing the names
	 * @return the names joined by " and "; "" if no name was found
	 */
	private String streamlineNames(String names) {
		StringBuilder res = new StringBuilder();
		// supported formats:
		//   Matthias Schrepfer1, Johannes Wolf1, Jan Mendling1, and Hajo A. Reijers2
		if (names.contains(",")) {
			String[] splitNames = names.split(",");
			for (int n = 0; n < splitNames.length; n++) {
				String curName = removeNonLettersAtEnd(splitNames[n]);
				if (curName.startsWith("and")) {
					// skip possible ands between names
					curName = curName.substring(3).trim();
				} else {
					int posAnd = curName.indexOf(" and ");
					if (posAnd >= 0) {
						// two names in one part, e.g. "Jan Mendling1 and Hajo A. Reijers2"
						// appendName takes care of the separator, even if this is the very first name
						appendName(res, removeNonLettersAtEnd(curName.substring(0, posAnd)));
						curName = curName.substring(posAnd + 5).trim();
					}
				}
				
				// removeNonLettersAtEnd has already stripped the trailing dot of "et al."
				// therefore, the variant without dot has to be checked, too
				if (curName.equalsIgnoreCase("et al") || curName.equalsIgnoreCase("et al.")) {
					curName = "others";
				}
				appendName(res, curName);
			}
			return res.toString();
		}

		// assumption: names separated by space
		String[] splitNames = names.split(" ");

		// true if the last handled item was a first or a middle name
		boolean workedOnFirstOrMiddle = false;
		for (int n = 0; n < splitNames.length; n++) {
			String token = splitNames[n];
			if (token.length() == 0) {
				// subsequent spaces lead to empty items; they carry no information
				continue;
			}
			if (!workedOnFirstOrMiddle) {
				if (token.equalsIgnoreCase("and")) {
					// "and" between two names: nothing to add
					continue;
				}
				if (res.length() > 0) {
					res.append(" and ");
				}
				if (token.equalsIgnoreCase("et") && (n + 1 < splitNames.length) && splitNames[n + 1].equalsIgnoreCase("al.")) {
					res.append("others");
					break;
				}
				res.append(token).append(' ');
				workedOnFirstOrMiddle = true;
			} else if (token.contains(".")) {
				// last item was a first or a middle name
				// we found a middle name
				res.append(token).append(' ');
			} else {
				// last name found
				res.append(removeNonLettersAtEnd(token));
				if (Character.isLowerCase(token.charAt(0))) {
					// it is probably "van", "vom", ...
					// we just rely on the fact that these things are written in lower case letters
					// do NOT finish name
					res.append(' ');
				} else {
					// finish this name
					workedOnFirstOrMiddle = false;
				}
			}
		}
		// a name without last name leaves a separating space at the end
		return res.toString().trim();
	}
	
	/**
	 * @param title the raw title
	 * @return the title without trailing non-letter characters
	 */
	private String streamlineTitle(String title) {
		return removeNonLettersAtEnd(title);
	}
	
	/**
	 * @param yearStr the string to check
	 * @return true if yearStr can be parsed as an integer
	 */
	private boolean isYear(String yearStr) {
		try {
			Integer.parseInt(yearStr);
			return true;
		} catch (NumberFormatException e) {
			return false;
		}
	}

	/**
	 * Extracts the text following "DOI" (or "doi") out of a block.
	 * A ":" and spaces directly after the marker are skipped.
	 * The DOI ends at the next space or at the end of the block.
	 * 
	 * @param block the block to search in
	 * @return the text following the marker; null if there is no marker
	 *         or nothing follows the marker
	 */
	private static String extractDoi(String block) {
		int pos = block.indexOf("DOI");
		if (pos < 0) {
			pos = block.indexOf("doi");
		}
		if (pos < 0) {
			return null;
		}
		pos += 3;
		if ((pos < block.length()) && (block.charAt(pos) == ':')) {
			pos++;
		}
		while ((pos < block.length()) && (block.charAt(pos) == ' ')) {
			pos++;
		}
		if (pos >= block.length()) {
			// the marker is the last thing in the block
			return null;
		}
		int nextSpace = block.indexOf(' ', pos);
		if (nextSpace > 0) {
			return block.substring(pos, nextSpace);
		}
		return block.substring(pos);
	}

	/**
	 * Sets the field at the entry if a value is available
	 */
	private static void setFieldIfPresent(BibtexEntry entry, String field, String value) {
		if (value != null) {
			entry.setField(field, value);
		}
	}
	
	/**
	 * Reads the text of the first page of the PDF and creates one entry out of it.
	 * 
	 * If a DOI is found in the text, the DOI fetcher is asked first. If it delivers
	 * entries, they are returned. Otherwise, the text is parsed: the first block(s)
	 * are treated as conference/copyright line, title and authors, followed by
	 * abstract and keywords. Finally, the blocks are read from the end of the page
	 * to find the Springer or IEEE footer and a DOI. The complete text of the page
	 * is stored in the field "review".
	 * 
	 * @param in the stream containing the PDF
	 * @param status used for showing a message if the Bouncy Castle library is missing
	 * @return the entries found; an empty list if the document could not be loaded,
	 *         if the page contains no text, or if a NoClassDefFoundError occurred
	 * @throws IOException if reading the text or closing the document fails
	 */
	@Override
	public List<BibtexEntry> importEntries(InputStream in, OutputPrinter status) throws IOException {
		final ArrayList<BibtexEntry> res = new ArrayList<BibtexEntry>(1);
		
		PDDocument document = null;
		try {
			document = PDDocument.load(in);
		} catch (IOException e) {
			logger.log(Level.SEVERE, "Could not load document", e);
			return res;
		}

		try {
			if (document.isEncrypted()) {
				logger.log(Level.INFO,
						Globals.lang("Encrypted documents are not supported"));
				//return res;
			}

			final String lineBreak = System.getProperty("line.separator");

			PDFTextStripper stripper = new PDFTextStripper();
			stripper.setStartPage(1);
			stripper.setEndPage(1);
			stripper.setSortByPosition(true);
			stripper.setParagraphEnd(lineBreak);
			StringWriter writer = new StringWriter();
			stripper.writeText(document, writer);
			String textResult = writer.toString();
			
			String doi = Util.getDOI(textResult);
			// NOTE(review): this check presumes that Util.getDOI returns its input
			// unchanged if no DOI is contained -- confirm
			if (doi.length() < textResult.length()) {
				// A DOI was found in the text
				// We do NO parsing of the text, but use the DOI fetcher
				
				ImportInspector inspector = new ImportInspector() {
					@Override
					public void toFront() {
					}
					@Override
					public void setProgress(int current, int max) {
					}
					@Override
					public void addEntry(BibtexEntry entry) {
						// add the entry to the result object
						res.add(entry);
					}
				};
				doiToBibTeXFetcher.processQuery(doi, inspector, status);
				if (res.size() != 0) {
					// if something has been found, return the result
					return res;
				}
				// otherwise, we just parse the PDF
			}

			String author = null;
			String editor = null;
			String institution = null; // not yet determined by the parser
			String abstractT = null;
			String keywords = null;
			String title = null;
			String conference = null;
			String parsedDoi = null;
			String series = null;
			String volume = null;
			String number = null;
			String pages = null;
			// year is a class variable as the method extractYear() uses it;
			String publisher = null;
			BibtexEntryType type = BibtexEntryType.INPROCEEDINGS; 
			
			split = textResult.split(lineBreak);

			// start with a fresh state: a previous run leaves i at -1 and its year set
			i = 0;
			year = null;
			curString = "";
			
			// idea: split[] contains the different lines
			// blocks are separated by empty lines
			// treat each block
			//   or do special treatment at authors (which are not broken)
			//   therefore, we do a line-based and not a block-based splitting
			// i points to the current line
			// curString (mostly) contains the current block
			//   the different lines are joined into one and thereby separated by " "
			
			proceedToNextNonEmptyLine();
			if (i>=split.length) {
				// PDF could not be parsed or is empty
				// return empty list
				return res;
			}
			curString = split[i];
			i = i+1;
			
			if (curString.length()>4) {
				// special case: possibly conference as first line on the page
				extractYear();
				if (curString.contains("Conference")) {
					fillCurStringWithNonEmptyLines();
					conference = curString;
					curString = "";
				} else {
					// e.g. Copyright (c) 1998 by the Genetics Society of America
					// future work: get year using RegEx
					String lower = curString.toLowerCase();
					if (lower.contains("copyright")) {
						fillCurStringWithNonEmptyLines();
						publisher = curString;
						curString = "";
					}
				}
			}
			
			// start: title
			fillCurStringWithNonEmptyLines();
			title = streamlineTitle(curString);
			curString = "";
			//i points to the next non-empty line
			
			// after title: authors
			while ((i<split.length)  && (!split[i].equals(""))) {
				// author names are unlikely to be split among different lines
				// treat them line by line
				curString = streamlineNames(split[i]);
				if (author==null) {
					author = curString;
				} else if (!curString.equals("")) {
					// if split[i] is "and" then "" is returned by streamlineNames -> nothing to add
					author = author.concat(" and ").concat(curString);
				}
				i++;
			}			
			curString = "";
			i++;
			
			// then, abstract and keywords follow
			while (i<split.length) {
				curString = split[i];
				if ((curString.length()>="Abstract".length()) && (curString.substring(0, "Abstract".length()).equalsIgnoreCase("Abstract"))) {
					if (curString.length() == "Abstract".length()) {
						// only word "abstract" found -- skip line
						curString = "";
					} else {
						// +1 skips the character following "Abstract" (e.g., ":" or "." or " ")
						curString = curString.substring("Abstract".length()+1).trim().concat(lineBreak);
					}
					i++;
					// fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line separator
					// whereas we need linebreak as separator
					while ((i<split.length)  && (!split[i].equals(""))) {
						curString = curString.concat(split[i]).concat(lineBreak);
						i++;
					}
					abstractT=curString;
					i++;
				} else if ((curString.length()>="Keywords".length()) && (curString.substring(0, "Keywords".length()).equalsIgnoreCase("Keywords"))) {
					if (curString.length() == "Keywords".length()) {
						// only word "Keywords" found -- skip line
						curString = "";
					} else {
						curString = curString.substring("Keywords".length()+1).trim();
					}
					i++;
					fillCurStringWithNonEmptyLines();
					keywords=removeNonLettersAtEnd(curString);
				} else {
					String lower = curString.toLowerCase();
					
					if (lower.indexOf("technical")>=0) {
						type = BibtexEntryType.TECHREPORT;
						// the last word of the line is regarded as number of the report
						// search and cut on the trimmed line so that the index fits the string
						String trimmed = curString.trim();
						int pos = trimmed.lastIndexOf(' ');
						if (pos>=0) {
							number = trimmed.substring(pos+1);
						}
					}
					
					i++;
					proceedToNextNonEmptyLine();
				}
			}
			
			i = split.length-1;
			
			// last block: DOI, detailed information
			// sometimes, this information is in the third last block etc...
			// therefore, read until the beginning of the file 
			
			while (i>=0) {
				readLastBlock();
				// i now points to the block before or is -1
				// curString contains the last block, separated by " "
			
				extractYear();
				
				int pos = curString.indexOf(EDITORS_MARKER);
				if ((pos >= 0) && (publisher == null)) {
					// looks like a Springer last line
					// e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
					publisher = "Springer";
					String editors = streamlineNames(curString.substring(0, pos).trim());
					if (editors.length() > 0) {
						editor = editors;
					}
					//+2 because of ":" after (Eds.) and the subsequent space
					int restStart = pos + EDITORS_MARKER.length() + 2;
					if (restStart <= curString.length()) {
						curString = curString.substring(restStart);
					} else {
						// nothing follows the marker
						curString = "";
					}
					String[] springerSplit = curString.split(", ");
					if (springerSplit.length >= 4) {
						conference = springerSplit[0];

						// e.g. "LNBIP 39": series and volume are separated by the last space
						String seriesData = springerSplit[1];
						int lastSpace = seriesData.lastIndexOf(' ');
						if (lastSpace >= 0) {
							series = seriesData.substring(0, lastSpace);
							volume = seriesData.substring(lastSpace + 1);
						} else {
							// no volume given
							series = seriesData;
						}
						
						// skip the leading "pp. "
						if (springerSplit[2].length() >= 4) {
							pages = springerSplit[2].substring(4);
						}
						
						if (springerSplit[3].length()>=4) {
							year = springerSplit[3].substring(0,4);
						}
					}
				} else {
					if (parsedDoi==null) {
						parsedDoi = extractDoi(curString);
					}
					
					if ((publisher==null) && (curString.indexOf("IEEE")>=0)) {
						// IEEE has the conference things at the end
						publisher = "IEEE";

						// year is extracted by extractYear
						
						if (conference == null) {
							pos = curString.indexOf('$');
							if (pos>0) {
								// we found the price
								// before the price, the ISSN is stated
								// skip that
								pos -= 2;
								while ((pos>=0) && (curString.charAt(pos) != ' '))
									pos--;
								if (pos>0) {
									conference = curString.substring(0,pos);
								}
							}
						}
					}
				}
			}

			BibtexEntry entry = new BibtexEntry();
			entry.setType(type);
			
			setFieldIfPresent(entry, "author", author);
			setFieldIfPresent(entry, "editor", editor);
			setFieldIfPresent(entry, "institution", institution);
			setFieldIfPresent(entry, "abstract", abstractT);
			setFieldIfPresent(entry, "keywords", keywords);
			setFieldIfPresent(entry, "title", title);
			setFieldIfPresent(entry, "booktitle", conference);
			setFieldIfPresent(entry, "doi", parsedDoi);
			setFieldIfPresent(entry, "series", series);
			setFieldIfPresent(entry, "volume", volume);
			setFieldIfPresent(entry, "number", number);
			setFieldIfPresent(entry, "pages", pages);
			setFieldIfPresent(entry, "year", year);
			setFieldIfPresent(entry, "publisher", publisher);
			
			entry.setField("review", textResult);

			res.add(entry);
		} catch (NoClassDefFoundError e) {
			// the message may be null; therefore, the constant is the receiver of equals
			if ("org/bouncycastle/jce/provider/BouncyCastleProvider".equals(e.getMessage())) {
				status.showMessage(Globals.lang("Java Bouncy Castle library not found. Please download and install it. For more information see http://www.bouncycastle.org/."));
			} else {
				logger.log(Level.SEVERE, e.getLocalizedMessage(), e);
			}
		} finally {
			document.close();
		}
		
		return res;
	}

	
	
	/**
	 * Extract the year out of curString (if it is not yet defined)
	 * 
	 * The first four subsequent digits in curString are taken as year.
	 */
	private void extractYear() {
		if (year != null)
			return;
		
		Matcher m = YEAR_PATTERN.matcher(curString);
		if (m.find()) {
			year = m.group();
		}
	}

	/**
	 * PDFTextStripper normally does NOT produce multiple empty lines 
	 * (besides at strange PDFs). These strange PDFs are handled here:
	 * proceed to next non-empty line
	 * 
	 * Lines containing only white space are skipped, too.
	 * Afterwards, i is split.length if there is no such line.
	 */
	private void proceedToNextNonEmptyLine() {
		while ((i<split.length) && (split[i].trim().equals(""))) {
			i++;
		}
	}
	

	/**
	 * Fill curString with lines until "" is found
	 * No trailing space is added
	 * i is advanced to the next non-empty line (ignoring white space)
	 * 
	 * Lines containing only white spaces are ignored,
	 * but NOT considered as ""
	 * 
	 * Uses GLOBAL variables split, curString, i
	 */
	private void fillCurStringWithNonEmptyLines() {
		// ensure that curString does not end with " "
		StringBuilder block = new StringBuilder(curString.trim());
		while ((i<split.length)  && (!split[i].equals(""))) {
			String curLine = split[i].trim();
			if (!curLine.equals("")) {
				if (block.length()>0) {
					// insert separating space if necessary
					block.append(' ');
				}
				// the trimmed line is added so that no white space of the line's border gets into the block
				block.append(curLine);
			}
			i++;
		}
		curString = block.toString();

		proceedToNextNonEmptyLine();
	}

	/**
	 * resets curString
	 * curString now contains the last block (until "" reached)
	 * The lines of the block are trimmed and separated by " ";
	 * no trailing space is added
	 * 
	 * invariant before/after: i points to line before the last handled block
	 */
	private void readLastBlock() {
		while ((i>=0) && (split[i].trim().equals(""))) {
			i--;
		}
		// i is now at the end of a block
		
		int end = i;
		
		// find beginning
		while ((i>=0) && (!split[i].equals(""))) {
			i--;
		}
		// i is now the line before the beginning of the block
		// this fulfills the invariant
		
		StringBuilder block = new StringBuilder();
		for (int j = i+1; j<=end; j++) {
			block.append(split[j].trim());
			if (j!=end) {
				block.append(' ');
			}
		}
		curString = block.toString();
	}

	/**
	 * @return the name of this import format
	 */
	@Override
	public String getFormatName() {
		return "PDFcontent";
	}

}