// CsaImporter.java example
//
// Explorer
// Desktop-master
package net.sf.jabref.imports;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.sf.jabref.BibtexEntry;
import net.sf.jabref.BibtexFields;
import net.sf.jabref.Globals;


/**
 * Importer for records downloaded from CSA: Cambridge Scientific Abstracts
 * in full text format.  Although the same basic format is used by all CSA
 * databases, this importer has been tailored and tested to handle
 * ASFA: Aquatic Sciences and Fisheries records.
 *
 * <p>As parsed here, a record is a run of fields terminated by an empty
 * line (or end of input).  Each field starts with a header line of the form
 * {@code "XX: Long Name"}; the lines that follow and begin with four spaces
 * are its contents.  The first field of every record must be
 * {@code "DN: Database Name"}.
 *
 * @author John Relph
 */
public class CsaImporter extends ImportFormat {

    // local fields
    // Line counter used for error messages; reset by importEntries().
    private int line;

    // pre-compiled patterns
    private final static Pattern FIELD_PATTERN =
        Pattern.compile("^([A-Z][A-Z]): ([A-Z].*)$");
    private final static Pattern VOLNOPP_PATTERN =
        Pattern.compile("[;,\\.]\\s+(\\d+[A-Za-z]?)\\((\\d+(?:-\\d+)?)\\)(?:,\\s+|:)(\\d+-\\d+)");
    private final static Pattern PAGES_PATTERN =
        Pattern.compile("[;,\\.]\\s+(?:(\\[?[vn]\\.?p\\.?\\]?)|(?:pp?\\.?\\s+)(\\d+[A-Z]?(?:-\\d+[A-Z]?)?)|(\\d+[A-Z]?(?:-\\d+[A-Z]?)?)(?:\\s+pp?))");
    private final static Pattern VOLUME_PATTERN =
        Pattern.compile("[;,\\.]?\\s+[vV][oO][lL]\\.?\\s+(\\d+[A-Z]?(?:-\\d+[A-Z]?)?)");
    private final static Pattern NUMBER_PATTERN =
        Pattern.compile("[;,\\.]\\s+(?:No|no|Part|part|NUMB)\\.?\\s+([A-Z]?\\d+(?:[/-]\\d+)?)");
    private final static Pattern DATE_PATTERN =
        Pattern.compile("[;,\\.]\\s+(?:(\\d+)\\s)?(?:([A-Z][a-z][a-z])[\\.,]*\\s)?\\(?(\\d\\d\\d\\d)\\)?(?:\\s([A-Z][a-z][a-z]))?(?:\\s+(\\d+))?");
    private final static Pattern LT_PATTERN =
        Pattern.compile("\\[Lt\\]");
    // optional trailing comma plus trailing whitespace at the end of a string
    private final static Pattern TRAILING_PATTERN =
        Pattern.compile(",?\\s*$");

    // other constants
    // Month abbreviations, 4 characters apart, so (index / 4) is the month number.
    private static final String MONS =
        "jan feb mar apr may jun jul aug sep oct nov dec";
    private static final String[] MONTHS =
        { "January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December" };

    // CSA field abbreviations that map one-to-one onto an entry field
    // without any processing of the field contents.
    private static final Map<String, String> SIMPLE_FIELDS =
        new HashMap<String, String>();
    static {
        SIMPLE_FIELDS.put("AB", "abstract");
        SIMPLE_FIELDS.put("AF", "affiliation");
        SIMPLE_FIELDS.put("CA", "organization");
        SIMPLE_FIELDS.put("DE", "keywords");
        SIMPLE_FIELDS.put("DO", "doi");
        SIMPLE_FIELDS.put("ED", "editor");
        SIMPLE_FIELDS.put("IB", "ISBN");
        SIMPLE_FIELDS.put("IS", "ISSN");
        SIMPLE_FIELDS.put("JN", "journal");
        SIMPLE_FIELDS.put("LA", "language");
        SIMPLE_FIELDS.put("PB", "publisher");
        SIMPLE_FIELDS.put("TI", "title");
    }

    /**
     * Return the name of this import format.
     */
    public String getFormatName() {
        return "CSA";
    }

    /*
     *  (non-Javadoc)
     * @see net.sf.jabref.imports.ImportFormat#getCLIId()
     */
    public String getCLIId() {
        return "csa";
    }

    /**
     * Read one line and advance the line counter unless end of input
     * was reached.
     *
     * @param file the reader to read from
     * @return the line read, or null at end of input
     * @throws IOException if the underlying read fails
     */
    private String readLine(BufferedReader file) throws IOException {
        String str = file.readLine();
        if (str != null) {
            line++;
        }
        return str;
    }

    /**
     * Append a line to the "note" field, creating the field if needed.
     * Successive notes are separated by a newline.
     *
     * @param hm   the field map of the record being built
     * @param note the text to append
     */
    private void addNote(HashMap<String, String> hm, String note) {
        String existing = hm.get("note");
        if (existing == null) {
            hm.put("note", note);
        } else {
            hm.put("note", existing + "\n" + note);
        }
    }

    /**
     * Extract the date from a Source field.  The LAST substring matching
     * DATE_PATTERN is used.  On success "month" and "year" are stored in the
     * map ("year" only if not already present; a differing year is recorded
     * as a note instead) and, when a day is present, the full date is
     * recorded as a note.
     *
     * @param hm   the field map of the record being built
     * @param fstr the Source field text
     * @return fstr with the matched date removed, or fstr unchanged when no
     *         date is found or the match is ambiguous or invalid
     */
    private String parseDate(HashMap<String, String> hm, String fstr) {

        // find LAST matching date in string
        Matcher pm = DATE_PATTERN.matcher(fstr);
        int match = -1;
        while (pm.find()) {
            match = pm.start();
        }

        // re-run the match at that position so the groups can be read
        if (match == -1 || !pm.find(match)) {
            return fstr;
        }

        // the day may appear before the month (group 1) or after the
        // year (group 5), but not both
        String day = pm.group(1);
        if (day == null) {
            day = pm.group(5);
        } else if (pm.group(5) != null) {
            return fstr;	// possible day found in two places
        }
        if ("0".equals(day)) {
            day = null;		// a day of "0" is treated as no day
        }

        // likewise the month: before the year (group 2) or after (group 4)
        String mon = pm.group(2);
        if (mon == null) {
            mon = pm.group(4);
        } else if (pm.group(4) != null) {
            return fstr;	// possible month found in two places
        }

        StringBuilder date = new StringBuilder();
        if (day != null) {
            date.append(day).append(" ");
        }
        if (mon != null) {
            // fixed locale: the lookup must not depend on the default locale
            int idx = MONS.indexOf(mon.toLowerCase(Locale.ENGLISH));
            if (idx == -1) {	// not legal month, error
                return fstr;
            }
            date.append(mon).append(" ");
            hm.put("month", MONTHS[idx / 4]);
        } else if (day != null) { // day found but not month, error
            return fstr;
        }

        String year = pm.group(3);
        date.append(year);

        if (day != null) {
            addNote(hm, "Source Date: " + date + ".");
        }

        // check if journal year matches PY field
        String oyear = hm.get("year");
        if (oyear == null) {
            hm.put("year", year);
        } else if (!year.equals(oyear)) {
            addNote(hm, "Source Year: " + year + ".");
        }

        // cut the matched date out of the source string
        return fstr.substring(0, pm.start()) + fstr.substring(pm.end());
    }

    /**
     * Map the contents of a PT (publication type) field to an entry type
     * name.  The field is split on "; " and the first recognized type wins.
     *
     * @param fstr the PT field text
     * @return the entry type name; "misc" if no type is recognized
     */
    private static String mapPublicationType(String fstr) {
        String[] types = fstr.toLowerCase(Locale.ENGLISH).split("; ");
        for (String t : types) {
            if (t.indexOf("article") >= 0) {	// includes "journal article"
                return "article";
            }
            if (t.equals("dissertation")) {
                return "phdthesis";
            }
            if (t.equals("conference")) {
                return "inproceedings";
            }
            if (t.equals("book monograph")) {
                return "book";
            }
            if (t.equals("report")) {
                return "techreport";
            }
        }
        return "misc";
    }

    /**
     * Convert the contents of an RL field into a newline-separated list of
     * URLs.  The text is split on single spaces; a leading "[URL:" and a
     * trailing "]" are stripped from each token, tokens are concatenated,
     * and a newline is inserted after each token that ended with "]" unless
     * it is the last token.
     *
     * @param fstr the RL field text
     * @return the extracted URLs
     */
    private static String parseResourceLocations(String fstr) {
        String[] tokens = fstr.split(" ");
        StringBuilder urls = new StringBuilder();
        for (int ii = 0; ii < tokens.length; ++ii) {
            String token = tokens[ii];
            if (token.startsWith("[URL:")) {
                token = token.substring(5);
            }
            // checked independently of the prefix so that a complete
            // "[URL:...]" token loses its closing bracket as well
            boolean closes = token.endsWith("]");
            if (closes) {
                token = token.substring(0, token.length() - 1);
            }
            urls.append(token);
            if (closes && ii < tokens.length - 1) {
                urls.append("\n");
            }
        }
        return urls.toString();
    }

    /**
     * Extract journal information (volume, number, pages, date) from an SO
     * field, storing what is found in the map.
     *
     * @param hm   the field map of the record being built
     * @param fstr the SO field text
     * @return what remains of the text once every recognized part and any
     *         trailing comma and whitespace have been removed; may be empty
     */
    private String parseSource(HashMap<String, String> hm, String fstr) {
        String rest = fstr;

        // compact vol(no):page-page:
        Matcher pm = VOLNOPP_PATTERN.matcher(rest);
        if (pm.find()) {
            hm.put("volume", pm.group(1));
            hm.put("number", pm.group(2));
            hm.put("pages", pm.group(3));
            rest = pm.replaceFirst("");
        }

        // pages: collect every occurrence, removing each one as it is found
        StringBuilder pages = new StringBuilder();
        pm = PAGES_PATTERN.matcher(rest);
        while (pm.find()) {
            if (pages.length() > 0) {
                pages.append(",");
            }
            // exactly one of the three alternatives captured the value
            String pp = pm.group(1);
            if (pp == null) {
                pp = pm.group(2);
            }
            if (pp == null) {
                pp = pm.group(3);
            }
            pages.append(pp);
            rest = pm.replaceFirst("");
            pm = PAGES_PATTERN.matcher(rest);
        }
        if (pages.length() > 0) {
            hm.put("pages", pages.toString());
        }

        // volume:
        pm = VOLUME_PATTERN.matcher(rest);
        if (pm.find()) {
            hm.put("volume", pm.group(1));
            rest = pm.replaceFirst("");
        }

        // number:
        pm = NUMBER_PATTERN.matcher(rest);
        if (pm.find()) {
            hm.put("number", pm.group(1));
            rest = pm.replaceFirst("");
        }

        // journal date:
        rest = parseDate(hm, rest);

        // strip trailing comma and whitespace
        return TRAILING_PATTERN.matcher(rest).replaceFirst("");
    }

    /**
     * Check whether the source is in the correct format for this importer.
     *
     * @param stream the input to inspect
     * @return true if some line equals "DN: Database Name"
     * @throws IOException if reading fails
     */
    public boolean isRecognizedFormat(InputStream stream) throws IOException {
        // CSA records start with "DN: Database Name"
        BufferedReader in =
            new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream));
        String str;
        while ((str = in.readLine()) != null) {
            if (str.equals("DN: Database Name")) {
                return true;
            }
        }

        return false;
    }

    /**
     * Parse the entries in the source, and return a List of BibtexEntry
     * objects.
     *
     * @param stream the input to parse
     * @return the entries found, in input order
     * @throws IOException if reading fails, if a field has no contents, or
     *                     if a record does not start with exactly one DN field
     */
    public List<BibtexEntry> importEntries(InputStream stream) throws IOException {
        ArrayList<BibtexEntry> bibitems = new ArrayList<BibtexEntry>();
        StringBuilder sb = new StringBuilder();
        HashMap<String, String> hm = new HashMap<String, String>();

        BufferedReader in =
            new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream));

        String type = null;	// entry type of the current record
        boolean first = true;	// true until the record's DN field is seen
        // NOTE(review): the counter starts at 1 and is incremented after each
        // read, so "fline" below appears to be one past the field header
        // line -- confirm whether that is intended.
        line = 1;
        String str = readLine(in);
        while (true) {
            if (str == null || str.length() == 0) {	// end of record
                if (!hm.isEmpty()) { // have a record
                    if (type == null) {
                        addNote(hm, "Publication Type: [NOT SPECIFIED]");
                        addNote(hm, "[PERHAPS NOT FULL FORMAT]");
                        type = "article";
                    }

                    // post-process Journal article
                    if (type.equals("article") &&
                        hm.get("booktitle") != null) {
                        hm.put("journal", hm.remove("booktitle"));
                    }

                    BibtexEntry b =
                        new BibtexEntry(BibtexFields.DEFAULT_BIBTEXENTRY_ID,
                                        Globals.getEntryType(type));

                    // create one here
                    b.setField(hm);

                    bibitems.add(b);
                }
                hm.clear();	// ready for next record
                // the type belongs to the record just finished; without this
                // reset a record lacking a PT field would inherit it
                type = null;
                first = true;
                if (str == null) {
                    break;	// end of file
                }
                str = readLine(in);
                continue;
            }

            int fline = line;	// save this before reading field contents
            Matcher fm = FIELD_PATTERN.matcher(str);
            if (!fm.find()) {
                str = readLine(in);	// not a field header, skip the line
                continue;
            }

            // save the field name (long and short)
            String fabbr = fm.group(1);
            String fname = fm.group(2);

            // read the contents of the field; on exit "str" holds the first
            // line that is not part of this field (or null at end of file)
            sb.setLength(0); // clear the buffer
            while ((str = readLine(in)) != null) {
                if (!str.startsWith("    ")) { // field contents?
                    break;	// nope
                }
                if (sb.length() > 0) {
                    sb.append(" ");
                }
                sb.append(str.substring(4)); // skip spaces
            }
            String fstr = sb.toString();
            if (fstr.length() == 0) {
                throw new IOException("illegal empty field at line " +
                                      (line - 1));
            }
            // replace [Lt] with <
            fstr = LT_PATTERN.matcher(fstr).replaceAll("<");

            // check for start of new record
            if (fabbr.equals("DN") &&
                fname.equalsIgnoreCase("Database Name")) {
                if (!first) {
                    throw new IOException("format error at line " + fline +
                                          ": DN out of order");
                }
                first = false;
            } else if (first) {
                throw new IOException("format error at line " + fline +
                                          ": missing DN");
            }

            // PT sets the entry type; its text also ends up in the note
            // below because PT has no field mapping
            if (fabbr.equals("PT")) {
                type = mapPublicationType(fstr);
            }

            String ftype = SIMPLE_FIELDS.get(fabbr);
            if (fabbr.equals("AU")) {
                ftype = "author";
                fstr = fstr.replace("; ", " and ");
            } else if (fabbr.equals("PY")) {
                ftype = "year";
                // a year already present (taken from the Source field) that
                // differs is kept as a note before being overwritten
                String oyear = hm.get("year");
                if (oyear != null && !fstr.equals(oyear)) {
                    addNote(hm, "Source Year: " + oyear + ".");
                }
            } else if (fabbr.equals("RL")) {
                ftype = "url";
                fstr = parseResourceLocations(fstr);
            } else if (fabbr.equals("SO")) {
                ftype = "booktitle";
                // see if we can extract journal information
                fstr = parseSource(hm, fstr);
                if (fstr.equals("")) {
                    continue;	// nothing left to store
                }
            } else if (fabbr.equals("RE")) {
                continue;	// throw away References
            }

            if (ftype != null) {
                hm.put(ftype, fstr);
            } else {
                // unmapped field: keep it in the note under its long name
                addNote(hm, fname + ": " + fstr + ".");
            }
        }

        return bibitems;
    }
}