package org.jabref.logic.importer.fileformat; import java.io.BufferedReader; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Optional; import java.util.regex.Pattern; import org.jabref.logic.importer.Importer; import org.jabref.logic.importer.ParserResult; import org.jabref.logic.util.FileExtensions; import org.jabref.logic.util.OS; import org.jabref.model.entry.AuthorList; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.FieldName; /** * Importer for the MEDLINE Plain format. * * check here for details on the format * http://www.nlm.nih.gov/bsd/mms/medlineelements.html * * @author vegeziel */ public class MedlinePlainImporter extends Importer { private static final Pattern PMID_PATTERN = Pattern.compile("PMID.*-.*"); private static final Pattern PMC_PATTERN = Pattern.compile("PMC.*-.*"); private static final Pattern PMCR_PATTERN = Pattern.compile("PMCR.*-.*"); private static final Pattern CREATE_DATE_PATTERN = Pattern.compile("\\d{4}/[0123]?\\d/\\s?[012]\\d:[0-5]\\d"); private static final Pattern COMPLETE_DATE_PATTERN = Pattern.compile("\\d{8}"); @Override public String getName() { return "Medline/PubMed Plain"; } @Override public FileExtensions getExtensions() { return FileExtensions.MEDLINE_PLAIN; } @Override public String getDescription() { return "Importer for the MedlinePlain format."; } @Override public String getId() { return "medlineplain"; } @Override public boolean isRecognizedFormat(BufferedReader reader) throws IOException { // Our strategy is to look for the "PMID - *", "PMC.*-.*", or "PMCR.*-.*" line // (i.e., PubMed Unique Identifier, PubMed Central Identifier, PubMed Central Release) String str; while ((str = reader.readLine()) != null) { if (PMID_PATTERN.matcher(str).find() || PMC_PATTERN.matcher(str).find() || PMCR_PATTERN.matcher(str).find()) { return true; } } return false; } @Override public ParserResult importDatabase(BufferedReader reader) throws IOException { List<BibEntry> bibitems = new ArrayList<>(); //use optional here, so that no exception will be thrown if the file is empty Optional<String> OptionalLines = reader.lines().reduce((line, nextline) -> line + "\n" + nextline); String linesAsString = OptionalLines.isPresent() ? OptionalLines.get() : ""; String[] entries = linesAsString.replace("\u2013", "-").replace("\u2014", "--").replace("\u2015", "--") .split("\\n\\n"); for (String entry1 : entries) { if (entry1.trim().isEmpty() || !entry1.contains("-")) { continue; } String type = BibEntry.DEFAULT_TYPE; String author = ""; String editor = ""; String comment = ""; Map<String, String> fields = new HashMap<>(); String[] lines = entry1.split("\n"); for (int j = 0; j < lines.length; j++) { StringBuilder current = new StringBuilder(lines[j]); boolean done = false; while (!done && (j < (lines.length - 1))) { if (lines[j + 1].length() <= 4) { j++; continue; } if (lines[j + 1].charAt(4) != '-') { if ((current.length() > 0) && !Character.isWhitespace(current.charAt(current.length() - 1))) { current.append(' '); } current.append(lines[j + 1].trim()); j++; } else { done = true; } } String entry = current.toString(); if (!checkLineValidity(entry)) { continue; } String label = entry.substring(0, entry.indexOf('-')).trim(); String value = entry.substring(entry.indexOf('-') + 1).trim(); if ("PT".equals(label)) { type = addSourceType(value, type); } addDates(fields, label, value); addAbstract(fields, label, value); addTitles(fields, label, value, type); addIDs(fields, label, value); addStandardNumber(fields, label, value); if ("FAU".equals(label)) { if ("".equals(author)) { author = value; } else { author += " and " + value; } } else if ("FED".equals(label)) { if ("".equals(editor)) { editor = value; } else { editor += " and " + value; } } //store the fields in a map Map<String, String> hashMap = new HashMap<>(); hashMap.put("PG", FieldName.PAGES); hashMap.put("PL", FieldName.ADDRESS); hashMap.put("PHST", "history"); hashMap.put("PST", "publication-status"); hashMap.put("VI", FieldName.VOLUME); hashMap.put("LA", FieldName.LANGUAGE); hashMap.put("PUBM", "model"); hashMap.put("RN", "registry-number"); hashMap.put("NM", "substance-name"); hashMap.put("OCI", "copyright-owner"); hashMap.put("CN", "corporate"); hashMap.put("IP", FieldName.ISSUE); hashMap.put("EN", FieldName.EDITION); hashMap.put("GS", "gene-symbol"); hashMap.put("GN", FieldName.NOTE); hashMap.put("GR", "grantno"); hashMap.put("SO", "source"); hashMap.put("NR", "number-of-references"); hashMap.put("SFM", "space-flight-mission"); hashMap.put("STAT", "status"); hashMap.put("SB", "subset"); hashMap.put("OTO", "termowner"); hashMap.put("OWN", FieldName.OWNER); //add the fields to hm for (Map.Entry<String, String> mapEntry : hashMap.entrySet()) { String medlineKey = mapEntry.getKey(); String bibtexKey = mapEntry.getValue(); if (medlineKey.equals(label)) { fields.put(bibtexKey, value); } } if ("IRAD".equals(label) || "IR".equals(label) || "FIR".equals(label)) { String oldInvestigator = fields.get("investigator"); if (oldInvestigator == null) { fields.put("investigator", value); } else { fields.put("investigator", oldInvestigator + ", " + value); } } else if ("MH".equals(label) || "OT".equals(label)) { if (!fields.containsKey(FieldName.KEYWORDS)) { fields.put(FieldName.KEYWORDS, value); } else { String kw = fields.get(FieldName.KEYWORDS); fields.put(FieldName.KEYWORDS, kw + ", " + value); } } else if ("CON".equals(label) || "CIN".equals(label) || "EIN".equals(label) || "EFR".equals(label) || "CRI".equals(label) || "CRF".equals(label) || "PRIN".equals(label) || "PROF".equals(label) || "RPI".equals(label) || "RPF".equals(label) || "RIN".equals(label) || "ROF".equals(label) || "UIN".equals(label) || "UOF".equals(label) || "SPIN".equals(label) || "ORI".equals(label)) { if (!comment.isEmpty()) { comment = comment + "\n"; } comment = comment + value; } } fixAuthors(fields, author, FieldName.AUTHOR); fixAuthors(fields, editor, FieldName.EDITOR); if (!comment.isEmpty()) { fields.put(FieldName.COMMENT, comment); } BibEntry b = new BibEntry(type); // Remove empty fields: fields.entrySet().stream().filter(n -> n.getValue().trim().isEmpty()).forEach(fields::remove); // create one here b.setField(fields); bibitems.add(b); } return new ParserResult(bibitems); } private boolean checkLineValidity(String line) { return (line.length() >= 5) && (line.charAt(4) == '-'); } private String addSourceType(String value, String type) { String val = value.toLowerCase(Locale.ENGLISH); String theType = type; switch (val) { case "book": theType = "book"; break; case "journal article": case "classical article": case "corrected and republished article": case "historical article": case "introductory journal article": case "newspaper article": theType = "article"; break; case "clinical conference": case "consensus development conference": case "consensus development conference, nih": theType = "conference"; break; case "technical report": theType = "techreport"; break; case "editorial": theType = "inproceedings"; break; case "overall": theType = "proceedings"; break; default: break; } if ("".equals(theType)) { theType = "other"; } return theType; } private void addStandardNumber(Map<String, String> hm, String lab, String value) { if ("IS".equals(lab)) { String key = FieldName.ISSN; //it is possible to have two issn, one for electronic and for print //if there are two then it comes at the end in brackets (electronic) or (print) //so search for the brackets if (value.indexOf('(') > 0) { int keyStart = value.indexOf('('); int keyEnd = value.indexOf(')'); key = value.substring(keyStart + 1, keyEnd) + "-" + key; String numberValue = value.substring(0, keyStart - 1); hm.put(key, numberValue); } else { hm.put(key, value); } } else if ("ISBN".equals(lab)) { hm.put(FieldName.ISBN, value); } } private void fixAuthors(Map<String, String> hm, String author, String field) { if (!author.isEmpty()) { String fixedAuthor = AuthorList.fixAuthorLastNameFirst(author); hm.put(field, fixedAuthor); } } private void addIDs(Map<String, String> hm, String lab, String value) { if ("AID".equals(lab)) { String key = "article-id"; String idValue = value; if (value.startsWith("doi:")) { idValue = idValue.replaceAll("(?i)doi:", "").trim(); key = FieldName.DOI; } else if (value.indexOf('[') > 0) { int startOfIdentifier = value.indexOf('['); int endOfIdentifier = value.indexOf(']'); key = "article-" + value.substring(startOfIdentifier + 1, endOfIdentifier); idValue = value.substring(0, startOfIdentifier - 1); } hm.put(key, idValue); } else if ("LID".equals(lab)) { hm.put("location-id", value); } else if ("MID".equals(lab)) { hm.put("manuscript-id", value); } else if ("JID".equals(lab)) { hm.put("nlm-unique-id", value); } else if ("OID".equals(lab)) { hm.put("other-id", value); } else if ("SI".equals(lab)) { hm.put("second-id", value); } } private void addTitles(Map<String, String> hm, String lab, String val, String type) { if ("TI".equals(lab)) { String oldVal = hm.get(FieldName.TITLE); if (oldVal == null) { hm.put(FieldName.TITLE, val); } else { if (oldVal.endsWith(":") || oldVal.endsWith(".") || oldVal.endsWith("?")) { hm.put(FieldName.TITLE, oldVal + " " + val); } else { hm.put(FieldName.TITLE, oldVal + ": " + val); } } } else if ("BTI".equals(lab) || "CTI".equals(lab)) { hm.put(FieldName.BOOKTITLE, val); } else if ("JT".equals(lab)) { if ("inproceedings".equals(type)) { hm.put(FieldName.BOOKTITLE, val); } else { hm.put(FieldName.JOURNAL, val); } } else if ("CTI".equals(lab)) { hm.put("collection-title", val); } else if ("TA".equals(lab)) { hm.put("title-abbreviation", val); } else if ("TT".equals(lab)) { hm.put("transliterated-title", val); } else if ("VTI".equals(lab)) { hm.put("volume-title", val); } } private void addAbstract(Map<String, String> hm, String lab, String value) { String abstractValue = ""; if ("AB".equals(lab)) { //adds copyright information that comes at the end of an abstract if (value.contains("Copyright")) { int copyrightIndex = value.lastIndexOf("Copyright"); //remove the copyright from the field since the name of the field is copyright String copyrightInfo = value.substring(copyrightIndex, value.length()).replaceAll("Copyright ", ""); hm.put("copyright", copyrightInfo); abstractValue = value.substring(0, copyrightIndex); } else { abstractValue = value; } String oldAb = hm.get(FieldName.ABSTRACT); if (oldAb == null) { hm.put(FieldName.ABSTRACT, abstractValue); } else { hm.put(FieldName.ABSTRACT, oldAb + OS.NEWLINE + abstractValue); } } else if ("OAB".equals(lab) || "OABL".equals(lab)) { hm.put("other-abstract", value); } } private void addDates(Map<String, String> hm, String lab, String val) { if ("CRDT".equals(lab) && isCreateDateFormat(val)) { hm.put("create-date", val); } else if ("DEP".equals(lab) && isDateFormat(val)) { hm.put("electronic-publication", val); } else if ("DA".equals(lab) && isDateFormat(val)) { hm.put("date-created", val); } else if ("DCOM".equals(lab) && isDateFormat(val)) { hm.put("completed", val); } else if ("LR".equals(lab) && isDateFormat(val)) { hm.put("revised", val); } else if ("DP".equals(lab)) { String[] parts = val.split(" "); hm.put(FieldName.YEAR, parts[0]); if ((parts.length > 1) && !parts[1].isEmpty()) { hm.put(FieldName.MONTH, parts[1]); } } else if ("EDAT".equals(lab) && isCreateDateFormat(val)) { hm.put("publication", val); } else if ("MHDA".equals(lab) && isCreateDateFormat(val)) { hm.put("mesh-date", val); } } private boolean isCreateDateFormat(String value) { return CREATE_DATE_PATTERN.matcher(value).matches(); } private boolean isDateFormat(String value) { return COMPLETE_DATE_PATTERN.matcher(value).matches(); } }