package org.jabref.logic.msbib;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jabref.model.entry.Author;
import org.jabref.model.entry.AuthorList;
import org.jabref.model.strings.StringUtil;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
* MSBib entry representation
*
* @see <a href="http://mahbub.wordpress.com/2007/03/24/details-of-microsoft-office-2007-bibliographic-format-compared-to-bibtex/">ms office 2007 bibliography format compared to bibtex</a>
* @see <a href="http://mahbub.wordpress.com/2007/03/22/deciphering-microsoft-office-2007-bibliography-format/">deciphering ms office 2007 bibliography format</a>
* @see <a href="http://www.ecma-international.org/publications/standards/Ecma-376.htm">ECMA Standard</a>
*/
class MSBibEntry {
/**
* Allows 20.3-2007|||20/3- 2007 etc.
* <b>(\d{1,2})\s?[.,-/]\s?(\d{1,2})\s?[.,-/]\s?(\d{2,4})</b>
* 1-2 DIGITS SPACE SEPERATOR SPACE 1-2 DIGITS SPACE SEPERATOR SPACE 2-4 DIGITS
*/
private static final Pattern DATE_PATTERN = Pattern
.compile("(\\d{1,2})\\s*[.,-/]\\s*(\\d{1,2})\\s*[.,-/]\\s*(\\d{2,4})");
// MSBib fields and values
public Map<String, String> fields = new HashMap<>();
public List<MsBibAuthor> authors;
public List<MsBibAuthor> bookAuthors;
public List<MsBibAuthor> editors;
public List<MsBibAuthor> translators;
public List<MsBibAuthor> producerNames;
public List<MsBibAuthor> composers;
public List<MsBibAuthor> conductors;
public List<MsBibAuthor> performers;
public List<MsBibAuthor> writers;
public List<MsBibAuthor> directors;
public List<MsBibAuthor> compilers;
public List<MsBibAuthor> interviewers;
public List<MsBibAuthor> interviewees;
public List<MsBibAuthor> inventors;
public List<MsBibAuthor> counsels;
public PageNumbers pages;
public String standardNumber;
public String address;
public String conferenceName;
public String thesisType;
public String internetSiteTitle;
public String dateAccessed;
public String publicationTitle;
public String albumTitle;
public String broadcastTitle;
public String year;
public String month;
public String day;
public String number;
public String patentNumber;
public String journalName;
private String bibtexEntryType;
/**
* reduced subset, supports only "CITY , STATE, COUNTRY" <br>
* <b>\b(\w+)\s?[,]?\s?(\w+)\s?[,]?\s?(\w*)\b</b> <br>
* WORD SPACE , SPACE WORD SPACE (Can be zero or more) , SPACE WORD (Can be zero or more) <br>
* Matches both single locations (only city) like Berlin and full locations like Stroudsburg, PA, USA <br>
* tested using http://www.regexpal.com/
*/
private final Pattern ADDRESS_PATTERN = Pattern.compile("\\b(\\w+)\\s?[,]?\\s?(\\w*)\\s?[,]?\\s?(\\w*)\\b");
public MSBibEntry() {
//empty
}
/**
* Createa new {@link MsBibEntry} to import from an xml element
* @param entry
*/
public MSBibEntry(Element entry) {
populateFromXml(entry);
}
public String getType() {
return fields.get("SourceType");
}
public String getCiteKey() {
return fields.get("Tag");
}
private String getXmlElementTextContent(String name, Element entry) {
String value = null;
NodeList nodeLst = entry.getElementsByTagNameNS("*", name);
if (nodeLst.getLength() > 0) {
value = nodeLst.item(0).getTextContent();
}
return value;
}
private void populateFromXml(Element entry) {
for (int i = 0; i < entry.getChildNodes().getLength(); i++) {
Node node = entry.getChildNodes().item(i);
if (node.getNodeType() == Node.ELEMENT_NODE) {
String key = node.getLocalName();
String value = node.getTextContent();
if ("SourceType".equals(key)) {
this.bibtexEntryType = value;
}
fields.put(key, value);
}
}
String temp = getXmlElementTextContent("Pages", entry);
if (temp != null) {
pages = new PageNumbers(temp);
}
standardNumber = getXmlElementTextContent("StandardNumber", entry);
conferenceName = getXmlElementTextContent("ConferenceName", entry);
String city = getXmlElementTextContent("City", entry);
String state = getXmlElementTextContent("StateProvince", entry);
String country = getXmlElementTextContent("CountryRegion", entry);
StringBuilder addressBuffer = new StringBuilder();
if (city != null) {
addressBuffer.append(city);
}
if (((state != null) && !state.isEmpty()) && ((city != null) && !city.isEmpty())) {
addressBuffer.append(",").append(' ');
addressBuffer.append(state);
}
if ((country != null) && !country.isEmpty()) {
addressBuffer.append(",").append(' ');
addressBuffer.append(country);
}
address = addressBuffer.toString().trim();
if (address.isEmpty() || ",".equals(address)) {
address = null;
}
if ("Patent".equalsIgnoreCase(bibtexEntryType)) {
number = getXmlElementTextContent("PatentNumber", entry);
}
journalName = getXmlElementTextContent("JournalName", entry);
month = getXmlElementTextContent("Month", entry);
internetSiteTitle = getXmlElementTextContent("InternetSiteTitle", entry);
String monthAccessed = getXmlElementTextContent("MonthAccessed", entry);
String dayAccessed = getXmlElementTextContent("DayAccessed", entry);
String yearAccessed = getXmlElementTextContent("YearAccessed", entry);
StringBuilder sbDateAccesed = new StringBuilder();
if (monthAccessed != null) {
sbDateAccesed.append(monthAccessed);
sbDateAccesed.append(' ');
}
if (dayAccessed != null) {
sbDateAccesed.append(dayAccessed);
sbDateAccesed.append(", ");
}
if (yearAccessed != null) {
sbDateAccesed.append(yearAccessed);
}
dateAccessed = sbDateAccesed.toString().trim();
if (dateAccessed.isEmpty() || ",".equals(dateAccessed)) {
dateAccessed = null;
}
NodeList nodeLst = entry.getElementsByTagNameNS("*", "Author");
if (nodeLst.getLength() > 0) {
getAuthors((Element) nodeLst.item(0));
}
}
private void getAuthors(Element authorsElem) {
authors = getSpecificAuthors("Author", authorsElem);
bookAuthors = getSpecificAuthors("BookAuthor", authorsElem);
editors = getSpecificAuthors("Editor", authorsElem);
translators = getSpecificAuthors("Translator", authorsElem);
producerNames = getSpecificAuthors("ProducerName", authorsElem);
composers = getSpecificAuthors("Composer", authorsElem);
conductors = getSpecificAuthors("Conductor", authorsElem);
performers = getSpecificAuthors("Performer", authorsElem);
writers = getSpecificAuthors("Writer", authorsElem);
directors = getSpecificAuthors("Director", authorsElem);
compilers = getSpecificAuthors("Compiler", authorsElem);
interviewers = getSpecificAuthors("Interviewer", authorsElem);
interviewees = getSpecificAuthors("Interviewee", authorsElem);
inventors = getSpecificAuthors("Inventor", authorsElem);
counsels = getSpecificAuthors("Counsel", authorsElem);
}
private List<MsBibAuthor> getSpecificAuthors(String type, Element authors) {
List<MsBibAuthor> result = null;
NodeList nodeLst = authors.getElementsByTagNameNS("*", type);
if (nodeLst.getLength() <= 0) {
return result;
}
nodeLst = ((Element) nodeLst.item(0)).getElementsByTagNameNS("*", "NameList");
if (nodeLst.getLength() <= 0) {
return result;
}
NodeList person = ((Element) nodeLst.item(0)).getElementsByTagNameNS("*", "Person");
if (person.getLength() <= 0) {
return result;
}
result = new LinkedList<>();
for (int i = 0; i < person.getLength(); i++) {
NodeList firstName = ((Element) person.item(i)).getElementsByTagNameNS("*", "First");
NodeList lastName = ((Element) person.item(i)).getElementsByTagNameNS("*", "Last");
NodeList middleName = ((Element) person.item(i)).getElementsByTagNameNS("*", "Middle");
StringBuilder sb = new StringBuilder();
if (firstName.getLength() > 0) {
sb.append(firstName.item(0).getTextContent());
sb.append(" ");
}
if (middleName.getLength() > 0) {
sb.append(middleName.item(0).getTextContent());
sb.append(" ");
}
if (lastName.getLength() > 0) {
sb.append(lastName.item(0).getTextContent());
}
AuthorList authorList = AuthorList.parse(sb.toString());
for (Author author : authorList.getAuthors()) {
result.add(new MsBibAuthor(author));
}
}
return result;
}
/**
* Gets the dom representation for one entry, used for export
* @param document XmlDocument
* @return XmlElement represenation of one entry
*/
public Element getEntryDom(Document document) {
Element rootNode = document.createElementNS(MSBibDatabase.NAMESPACE, MSBibDatabase.PREFIX + "Source");
for (Map.Entry<String, String> entry : fields.entrySet()) {
addField(document, rootNode, entry.getKey(), entry.getValue());
}
// based on bibtex content
if (dateAccessed != null) {
Matcher matcher = DATE_PATTERN.matcher(dateAccessed);
if (matcher.matches() && (matcher.groupCount() >= 3)) {
addField(document, rootNode, "Month" + "Accessed", matcher.group(1));
addField(document, rootNode, "Day" + "Accessed", matcher.group(2));
addField(document, rootNode, "Year" + "Accessed", matcher.group(3));
}
}
Element allAuthors = document.createElementNS(MSBibDatabase.NAMESPACE, MSBibDatabase.PREFIX + "Author");
addAuthor(document, allAuthors, "Author", authors);
addAuthor(document, allAuthors, "BookAuthor", bookAuthors);
addAuthor(document, allAuthors, "Editor", editors);
addAuthor(document, allAuthors, "Translator", translators);
addAuthor(document, allAuthors, "ProducerName", producerNames);
addAuthor(document, allAuthors, "Composer", composers);
addAuthor(document, allAuthors, "Conductor", conductors);
addAuthor(document, allAuthors, "Performer", performers);
addAuthor(document, allAuthors, "Writer", writers);
addAuthor(document, allAuthors, "Director", directors);
addAuthor(document, allAuthors, "Compiler", compilers);
addAuthor(document, allAuthors, "Interviewer", interviewers);
addAuthor(document, allAuthors, "Interviewee", interviewees);
addAuthor(document, allAuthors, "Inventor", inventors);
addAuthor(document, allAuthors, "Counsel", counsels);
rootNode.appendChild(allAuthors);
if (pages != null) {
addField(document, rootNode, "Pages", pages.toString("-"));
}
addField(document, rootNode, "Year", year);
addField(document, rootNode, "Month", month);
addField(document, rootNode, "Day", day);
addField(document, rootNode, "JournalName", journalName);
addField(document, rootNode, "PatentNumber", patentNumber);
addField(document, rootNode, "Number", number);
addField(document, rootNode, "StandardNumber", standardNumber);
addField(document, rootNode, "ConferenceName", conferenceName);
addAddress(document, rootNode, address);
addField(document, rootNode, "ThesisType", thesisType);
addField(document, rootNode, "InternetSiteTitle", internetSiteTitle);
addField(document, rootNode, "PublicationTitle", publicationTitle);
addField(document, rootNode, "AlbumTitle", albumTitle);
addField(document, rootNode, "BroadcastTitle", broadcastTitle);
return rootNode;
}
private void addField(Document document, Element parent, String name, String value) {
if (value == null) {
return;
}
Element elem = document.createElementNS(MSBibDatabase.NAMESPACE, MSBibDatabase.PREFIX + name);
elem.appendChild(document.createTextNode(StringUtil.stripNonValidXMLCharacters(value)));
parent.appendChild(elem);
}
//Add authors for export
private void addAuthor(Document document, Element allAuthors, String entryName, List<MsBibAuthor> authorsLst) {
if (authorsLst == null) {
return;
}
Element authorTop = document.createElementNS(MSBibDatabase.NAMESPACE, MSBibDatabase.PREFIX + entryName);
Optional<MsBibAuthor> personName = authorsLst.stream().filter(MsBibAuthor::isCorporate)
.findFirst();
if (personName.isPresent()) {
MsBibAuthor person = personName.get();
Element corporate = document.createElementNS(MSBibDatabase.NAMESPACE,
MSBibDatabase.PREFIX + "Corporate");
corporate.setTextContent(person.getFirstLast());
authorTop.appendChild(corporate);
} else {
Element nameList = document.createElementNS(MSBibDatabase.NAMESPACE, MSBibDatabase.PREFIX + "NameList");
for (MsBibAuthor name : authorsLst) {
Element person = document.createElementNS(MSBibDatabase.NAMESPACE, MSBibDatabase.PREFIX + "Person");
addField(document, person, "Last", name.getLastName());
addField(document, person, "Middle", name.getMiddleName());
addField(document, person, "First", name.getFirstName());
nameList.appendChild(person);
}
authorTop.appendChild(nameList);
}
allAuthors.appendChild(authorTop);
}
private void addAddress(Document document, Element parent, String addressToSplit) {
if (addressToSplit == null) {
return;
}
Matcher matcher = ADDRESS_PATTERN.matcher(addressToSplit);
if (matcher.matches() && (matcher.groupCount() >= 3)) {
addField(document, parent, "City", matcher.group(1));
addField(document, parent, "StateProvince", matcher.group(2));
addField(document, parent, "CountryRegion", matcher.group(3));
} else {
addField(document, parent, "City", addressToSplit);
}
}
}