/* * Created on April 01, 2007 * Updated on May 03, 2007 * */ package net.sf.jabref.msbib; import java.io.StringWriter; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import net.sf.jabref.BibtexEntry; import net.sf.jabref.BibtexEntryType; import net.sf.jabref.BibtexFields; import net.sf.jabref.export.layout.LayoutFormatter; import net.sf.jabref.export.layout.format.XMLChars; import net.sf.jabref.mods.PageNumbers; import net.sf.jabref.mods.PersonName; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; /** * @author S M Mahbub Murshed * @email udvranto@yahoo.com * * @version 2.0.0 * @see http://mahbub.wordpress.com/2007/03/24/details-of-microsoft-office-2007-bibliographic-format-compared-to-bibtex/ * @see http://mahbub.wordpress.com/2007/03/22/deciphering-microsoft-office-2007-bibliography-format/ * * Date: May 15, 2007; May 03, 2007 * * History * May 03, 2007 - Added export functionality * May 15, 2007 - Added import functionality * May 16, 2007 - Changed all integer entries to strings, * except LCID which must be an integer. * To avoid an exception during integer parsing * the exception is caught and LCID is set to -1. 
*/
public class MSBibEntry {

    // ---- Source identification ------------------------------------------
    // LCID stays negative while no locale id is known; the DOM export only
    // writes it when it is >= 0.
    protected String sourceType = "Misc";
    protected String bibTexEntry = null;
    protected String tag = null;
    protected String GUID = null;
    protected int LCID = -1;

    // ---- Contributors: one list per role element below the Author element --
    protected List<PersonName> authors = null;
    protected List<PersonName> bookAuthors = null;
    protected List<PersonName> editors = null;
    protected List<PersonName> translators = null;
    protected List<PersonName> producerNames = null;
    protected List<PersonName> composers = null;
    protected List<PersonName> conductors = null;
    protected List<PersonName> performers = null;
    protected List<PersonName> writers = null;
    protected List<PersonName> directors = null;
    protected List<PersonName> compilers = null;
    protected List<PersonName> interviewers = null;
    protected List<PersonName> interviewees = null;
    protected List<PersonName> inventors = null;
    protected List<PersonName> counsels = null;

    // ---- Plain MS Office bibliography fields --------------------------------
    protected String title = null;
    protected String year = null;
    protected String month = null;
    protected String day = null;
    protected String shortTitle = null;
    protected String comments = null;
    protected PageNumbers pages = null;
    protected String volume = null;
    protected String numberOfVolumes = null;
    protected String edition = null;
    protected String standardNumber = null;
    protected String publisher = null;
    protected String address = null;
    protected String bookTitle = null;
    protected String chapterNumber = null;
    protected String journalName = null;
    protected String issue = null;
    protected String periodicalTitle = null;
    protected String conferenceName = null;
    protected String department = null;
    protected String institution = null;
    protected String thesisType = null;
    protected String internetSiteTitle = null;
    protected String dateAccessed = null;
    protected String url = null;
    protected String productionCompany = null;
    protected String publicationTitle = null;
    protected String medium = null;
    protected String albumTitle = null;
    protected String recordingNumber = null;
    protected String theater = null;
    protected String distributor = null;
    protected String broadcastTitle = null;
    protected String broadcaster = null;
    protected String station = null;
    protected String type = null;
    protected String patentNumber = null;
    protected String court = null;
    protected String reporter = null;
    protected String caseNumber = null;
    protected String abbreviatedCaseNumber = null;

    // ---- BibTeX-only fields, stored in the XML under the BIBTEX_ prefix ----
    protected String bibTex_Series = null;
    protected String bibTex_Abstract = null;
    protected String bibTex_KeyWords = null;
    protected String bibTex_CrossRef = null;
    protected String bibTex_HowPublished = null;
    protected String bibTex_Affiliation = null;
    protected String bibTex_Contents = null;
    protected String bibTex_Copyright = null;
    protected String bibTex_Price = null;
    protected String bibTex_Size = null;
    /* SM 2010.10 intype, paper support */
    protected String bibTex_InType = null;
    protected String bibTex_Paper = null;

    /** Prefix of the XML element names that carry BibTeX-only fields. */
    private final String BIBTEX = "BIBTEX_";
    /** Prefix of the BibTeX field names that carry MS-Office-only fields. */
    private final String MSBIB = "msbib-";
    /** Element-name prefix used when this class creates XML elements. */
    private final String bcol = "b:";
    /** When true, title and abstract are run through {@link XMLChars} on BibTeX import. */
    private final boolean FORMATXML = false;

    /** Creates an empty entry whose source type is "Misc". */
    public MSBibEntry() {
    }

    /**
     * Creates an entry populated from a BibTeX entry.
     *
     * @param bibtex the BibTeX entry to convert
     */
    public MSBibEntry(BibtexEntry bibtex) {
        this();
        populateFromBibtex(bibtex);
    }

    /**
     * Creates an entry populated from an XML source element.
     *
     * @param entry the XML element describing one source
     * @param _bcol prefix put in front of every element name that is looked up
     */
    public MSBibEntry(Element entry, String _bcol) {
        this();
        populateFromXml(entry, _bcol);
    }

    /**
     * Returns the text content of the first descendant of {@code entry}
     * whose tag name is {@code name}.
     *
     * @param name  tag name to look for, prefix included
     * @param entry element to search below
     * @return the text content, or {@code null} when no such element exists
     */
    protected String getFromXml(String name, Element entry) {
        NodeList matches = entry.getElementsByTagName(name);
        if (matches.getLength() > 0) {
            return matches.item(0).getTextContent();
        }
        return null;
    }

    /**
     * Trims {@code text}, drops one trailing comma left behind by a missing
     * last component, and maps an empty result to {@code null}.
     */
    private static String trimTrailingComma(String text) {
        String result = text.trim();
        if (result.endsWith(",")) {
            result = result.substring(0, result.length() - 1).trim();
        }
        return result.length() == 0 ? null : result;
    }

    /**
     * Fills this entry from an XML source element. Every field is overwritten
     * with the text of the matching element, or with {@code null} when the
     * element is absent; LCID is only touched when an LCID element exists.
     *
     * @param entry the XML element describing one source
     * @param _bcol prefix put in front of every element name that is looked up
     */
    protected void populateFromXml(Element entry, String _bcol) {
        sourceType = getFromXml(_bcol + "SourceType", entry);
        tag = getFromXml(_bcol + "Tag", entry);

        String lcidText = getFromXml(_bcol + "LCID", entry);
        if (lcidText != null) {
            try {
                LCID = Integer.parseInt(lcidText);
            } catch (NumberFormatException e) {
                // Not a number: treat the locale id as unknown.
                LCID = -1;
            }
        }

        title = getFromXml(_bcol + "Title", entry);
        year = getFromXml(_bcol + "Year", entry);
        month = getFromXml(_bcol + "Month", entry);
        day = getFromXml(_bcol + "Day", entry);
        shortTitle = getFromXml(_bcol + "ShortTitle", entry);
        comments = getFromXml(_bcol + "Comments", entry);

        String pageText = getFromXml(_bcol + "Pages", entry);
        if (pageText != null) {
            pages = new PageNumbers(pageText);
        }

        volume = getFromXml(_bcol + "Volume", entry);
        numberOfVolumes = getFromXml(_bcol + "NumberVolumes", entry);
        edition = getFromXml(_bcol + "Edition", entry);
        standardNumber = getFromXml(_bcol + "StandardNumber", entry);
        publisher = getFromXml(_bcol + "Publisher", entry);

        // The address is assembled as "City, State Country". A missing tail
        // used to leave a dangling comma ("Paris,"); it is removed now.
        String city = getFromXml(_bcol + "City", entry);
        String state = getFromXml(_bcol + "StateProvince", entry);
        String country = getFromXml(_bcol + "CountryRegion", entry);
        StringBuilder location = new StringBuilder();
        if (city != null) {
            location.append(city).append(", ");
        }
        if (state != null) {
            location.append(state).append(' ');
        }
        if (country != null) {
            location.append(country);
        }
        address = trimTrailingComma(location.toString());

        bookTitle = getFromXml(_bcol + "BookTitle", entry);
        chapterNumber = getFromXml(_bcol + "ChapterNumber", entry);
        journalName = getFromXml(_bcol + "JournalName", entry);
        issue = getFromXml(_bcol + "Issue", entry);
        periodicalTitle = getFromXml(_bcol + "PeriodicalTitle", entry);
        conferenceName = getFromXml(_bcol + "ConferenceName", entry);
        department = getFromXml(_bcol + "Department", entry);
        institution = getFromXml(_bcol + "Institution", entry);
        thesisType = getFromXml(_bcol + "ThesisType", entry);
        internetSiteTitle = getFromXml(_bcol + "InternetSiteTitle", entry);

        // The access date is assembled as "Month Day, Year". The locals are
        // named so that they no longer shadow the month/day/year fields.
        String monthAccessed = getFromXml(_bcol + "MonthAccessed", entry);
        String dayAccessed = getFromXml(_bcol + "DayAccessed", entry);
        String yearAccessed = getFromXml(_bcol + "YearAccessed", entry);
        StringBuilder accessed = new StringBuilder();
        if (monthAccessed != null) {
            accessed.append(monthAccessed).append(' ');
        }
        if (dayAccessed != null) {
            accessed.append(dayAccessed).append(", ");
        }
        if (yearAccessed != null) {
            accessed.append(yearAccessed);
        }
        dateAccessed = trimTrailingComma(accessed.toString());

        url = getFromXml(_bcol + "URL", entry);
        productionCompany = getFromXml(_bcol + "ProductionCompany", entry);
        publicationTitle = getFromXml(_bcol + "PublicationTitle", entry);
        medium = getFromXml(_bcol + "Medium", entry);
        albumTitle = getFromXml(_bcol + "AlbumTitle", entry);
        recordingNumber = getFromXml(_bcol + "RecordingNumber", entry);
        theater = getFromXml(_bcol + "Theater", entry);
        distributor = getFromXml(_bcol + "Distributor", entry);
        broadcastTitle = getFromXml(_bcol + "BroadcastTitle", entry);
        broadcaster = getFromXml(_bcol + "Broadcaster", entry);
        station = getFromXml(_bcol + "Station", entry);
        type = getFromXml(_bcol + "Type", entry);
        patentNumber = getFromXml(_bcol + "PatentNumber", entry);
        court = getFromXml(_bcol + "Court", entry);
        reporter = getFromXml(_bcol + "Reporter", entry);
        caseNumber = getFromXml(_bcol + "CaseNumber", entry);
        abbreviatedCaseNumber = getFromXml(_bcol + "AbbreviatedCaseNumber", entry);

        bibTex_Series = getFromXml(_bcol + BIBTEX + "Series", entry);
        bibTex_Abstract = getFromXml(_bcol + BIBTEX + "Abstract", entry);
        bibTex_KeyWords = getFromXml(_bcol + BIBTEX + "KeyWords", entry);
        bibTex_CrossRef = getFromXml(_bcol + BIBTEX + "CrossRef", entry);
        bibTex_HowPublished = getFromXml(_bcol + BIBTEX + "HowPublished", entry);
        bibTex_Affiliation = getFromXml(_bcol + BIBTEX + "Affiliation", entry);
        bibTex_Contents = getFromXml(_bcol + BIBTEX + "Contents", entry);
        bibTex_Copyright = getFromXml(_bcol + BIBTEX + "Copyright", entry);
        bibTex_Price = getFromXml(_bcol + BIBTEX + "Price", entry);
        bibTex_Size = getFromXml(_bcol + BIBTEX + "Size", entry);
        // These two are written by the DOM export, so read them back as well.
        bibTex_InType = getFromXml(_bcol + BIBTEX + "InType", entry);
        bibTex_Paper = getFromXml(_bcol + BIBTEX + "Paper", entry);

        NodeList authorContainers = entry.getElementsByTagName(_bcol + "Author");
        if (authorContainers.getLength() > 0) {
            getAuthors((Element) (authorContainers.item(0)), _bcol);
        }
    }

    /**
     * Returns the string form of BibTeX field {@code name}, or
     * {@code fallback} when the entry has no such field.
     */
    private static String fieldOr(BibtexEntry bibtex, String name, String fallback) {
        Object value = bibtex.getField(name);
        return value == null ? fallback : value.toString();
    }

    /** Appends {@code " LABEL: value"} to {@code target} when {@code value} is not null. */
    private static void appendStandardNumber(StringBuilder target, String label, Object value) {
        if (value != null) {
            target.append(' ').append(label).append(": ").append(value.toString());
        }
    }

    /**
     * Fills this entry from a BibTeX entry. A field of this object is only
     * overwritten when the corresponding BibTeX field is present; the
     * exceptions are {@code sourceType} and {@code standardNumber}, which are
     * always recomputed.
     *
     * @param bibtex the BibTeX entry to read
     */
    protected void populateFromBibtex(BibtexEntry bibtex) {
        sourceType = getMSBibSourceType(bibtex);

        tag = fieldOr(bibtex, "bibtexkey", tag);

        Object language = bibtex.getField("language");
        if (language != null) {
            LCID = getLCID(language.toString());
        }

        title = fieldOr(bibtex, "title", title);
        year = fieldOr(bibtex, "year", year);
        month = fieldOr(bibtex, "month", month);
        day = fieldOr(bibtex, MSBIB + "day", day);
        shortTitle = fieldOr(bibtex, MSBIB + "shorttitle", shortTitle);
        comments = fieldOr(bibtex, "note", comments);

        Object pageRange = bibtex.getField("pages");
        if (pageRange != null) {
            pages = new PageNumbers(pageRange.toString());
        }

        volume = fieldOr(bibtex, "volume", volume);
        numberOfVolumes = fieldOr(bibtex, MSBIB + "numberofvolume", numberOfVolumes);
        edition = fieldOr(bibtex, "edition", edition);

        // All identifiers share the single StandardNumber field, each one
        // written as " LABEL: value".
        StringBuilder numbers = new StringBuilder();
        appendStandardNumber(numbers, "ISBN", bibtex.getField("isbn"));
        appendStandardNumber(numbers, "ISSN", bibtex.getField("issn"));
        appendStandardNumber(numbers, "LCCN", bibtex.getField("lccn"));
        appendStandardNumber(numbers, "MRN", bibtex.getField("mrnumber"));
        appendStandardNumber(numbers, "DOI", bibtex.getField("doi"));
        standardNumber = numbers.length() == 0 ? null : numbers.toString();

        publisher = fieldOr(bibtex, "publisher", publisher);
        address = fieldOr(bibtex, "address", address);
        bookTitle = fieldOr(bibtex, "booktitle", bookTitle);
        chapterNumber = fieldOr(bibtex, "chapter", chapterNumber);
        journalName = fieldOr(bibtex, "journal", journalName);
        issue = fieldOr(bibtex, "number", issue);
        periodicalTitle = fieldOr(bibtex, MSBIB + "periodical", periodicalTitle);
        conferenceName = fieldOr(bibtex, "organization", conferenceName);
        department = fieldOr(bibtex, "school", department);
        institution = fieldOr(bibtex, "institution", institution);

        /* SM: 2010.10 Modified for default source types */
        Object explicitType = bibtex.getField("type");
        if (explicitType != null) {
            thesisType = explicitType.toString();
        } else {
            String entryTypeName = bibtex.getType().getName();
            if (entryTypeName.equalsIgnoreCase("techreport")) {
                thesisType = "Tech. rep.";
            } else if (entryTypeName.equalsIgnoreCase("mastersthesis")) {
                thesisType = "Master's thesis";
            } else if (entryTypeName.equalsIgnoreCase("phdthesis")) {
                thesisType = "Ph.D. dissertation";
            } else if (entryTypeName.equalsIgnoreCase("unpublished")) {
                thesisType = "unpublished";
            }
        }

        // The BibTeX title is copied into a source-type specific title field
        // as well, depending on the MS Office source type.
        String entryTitle = fieldOr(bibtex, "title", null);

        if (entryTitle != null
                && (sourceType.equals("InternetSite") || sourceType.equals("DocumentFromInternetSite"))) {
            internetSiteTitle = entryTitle;
        }
        dateAccessed = fieldOr(bibtex, MSBIB + "accessed", dateAccessed);
        url = fieldOr(bibtex, "url", url);
        productionCompany = fieldOr(bibtex, MSBIB + "productioncompany", productionCompany);

        if (entryTitle != null
                && (sourceType.equals("ElectronicSource") || sourceType.equals("Art")
                        || sourceType.equals("Misc"))) {
            publicationTitle = entryTitle;
        }
        medium = fieldOr(bibtex, MSBIB + "medium", medium);
        if (entryTitle != null && sourceType.equals("SoundRecording")) {
            albumTitle = entryTitle;
        }
        recordingNumber = fieldOr(bibtex, MSBIB + "recordingnumber", recordingNumber);
        theater = fieldOr(bibtex, MSBIB + "theater", theater);
        distributor = fieldOr(bibtex, MSBIB + "distributor", distributor);
        if (entryTitle != null && sourceType.equals("Interview")) {
            broadcastTitle = entryTitle;
        }
        broadcaster = fieldOr(bibtex, MSBIB + "broadcaster", broadcaster);
        station = fieldOr(bibtex, MSBIB + "station", station);
        type = fieldOr(bibtex, MSBIB + "type", type);
        patentNumber = fieldOr(bibtex, MSBIB + "patentnumber", patentNumber);
        court = fieldOr(bibtex, MSBIB + "court", court);
        reporter = fieldOr(bibtex, MSBIB + "reporter", reporter);
        caseNumber = fieldOr(bibtex, MSBIB + "casenumber", caseNumber);
        abbreviatedCaseNumber = fieldOr(bibtex, MSBIB + "abbreviatedcasenumber", abbreviatedCaseNumber);

        bibTex_Series = fieldOr(bibtex, "series", bibTex_Series);
        bibTex_Abstract = fieldOr(bibtex, "abstract", bibTex_Abstract);
        bibTex_KeyWords = fieldOr(bibtex, "keywords", bibTex_KeyWords);
        bibTex_CrossRef = fieldOr(bibtex, "crossref", bibTex_CrossRef);
        bibTex_HowPublished = fieldOr(bibtex, "howpublished", bibTex_HowPublished);
        bibTex_Affiliation = fieldOr(bibtex, "affiliation", bibTex_Affiliation);
        bibTex_Contents = fieldOr(bibtex, "contents", bibTex_Contents);
        bibTex_Copyright = fieldOr(bibtex, "copyright", bibTex_Copyright);
        bibTex_Price = fieldOr(bibtex, "price", bibTex_Price);
        bibTex_Size = fieldOr(bibtex, "size", bibTex_Size);
        /* SM: 2010.10 intype, paper support */
        bibTex_InType = fieldOr(bibtex, "intype", bibTex_InType);
        bibTex_Paper = fieldOr(bibtex, "paper", bibTex_Paper);

        Object authorField = bibtex.getField("author");
        if (authorField != null) {
            authors = getAuthors(authorField.toString());
        }
        Object editorField = bibtex.getField("editor");
        if (editorField != null) {
            editors = getAuthors(editorField.toString());
        }

        if (FORMATXML) {
            // Escaping of the other text fields (shortTitle, publisher,
            // conferenceName, ...) was commented out by the original author.
            title = format(title);
            bibTex_Abstract = format(bibTex_Abstract);
        }
    }

    /**
     * Runs {@code value} through the {@link XMLChars} layout formatter.
     *
     * @param value text to format, may be {@code null}
     * @return the formatted text, or {@code null} for a {@code null} input
     */
    private String format(String value) {
        if (value == null) {
            return null;
        }
        LayoutFormatter chars = new XMLChars();
        return chars.format(value);
    }

    // http://www.microsoft.com/globaldev/reference/lcid-all.mspx
    /**
     * Maps a BibTeX language name to a Windows locale id. The mapping is not
     * implemented yet: every language yields 0.
     *
     * @param language the BibTeX language value (currently ignored)
     * @return always 0
     */
    protected int getLCID(String language) {
        // TODO: add language to LCID mapping
        int iLCID = 0;
        return iLCID;
    }

    //
http://www.microsoft.com/globaldev/reference/lcid-all.mspx protected String getLanguage(int LCID) { String language = "english"; // TODO: add lanaguage to LCID mapping return language; } protected List<PersonName> getSpecificAuthors(String type, Element authors, String _bcol) { List<PersonName> result = null; NodeList nodeLst = authors.getElementsByTagName(_bcol+type); if(nodeLst.getLength()<=0) return result; nodeLst = ((Element)(nodeLst.item(0))).getElementsByTagName(_bcol+"NameList"); if(nodeLst.getLength()<=0) return result; NodeList person = ((Element)(nodeLst.item(0))).getElementsByTagName(_bcol+"Person"); if(person.getLength()<=0) return result; result = new LinkedList<PersonName>(); for(int i=0;i<person.getLength();i++) { NodeList firstName = ((Element)(person.item(i))).getElementsByTagName(_bcol+"First"); NodeList lastName = ((Element)(person.item(i))).getElementsByTagName(_bcol+"Last"); NodeList middleName = ((Element)(person.item(i))).getElementsByTagName(_bcol+"Middle"); PersonName name = new PersonName(); if(firstName.getLength()>0) name.setFirstname(firstName.item(0).getTextContent()); if(middleName.getLength()>0) name.setMiddlename(middleName.item(0).getTextContent()); if(lastName.getLength()>0) name.setSurname(lastName.item(0).getTextContent()); result.add(name); } return result; } protected void getAuthors(Element authorsElem, String _bcol) { authors = getSpecificAuthors("Author",authorsElem,_bcol); bookAuthors = getSpecificAuthors("BookAuthor",authorsElem,_bcol); editors = getSpecificAuthors("Editor",authorsElem,_bcol); translators = getSpecificAuthors("Translator",authorsElem,_bcol); producerNames = getSpecificAuthors("ProducerName",authorsElem,_bcol); composers = getSpecificAuthors("Composer",authorsElem,_bcol); conductors = getSpecificAuthors("Conductor",authorsElem,_bcol); performers = getSpecificAuthors("Performer",authorsElem,_bcol); writers = getSpecificAuthors("Writer",authorsElem,_bcol); directors = 
getSpecificAuthors("Director",authorsElem,_bcol); compilers = getSpecificAuthors("Compiler",authorsElem,_bcol); interviewers = getSpecificAuthors("Interviewer",authorsElem,_bcol); interviewees = getSpecificAuthors("Interviewee",authorsElem,_bcol); inventors = getSpecificAuthors("Inventor",authorsElem,_bcol); counsels = getSpecificAuthors("Counsel",authorsElem,_bcol); } protected List<PersonName> getAuthors(String authors) { List<PersonName> result = new LinkedList<PersonName>(); if (authors.indexOf(" and ") == -1) { result.add(new PersonName(authors)); } else { String[] names = authors.split(" and "); for (int i=0; i<names.length; i++) { result.add(new PersonName(names[i])); } } return result; } /* construct a MSBib date object */ protected String getDate(BibtexEntry bibtex) { String result = ""; if (bibtex.getField("year") != null) result += (bibtex.getField("year").toString()); if (bibtex.getField("month") != null) result += "-" + bibtex.getField("month").toString(); return result; } protected String getMSBibSourceType(BibtexEntry bibtex) { String bibtexType = bibtex.getType().getName(); String result = "Misc"; if (bibtexType.equalsIgnoreCase("book")) result = "Book"; else if(bibtexType.equalsIgnoreCase("inbook")) { result = "BookSection"; bibTexEntry = "inbook"; } /* SM 2010.10: generalized */ else if(bibtexType.equalsIgnoreCase("booklet")) { result = "BookSection"; bibTexEntry = "booklet"; } else if(bibtexType.equalsIgnoreCase("incollection")) { result = "BookSection"; bibTexEntry = "incollection"; } else if(bibtexType.equalsIgnoreCase("article")) result = "JournalArticle"; else if(bibtexType.equalsIgnoreCase("inproceedings")) { result = "ConferenceProceedings"; bibTexEntry = "inproceedings"; } /* SM 2010.10: generalized */ else if(bibtexType.equalsIgnoreCase("conference")) { result = "ConferenceProceedings"; bibTexEntry = "conference"; } else if(bibtexType.equalsIgnoreCase("proceedings")) { result = "ConferenceProceedings"; bibTexEntry = "proceedings"; } else 
if(bibtexType.equalsIgnoreCase("collection")) { result = "ConferenceProceedings"; bibTexEntry = "collection"; } else if(bibtexType.equalsIgnoreCase("techreport")) { result = "Report"; bibTexEntry = "techreport"; } /* SM 2010.10: generalized */ else if(bibtexType.equalsIgnoreCase("manual")) { result = "Report"; bibTexEntry = "manual"; } else if(bibtexType.equalsIgnoreCase("mastersthesis")) { result = "Report"; bibTexEntry = "mastersthesis"; } else if(bibtexType.equalsIgnoreCase("phdthesis")) { result = "Report"; bibTexEntry = "phdthesis"; } else if(bibtexType.equalsIgnoreCase("unpublished")) { result = "Report"; bibTexEntry = "unpublished"; } else if(bibtexType.equalsIgnoreCase("patent")) result = "Patent"; else if(bibtexType.equalsIgnoreCase("misc")) result = "Misc"; /*SM: 2010.10 - bibtex @electronic */ else if(bibtexType.equalsIgnoreCase("electronic")) { result = "Misc"; bibTexEntry = "electronic"; } return result; } public Node getDOMrepresentation() { Node result = null; try { DocumentBuilder d = DocumentBuilderFactory.newInstance().newDocumentBuilder(); result = getDOMrepresentation(d.newDocument()); } catch (Exception e) { throw new Error(e); } return result; } public void addField(Document d,Element parent, String name, String value) { if(value == null) return; Element elem = d.createElement(bcol+name); // elem.appendChild(d.createTextNode(healXML(value))); // Text txt = d.createTextNode(value); // if(!txt.getTextContent().equals(value)) // System.out.println("Values dont match!"); // // throw new Exception("Values dont match!"); // elem.appendChild(txt); elem.appendChild(d.createTextNode(stripNonValidXMLCharacters(value))); parent.appendChild(elem); } public void addAuthor(Document d, Element allAuthors, String entryName, List<PersonName> authorsLst) { if(authorsLst == null) return; Element authorTop = d.createElement(bcol+entryName); Element nameList = d.createElement(bcol+"NameList"); for(Iterator<PersonName> iter = authorsLst.iterator(); iter.hasNext();) 
{ PersonName name = iter.next(); Element person = d.createElement(bcol+"Person"); addField(d, person,"Last",name.getSurname()); addField(d, person,"Middle",name.getMiddlename()); addField(d, person,"First",name.getFirstname()); nameList.appendChild(person); } authorTop.appendChild(nameList); allAuthors.appendChild(authorTop); } public void addAdrress(Document d,Element parent, String address) { if(address == null) return; // US address parser // See documentation here http://regexlib.com/REDetails.aspx?regexp_id=472 // Pattern p = Pattern.compile("^(?n:(((?<address1>(\\d{1,5}(\\ 1\\/[234])?(\\x20[A-Z]([a-z])+)+ )|(P\\.O\\.\\ Box\\ \\d{1,5}))\\s{1,2}(?i:(?<address2>(((APT|B LDG|DEPT|FL|HNGR|LOT|PIER|RM|S(LIP|PC|T(E|OP))|TRLR|UNIT)\\x20\\w{1,5})|(BSMT|FRNT|LBBY|LOWR|OFC|PH|REAR|SIDE|UPPR)\\.?)\\s{1,2})?))?)(?<city>[A-Z]([a-z])+(\\.?)(\\x20[A-Z]([a-z])+){0,2})([,\\x20]+?)(?<state>A[LKSZRAP]|C[AOT]|D[EC]|F[LM]|G[AU]|HI|I[ADL N]|K[SY]|LA|M[ADEHINOPST]|N[CDEHJMVY]|O[HKR]|P[ARW]|RI|S[CD] |T[NX]|UT|V[AIT]|W[AIVY])([,\\x20]+?)(?<zipcode>(?!0{5})\\d{5}(-\\d {4})?)((([,\\x20]+?)(?<country>[A-Z]([a-z])+(\\.?)(\\x20[A-Z]([a-z])+){0,2}))?))$"); // the pattern above is for C#, may not work with java. Never tested though. // reduced subset, supports only "CITY , STATE, COUNTRY" // \b(\w+)\s?[,]?\s?(\w+)\s?[,]?\s?(\w+)\b // WORD SPACE , SPACE WORD SPACE , SPACE WORD // tested using http://www.javaregex.com/test.html Pattern p = Pattern.compile("\\b(\\w+)\\s*[,]?\\s*(\\w+)\\s*[,]?\\s*(\\w+)\\b"); Matcher m = p.matcher(address); if (m.matches() && m.groupCount()>3) { addField(d, parent,"City",m.group(1)); addField(d, parent,"StateProvince",m.group(2)); addField(d, parent,"CountryRegion",m.group(3)); } else /* SM: 2010.10 generalized */ addField(d, parent,"City",address); } public void addDate(Document d,Element parent, String date, String extra) { if(date == null) return; // Allows 20.3-2007|||20/3- 2007 etc. 
// (\d{1,2})\s?[.,-/]\s?(\d{1,2})\s?[.,-/]\s?(\d{2,4}) // 1-2 DIGITS SPACE SEPERATOR SPACE 1-2 DIGITS SPACE SEPERATOR SPACE 2-4 DIGITS // tested using http://www.javaregex.com/test.html Pattern p = Pattern.compile("(\\d{1,2})\\s*[.,-/]\\s*(\\d{1,2})\\s*[.,-/]\\s*(\\d{2,4})"); Matcher m = p.matcher(date); if (m.matches() && m.groupCount()>3) { addField(d, parent,"Month"+extra,m.group(1)); addField(d, parent,"Day"+extra,m.group(2)); addField(d, parent,"Year"+extra,m.group(3)); } } public Element getDOMrepresentation(Document d) { try { Element msbibEntry = d.createElement(bcol+"Source"); addField(d,msbibEntry,"SourceType",sourceType); addField(d,msbibEntry,BIBTEX+"Entry",bibTexEntry); addField(d,msbibEntry,"Tag",tag); addField(d,msbibEntry,"GUID",GUID); if(LCID >= 0) addField(d,msbibEntry,"LCID",Integer.toString(LCID)); addField(d,msbibEntry,"Title",title); addField(d,msbibEntry,"Year",year); addField(d,msbibEntry,"ShortTitle",shortTitle); addField(d,msbibEntry,"Comments",comments); Element allAuthors = d.createElement(bcol+"Author"); addAuthor(d,allAuthors,"Author",authors); addAuthor(d,allAuthors,"BookAuthor",bookAuthors); addAuthor(d,allAuthors,"Editor",editors); addAuthor(d,allAuthors,"Translator",translators); addAuthor(d,allAuthors,"ProducerName",producerNames); addAuthor(d,allAuthors,"Composer",composers); addAuthor(d,allAuthors,"Conductor",conductors); addAuthor(d,allAuthors,"Performer",performers); addAuthor(d,allAuthors,"Writer",writers); addAuthor(d,allAuthors,"Director",directors); addAuthor(d,allAuthors,"Compiler",compilers); addAuthor(d,allAuthors,"Interviewer",interviewers); addAuthor(d,allAuthors,"Interviewee",interviewees); addAuthor(d,allAuthors,"Inventor",inventors); addAuthor(d,allAuthors,"Counsel",counsels); msbibEntry.appendChild(allAuthors); if(pages !=null ) addField(d,msbibEntry,"Pages",pages.toString("-")); addField(d,msbibEntry,"Volume",volume); addField(d,msbibEntry,"NumberVolumes",numberOfVolumes); 
addField(d,msbibEntry,"Edition",edition); addField(d,msbibEntry,"StandardNumber",standardNumber); addField(d,msbibEntry,"Publisher",publisher); addAdrress(d,msbibEntry,address); addField(d,msbibEntry,"BookTitle",bookTitle); addField(d,msbibEntry,"ChapterNumber",chapterNumber); addField(d,msbibEntry,"JournalName",journalName); addField(d,msbibEntry,"Issue",issue); addField(d,msbibEntry,"PeriodicalTitle",periodicalTitle); addField(d,msbibEntry,"ConferenceName",conferenceName); addField(d,msbibEntry,"Department",department); addField(d,msbibEntry,"Institution",institution); addField(d,msbibEntry,"ThesisType",thesisType); addField(d,msbibEntry,"InternetSiteTitle",internetSiteTitle); addDate(d,msbibEntry, dateAccessed, "Accessed"); /* SM 2010.10 added month export */ addField(d,msbibEntry,"Month",month); addField(d,msbibEntry,"URL",url); addField(d,msbibEntry,"ProductionCompany",productionCompany); addField(d,msbibEntry,"PublicationTitle",publicationTitle); addField(d,msbibEntry,"Medium",medium); addField(d,msbibEntry,"AlbumTitle",albumTitle); addField(d,msbibEntry,"RecordingNumber",recordingNumber); addField(d,msbibEntry,"Theater",theater); addField(d,msbibEntry,"Distributor",distributor); addField(d,msbibEntry,"BroadcastTitle",broadcastTitle); addField(d,msbibEntry,"Broadcaster",broadcaster); addField(d,msbibEntry,"Station",station); addField(d,msbibEntry,"Type",type); addField(d,msbibEntry,"PatentNumber",patentNumber); addField(d,msbibEntry,"Court",court); addField(d,msbibEntry,"Reporter",reporter); addField(d,msbibEntry,"CaseNumber",caseNumber); addField(d,msbibEntry,"AbbreviatedCaseNumber",abbreviatedCaseNumber); addField(d,msbibEntry,BIBTEX+"Series",bibTex_Series); addField(d,msbibEntry,BIBTEX+"Abstract",bibTex_Abstract); addField(d,msbibEntry,BIBTEX+"KeyWords",bibTex_KeyWords); addField(d,msbibEntry,BIBTEX+"CrossRef",bibTex_CrossRef); addField(d,msbibEntry,BIBTEX+"HowPublished",bibTex_HowPublished); addField(d,msbibEntry,BIBTEX+"Affiliation",bibTex_Affiliation); 
addField(d,msbibEntry,BIBTEX+"Contents",bibTex_Contents); addField(d,msbibEntry,BIBTEX+"Copyright",bibTex_Copyright); addField(d,msbibEntry,BIBTEX+"Price",bibTex_Price); addField(d,msbibEntry,BIBTEX+"Size",bibTex_Size); /* SM: 2010.10 end intype, paper support */ addField(d,msbibEntry,BIBTEX+"InType",bibTex_InType); addField(d,msbibEntry,BIBTEX+"Paper",bibTex_Paper); return msbibEntry; } catch (Exception e) { System.out.println("Exception caught..." + e); e.printStackTrace(); throw new Error(e); } // return null; } protected void parseSingleStandardNumber(String type,String bibtype, String standardNum, HashMap<String, String> hm) { // tested using http://www.javaregex.com/test.html Pattern p = Pattern.compile(":"+type+":(.[^:]+)"); Matcher m = p.matcher(standardNum); if (m.matches()) hm.put(bibtype,m.group(1)); } protected void parseStandardNumber(String standardNum, HashMap<String, String> hm) { if(standardNumber == null) return; parseSingleStandardNumber("ISBN","isbn",standardNum,hm); /* SM: 2010.10: lower case */ parseSingleStandardNumber("ISSN","issn",standardNum,hm); /* SM: 2010.10: lower case */ parseSingleStandardNumber("LCCN","lccn",standardNum,hm); /* SM: 2010.10: lower case */ parseSingleStandardNumber("MRN","mrnumber",standardNum,hm); /* SM: 2010.10 begin DOI support */ parseSingleStandardNumber("DOI","doi",standardNum,hm); /* SM: 2010.10 end DOI support */ } public void addAuthor(HashMap<String, String> hm, String type, List<PersonName> authorsLst) { if(authorsLst == null) return; String allAuthors = ""; boolean First = true; for(Iterator<PersonName> iter = authorsLst.iterator(); iter.hasNext();) { PersonName name = iter.next(); if(First == false) allAuthors += " and "; allAuthors += name.getFullname(); First = false; } hm.put(type,allAuthors); } // public String mapMSBibToBibtexTypeString(String msbib) { // String bibtex = "other"; // if(msbib.equals("Book")) // bibtex = "book"; // else if(msbib.equals("BookSection")) // bibtex = "inbook"; // else 
if(msbib.equals("JournalArticle")) // bibtex = "article"; // else if(msbib.equals("ArticleInAPeriodical")) // bibtex = "article"; // else if(msbib.equals("ConferenceProceedings")) // bibtex = "conference"; // else if(msbib.equals("Report")) // bibtex = "techreport"; // else if(msbib.equals("InternetSite")) // bibtex = "other"; // else if(msbib.equals("DocumentFromInternetSite")) // bibtex = "other"; // else if(msbib.equals("DocumentFromInternetSite")) // bibtex = "other"; // else if(msbib.equals("ElectronicSource")) // bibtex = "other"; // else if(msbib.equals("Art")) // bibtex = "other"; // else if(msbib.equals("SoundRecording")) // bibtex = "other"; // else if(msbib.equals("Performance")) // bibtex = "other"; // else if(msbib.equals("Film")) // bibtex = "other"; // else if(msbib.equals("Interview")) // bibtex = "other"; // else if(msbib.equals("Patent")) // bibtex = "other"; // else if(msbib.equals("Case")) // bibtex = "other"; // else if(msbib.equals("Misc")) // bibtex = "misc"; // else // bibtex = "misc"; // // return bibtex; // } public BibtexEntryType mapMSBibToBibtexType(String msbib) { BibtexEntryType bibtex = BibtexEntryType.OTHER; if(msbib.equals("Book")) bibtex = BibtexEntryType.BOOK; else if(msbib.equals("BookSection")) bibtex = BibtexEntryType.INBOOK; else if(msbib.equals("JournalArticle")) bibtex = BibtexEntryType.ARTICLE; else if(msbib.equals("ArticleInAPeriodical")) bibtex = BibtexEntryType.ARTICLE; else if(msbib.equals("ConferenceProceedings")) bibtex = BibtexEntryType.CONFERENCE; else if(msbib.equals("Report")) bibtex = BibtexEntryType.TECHREPORT; else if(msbib.equals("InternetSite")) bibtex = BibtexEntryType.OTHER; else if(msbib.equals("DocumentFromInternetSite")) bibtex = BibtexEntryType.OTHER; else if(msbib.equals("DocumentFromInternetSite")) bibtex = BibtexEntryType.OTHER; else if(msbib.equals("ElectronicSource")) bibtex = BibtexEntryType.OTHER; else if(msbib.equals("Art")) bibtex = BibtexEntryType.OTHER; else 
if(msbib.equals("SoundRecording")) bibtex = BibtexEntryType.OTHER; else if(msbib.equals("Performance")) bibtex = BibtexEntryType.OTHER; else if(msbib.equals("Film")) bibtex = BibtexEntryType.OTHER; else if(msbib.equals("Interview")) bibtex = BibtexEntryType.OTHER; else if(msbib.equals("Patent")) bibtex = BibtexEntryType.OTHER; else if(msbib.equals("Case")) bibtex = BibtexEntryType.OTHER; else if(msbib.equals("Misc")) bibtex = BibtexEntryType.MISC; else bibtex = BibtexEntryType.MISC; return bibtex; } public BibtexEntry getBibtexRepresentation() { // BibtexEntry entry = new BibtexEntry(BibtexFields.DEFAULT_BIBTEXENTRY_ID, // Globals.getEntryType(mapMSBibToBibtexTypeString(sourceType))); // BibtexEntry entry = new BibtexEntry(BibtexFields.DEFAULT_BIBTEXENTRY_ID, // mapMSBibToBibtexType(sourceType)); BibtexEntry entry = null; if(tag == null) entry = new BibtexEntry(BibtexFields.DEFAULT_BIBTEXENTRY_ID, mapMSBibToBibtexType(sourceType)); else entry = new BibtexEntry(tag, mapMSBibToBibtexType(sourceType)); // id assumes an existing database so don't // Todo: add check for BibTexEntry types // BibtexEntry entry = new BibtexEntry(); // if(sourceType.equals("Book")) // entry.setType(BibtexEntryType.BOOK); // else if(sourceType.equals("BookSection")) // entry.setType(BibtexEntryType.INBOOK); // else if(sourceType.equals("JournalArticle")) // entry.setType(BibtexEntryType.ARTICLE); // else if(sourceType.equals("ArticleInAPeriodical")) // entry.setType(BibtexEntryType.ARTICLE); // else if(sourceType.equals("ConferenceProceedings")) // entry.setType(BibtexEntryType.CONFERENCE); // else if(sourceType.equals("Report")) // entry.setType(BibtexEntryType.TECHREPORT); // else if(sourceType.equals("InternetSite")) // entry.setType(BibtexEntryType.OTHER); // else if(sourceType.equals("DocumentFromInternetSite")) // entry.setType(BibtexEntryType.OTHER); // else if(sourceType.equals("DocumentFromInternetSite")) // entry.setType(BibtexEntryType.OTHER); // else 
if(sourceType.equals("ElectronicSource")) // entry.setType(BibtexEntryType.OTHER); // else if(sourceType.equals("Art")) // entry.setType(BibtexEntryType.OTHER); // else if(sourceType.equals("SoundRecording")) // entry.setType(BibtexEntryType.OTHER); // else if(sourceType.equals("Performance")) // entry.setType(BibtexEntryType.OTHER); // else if(sourceType.equals("Film")) // entry.setType(BibtexEntryType.OTHER); // else if(sourceType.equals("Interview")) // entry.setType(BibtexEntryType.OTHER); // else if(sourceType.equals("Patent")) // entry.setType(BibtexEntryType.OTHER); // else if(sourceType.equals("Case")) // entry.setType(BibtexEntryType.OTHER); // else if(sourceType.equals("Misc")) // entry.setType(BibtexEntryType.MISC); // else // entry.setType(BibtexEntryType.MISC); HashMap<String, String> hm = new HashMap<String, String>(); if(tag != null) hm.put("bibtexkey",tag); // if(GUID != null) // hm.put("GUID",GUID); if(LCID >= 0) hm.put("language",getLanguage(LCID)); if(title != null) hm.put("title",title); if(year != null) hm.put("year",year); if(shortTitle != null) hm.put(MSBIB+"shorttitle",shortTitle); if(comments != null) hm.put("note",comments); addAuthor(hm,"author",authors); addAuthor(hm,MSBIB+"bookauthor",bookAuthors); addAuthor(hm,"editor",editors); addAuthor(hm,MSBIB+"translator",translators); addAuthor(hm,MSBIB+"producername",producerNames); addAuthor(hm,MSBIB+"composer",composers); addAuthor(hm,MSBIB+"conductor",conductors); addAuthor(hm,MSBIB+"performer",performers); addAuthor(hm,MSBIB+"writer",writers); addAuthor(hm,MSBIB+"director",directors); addAuthor(hm,MSBIB+"compiler",compilers); addAuthor(hm,MSBIB+"interviewer",interviewers); addAuthor(hm,MSBIB+"interviewee",interviewees); addAuthor(hm,MSBIB+"inventor",inventors); addAuthor(hm,MSBIB+"counsel",counsels); if(pages !=null ) hm.put("pages",pages.toString("--")); if(volume !=null ) hm.put("volume",volume); if(numberOfVolumes !=null ) hm.put(MSBIB+"numberofvolume",numberOfVolumes); if(edition !=null 
) hm.put("edition",edition); if(edition !=null ) hm.put("edition",edition); parseStandardNumber(standardNumber,hm); if(publisher !=null ) hm.put("publisher",publisher); if(publisher !=null ) hm.put("publisher",publisher); if(address !=null ) hm.put("address",address); if(bookTitle !=null ) hm.put("booktitle",bookTitle); if(chapterNumber !=null ) hm.put("chapter",chapterNumber); if(journalName !=null ) hm.put("journal",journalName); if(issue !=null ) hm.put("number",issue); if(periodicalTitle !=null ) hm.put("organization",periodicalTitle); if(conferenceName !=null ) hm.put("organization",conferenceName); if(department !=null ) hm.put("school",department); if(institution !=null ) hm.put("institution",institution); // if(thesisType !=null ) // hm.put("type",thesisType); // if(internetSiteTitle !=null ) // hm.put("title",internetSiteTitle); if(dateAccessed !=null ) hm.put(MSBIB+"accessed",dateAccessed); if(url !=null ) hm.put("url",url); if(productionCompany !=null ) hm.put(MSBIB+"productioncompany",productionCompany); // if(publicationTitle !=null ) // hm.put("title",publicationTitle); if(medium !=null ) hm.put(MSBIB+"medium",medium); // if(albumTitle !=null ) // hm.put("title",albumTitle); if(recordingNumber !=null ) hm.put(MSBIB+"recordingnumber",recordingNumber); if(theater !=null ) hm.put(MSBIB+"theater",theater); if(distributor !=null ) hm.put(MSBIB+"distributor",distributor); // if(broadcastTitle !=null ) // hm.put("title",broadcastTitle); if(broadcaster !=null ) hm.put(MSBIB+"broadcaster",broadcaster); if(station !=null ) hm.put(MSBIB+"station",station); if(type !=null ) hm.put(MSBIB+"type",type); if(patentNumber !=null ) hm.put(MSBIB+"patentnumber",patentNumber); if(court !=null ) hm.put(MSBIB+"court",court); if(reporter !=null ) hm.put(MSBIB+"reporter",reporter); if(caseNumber !=null ) hm.put(MSBIB+"casenumber",caseNumber); if(abbreviatedCaseNumber !=null ) hm.put(MSBIB+"abbreviatedcasenumber",abbreviatedCaseNumber); if(bibTex_Series !=null ) 
hm.put("series",bibTex_Series); if(bibTex_Abstract !=null ) hm.put("abstract",bibTex_Abstract); if(bibTex_KeyWords !=null ) hm.put("keywords",bibTex_KeyWords); if(bibTex_CrossRef !=null ) hm.put("crossref",bibTex_CrossRef); if(bibTex_HowPublished !=null ) hm.put("howpublished",bibTex_HowPublished); if(bibTex_Affiliation !=null ) hm.put("affiliation",bibTex_Affiliation); if(bibTex_Contents !=null ) hm.put("contents",bibTex_Contents); if(bibTex_Copyright !=null ) hm.put("copyright",bibTex_Copyright); if(bibTex_Price !=null ) hm.put("price",bibTex_Price); if(bibTex_Size !=null ) hm.put("size",bibTex_Size); entry.setField(hm); return entry; } /** * This method ensures that the output String has only * valid XML unicode characters as specified by the * XML 1.0 standard. For reference, please see * <a href="http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char">the * standard</a>. This method will return an empty * String if the input is null or empty. * * URL: http://cse-mjmcl.cse.bris.ac.uk/blog/2007/02/14/1171465494443.html * * @param in The String whose non-valid characters we want to remove. * @return The in String, stripped of non-valid characters. */ public String stripNonValidXMLCharacters(String in) { StringBuffer out = new StringBuffer(); // Used to hold the output. char current; // Used to reference the current character. if (in == null || ("".equals(in))) return ""; // vacancy test. for (int i = 0; i < in.length(); i++) { current = in.charAt(i); // NOTE: No IndexOutOfBoundsException caught here; it should not happen. if ((current == 0x9) || (current == 0xA) || (current == 0xD) || ((current >= 0x20) && (current <= 0xD7FF)) || ((current >= 0xE000) && (current <= 0xFFFD)) || ((current >= 0x10000) && (current <= 0x10FFFF))) out.append(current); } return out.toString(); } /* * render as XML * * TODO This is untested. 
*/ public String toString() { StringWriter sresult = new StringWriter(); try { DOMSource source = new DOMSource(getDOMrepresentation()); StreamResult result = new StreamResult(sresult); Transformer trans = TransformerFactory.newInstance().newTransformer(); trans.setOutputProperty(OutputKeys.INDENT, "yes"); trans.transform(source, result); } catch (Exception e) { throw new Error(e); } return sresult.toString(); } }