/*
 * This file is part of NixNote 
 * Copyright 2009 Randy Baumgarte
 * 
 * This file may be licensed under the terms of of the
 * GNU General Public License Version 2 (the ``GPL'').
 *
 * Software distributed under the License is distributed
 * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
 * express or implied. See the GPL for the specific language
 * governing rights and limitations.
 *
 * You should have received a copy of the GPL along with this
 * program. If not, go to http://www.gnu.org/licenses/gpl.html
 * or write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
*/
package cx.fbn.nevernote.evernote;

//**********************************************
//**********************************************
//* This is used to turn HTML into ENML compliant
//* data.
//**********************************************
//**********************************************

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.w3c.tidy.Tidy;
import org.w3c.tidy.TidyMessage;

import com.trolltech.qt.core.QByteArray;
import com.trolltech.qt.core.QTextCodec;

import cx.fbn.nevernote.Global;
import cx.fbn.nevernote.utilities.ApplicationLogger;
import cx.fbn.nevernote.utilities.Pair;
import cx.fbn.nevernote.xml.XMLCleanup;
import cx.fbn.nevernote.xml.XMLNoteRepair;

/**
 * Converts the HTML held by the note editor into ENML and offers a few
 * string-level repair helpers for markup that is not well-formed XML.
 */
public class EnmlConverter {
    /** DOCTYPE declaration pointing at Evernote's hosted DTD (single-quoted form). */
    private static final String REMOTE_DOCTYPE =
        "<!DOCTYPE en-note SYSTEM 'http://xml.evernote.com/pub/enml2.dtd'>";

    private final ApplicationLogger logger;
    /** Resources reported by the cleanup pass of the most recent {@link #convert} call. */
    private List<String> resources;
    /** Copied from the repair pass at the end of {@link #convert}. */
    public boolean saveInvalidXML;

    /**
     * Receives JTidy messages, logs them, and remembers whether any message
     * was reported at ERROR level.
     */
    private class TidyListener implements org.w3c.tidy.TidyMessageListener {

        private final ApplicationLogger logger;
        /** Set to true once an ERROR-level message has been received. */
        public boolean errorFound;

        public TidyListener(ApplicationLogger logger) {
            this.logger = logger;
            errorFound = false;
        }

        @Override
        public void messageReceived(TidyMessage msg) {
            if (msg.getLevel() == TidyMessage.Level.ERROR) {
                logger.log(logger.LOW, "******* JTIDY ERORR *******");
                logger.log(logger.LOW, "Error Code: " +msg.getErrorCode());
                // The column used to be logged twice; once is enough.
                logger.log(logger.LOW, "Column: " +msg.getColumn());
                logger.log(logger.LOW, "Line: " +msg.getLine());
                logger.log(logger.LOW, "Message: " +msg.getMessage());
                logger.log(logger.LOW, "***************************");
                errorFound = true;
            } else {
                logger.log(logger.EXTREME, "JTidy Results: "+msg.getMessage());
            }
        }
    }

    /**
     * @param l logger used for all diagnostics emitted by this converter
     */
    public EnmlConverter(ApplicationLogger l) {
        logger = l;
        saveInvalidXML = false;
        resources = new ArrayList<String>();
    }

    /**
     * @return the resource list; empty until {@link #convert} has replaced it
     *         with the list produced by the cleanup pass
     */
    public List<String> getResources() {
        return resources;
    }

    /**
     * Turns editor HTML into an ENML document.
     *
     * The content is wrapped in an XML header and en-note element, run through
     * JTidy (falling back to XMLNoteRepair when JTidy reports an error or
     * produces nothing), cleaned with XMLCleanup and finally validated against
     * a local copy of the ENML DTD.
     *
     * @param noteGuid not used by this method
     * @param content  the HTML (or ENML) text to convert; must not be null
     * @return the converted document, or null when the fallback repair could
     *         not parse the content
     */
    public String convert(String noteGuid, String content) {
        logger.log(logger.HIGH, "Entering DBRunner.convertToEnml");
        logger.log(logger.EXTREME, "Note Text:" +content);

        // Replace the en-note tags with body tags in case we came from
        // someplace other than the editor (for example, if we are merging notes).
        content = content.replace("<en-note>", "<body>");
        content = content.replace("</en-note>", "</body>");

        // Start removing stuff we don't need or want: everything from the
        // last </body> onwards and everything in front of <body.
        int br = content.lastIndexOf("</body>");
        if (br > 0)
            content = content.substring(0, br);
        String newContent;
        int k = content.indexOf("<body");
        if (k > -1)
            newContent = content.substring(k);
        else
            newContent = "<body>" + content;

        // Check that we have a valid header.  Normally we should not
        // but sometimes it seems that we can.  It is easy enough to check.
        if (!newContent.startsWith("<?xml"))
            newContent = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
                +"<!DOCTYPE en-note SYSTEM \"http://xml.evernote.com/pub/enml2.dtd\">\n"
                +newContent
                +"</body>";

        // Fix the more common XML problems that Webkit creates, but are not
        // considered valid XML.
        newContent = fixStupidXMLProblems(newContent);

        // Change the contents to have enml instead of body tags or
        // we'll fail validation later.
        newContent = newContent.replace("<body", "<en-note");
        newContent = newContent.replace("</body>", "</en-note>");

        // First pass through the data.  The goal of this pass is to
        // validate that we have a good XML document and to repair
        // any problems found.
        XMLNoteRepair repair = new XMLNoteRepair();

        logger.log(logger.HIGH, "Fixing encryption tags");
        newContent = fixEncryptionTags(newContent);

        Tidy tidy = new Tidy();
        TidyListener tidyListener = new TidyListener(logger);
        tidy.setMessageListener(tidyListener);
        tidy.getStderr().close();  // the listener will capture messages
        tidy.setXmlTags(true);
        tidy.setXHTML(true);

        QTextCodec codec = QTextCodec.codecForName("UTF-8");
        QByteArray unicode = codec.fromUnicode(newContent);

        logger.log(logger.HIGH, "Starting JTidy check");
        logger.log(logger.EXTREME, "Start of JTidy Input");
        logger.log(logger.EXTREME, newContent);
        logger.log(logger.EXTREME, "End Of JTidy Input");
        ByteArrayInputStream is = new ByteArrayInputStream(unicode.toByteArray());
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        tidy.setInputEncoding("UTF-8");
        tidy.parse(is, os);
        // NOTE(review): toString() decodes with the platform default charset;
        // confirm this matches the output encoding JTidy is using.
        String tidyContent = os.toString();
        if (tidyListener.errorFound) {
            logger.log(logger.LOW, "Note Contents Begin");
            logger.log(logger.LOW, content);
            logger.log(logger.LOW, "Note Contents End");
            tidyContent = null;
        } else if (tidyContent.trim().equals("")) {
            // JTidy produced nothing usable.  (This used to test newContent,
            // which always carries the XML header here and so is never empty.)
            tidyContent = null;
        }

        if (tidyContent != null) {
            newContent = tidyContent;
        } else {
            // Houston, we've had a problem.  Fall back to old method
            logger.log(logger.HIGH, "Error converting to JTidy.  Falling back to old method");
            String repairedContent = repair.parse(newContent, false);
            if (repairedContent == null) {
                // The XML is beyond repair.  We are done here.
                logger.log(logger.EXTREME, "Null returned from repair.parse()");
                logger.log(logger.LOW, "Parse error when converting to ENML. Aborting save");
                return null;
            }
            newContent = repairedContent;
            logger.log(logger.EXTREME, "Start of repaired content");
            logger.log(logger.EXTREME, repairedContent);
            logger.log(logger.EXTREME, "End of repaired content");
        }

        // Second pass through the data.  The goal of this pass is to
        // remove any things we added in NixNote that do not match
        // the ENML schema
        XMLCleanup v = new XMLCleanup();
        v.setValue(newContent);
        logger.log(logger.HIGH, "Beginning ENML Cleanup");
        v.validate();
        logger.log(logger.HIGH, "Cleanup complete.");

        // Final pass through the data.  In this one we
        // remove any invalid attributes and save the
        // new resources.
        logger.log(logger.EXTREME, "Rebuilt ENML:");
        logger.log(logger.EXTREME, v.getValue());
        logger.log(logger.EXTREME, "End Of Rebuilt ENML:");
        resources = v.getResources();

        // The XML has the dtd to validate set against Evernote's web
        // address.  We change it to a local one because otherwise it would
        // fail if the user doesn't have internet connectivity.  The local copy
        // also contains the 3 other PUBLIC definitions at the beginning of the dtd.
        newContent = v.getValue();
        File dtdFile = Global.getFileManager().getXMLDirFile("enml2.dtd");
        String dtd = dtdFile.toURI().toString();
        String localDoctype = "<!DOCTYPE en-note SYSTEM \"" +dtd +"\">";
        newContent = newContent.replace(REMOTE_DOCTYPE, localDoctype);

        logger.log(logger.HIGH, "Validating ENML");
        String repairedContent = repair.parse(newContent, true);
        if (repairedContent == null)
            logger.log(logger.EXTREME, "Null returned from repair.parse()");
        else
            newContent = repairedContent;
        logger.log(logger.HIGH, "Validation complete");
        saveInvalidXML = repair.saveInvalidXML;

        // Restore the correct XML header.
        newContent = newContent.replace(localDoctype, REMOTE_DOCTYPE);

        logger.log(logger.EXTREME, "Leaving ENMLConverter.convert()");
        return newContent;
    }

    /**
     * Replaces every {@code <table class="en-crypt-temp" ...>} element with an
     * {@code <en-crypt-temp>} element whose value is the encrypted text of the
     * table's first {@code <td>}.
     *
     * The password and hint are looked up in Global.passwordSafe using the
     * table's slot attribute.
     *
     * @param content the document to scan
     * @return the document with the temporary tables replaced
     */
    private String fixEncryptionTags(String content) {
        String newContent = content;
        logger.log(logger.MEDIUM, "Inside EnmlConverter.fixEncryptionTags");
        logger.log(logger.EXTREME, content);

        int endPos, startPos, endData, slotStart, slotEnd;
        logger.log(logger.MEDIUM, "Checking table encryption tags");
        String eTag = "<table class=\"en-crypt-temp\"";
        for (int i=newContent.indexOf(eTag); i>0; i = newContent.indexOf(eTag,i+1)) {
            // Skip past 'slot="' to reach the slot value.
            slotStart = newContent.indexOf("slot", i+1)+6;
            slotEnd = newContent.indexOf("\"",slotStart);
            String slot = newContent.substring(slotStart, slotEnd);
            // The clear text lives in the first <td> after the table tag.
            startPos = newContent.indexOf("<td>", i+1)+4;
            endData = newContent.indexOf("</td>",startPos);
            String text = newContent.substring(startPos,endData);
            endPos = newContent.indexOf("</table>",i+1)+8;

            // Encrypt the text
            Pair<String,String> pair = Global.passwordSafe.get(slot);
            String password = pair.getFirst();
            String hint = pair.getSecond();
            EnCrypt crypt = new EnCrypt();
            String encrypted = crypt.encrypt(text, password, 64);

            // replace the table with an en-crypt tag.
            // NOTE(review): substring(0,i-1) also drops the single character
            // in front of the table tag -- confirm this is intended.
            newContent = newContent.substring(0,i-1) +
                "<en-crypt-temp cipher=\"RC2\" length=\"64\" hint=\""+
                hint +"\" value=\""+
                encrypted +
                "\" />" +
                newContent.substring(endPos);
        }

        return newContent;
    }

    /**
     * Self-closes the img, input, br, hr and meta tags so the text can be
     * handled as XML.
     *
     * Tags that already end in {@code />} are left alone, and a tag that is
     * never terminated by {@code >} is left untouched instead of raising an
     * exception.
     *
     * @param content the markup to fix; must not be null
     * @return the markup with those tags self-closed
     */
    public String fixStupidXMLProblems(String content) {
        logger.log(logger.HIGH, "Entering DBRunner.fixStupidXMLProblems");
        logger.log(logger.MEDIUM, "Inside fixStupidXMLProblems.  Old content:");
        logger.log(logger.EXTREME, content);

        logger.log(logger.MEDIUM, "Checking img tags");
        String newContent = selfCloseTags(content, "<img");

        logger.log(logger.MEDIUM, "Checking input tags");
        newContent = selfCloseTags(newContent, "<input");

        logger.log(logger.MEDIUM, "Checking br tags");
        newContent = selfCloseTags(newContent, "<br");

        logger.log(logger.MEDIUM, "Checking hr tags");
        newContent = selfCloseTags(newContent, "<hr");

        logger.log(logger.MEDIUM, "Checking meta tags");
        newContent = selfCloseTags(newContent, "<meta");

        logger.log(logger.MEDIUM, "Leaving fixStupidXMLProblems");
        logger.log(logger.HIGH, "Leaving DBRunner.fixStupidXMLProblems");
        return newContent;
    }

    /** Inserts a '/' before the '>' ending each occurrence of tagStart unless one is already there. */
    private static String selfCloseTags(String content, String tagStart) {
        StringBuilder result = new StringBuilder(content);
        int tagPos = result.indexOf(tagStart);
        while (tagPos >= 0) {
            int endPos = result.indexOf(">", tagPos + 1);
            if (endPos < 0)
                break;  // unterminated tag; nothing sensible to close
            if (result.charAt(endPos - 1) != '/')
                result.insert(endPos, '/');
            tagPos = result.indexOf(tagStart, tagPos + 1);
        }
        return result.toString();
    }

    /** Replaces every occurrence of target in buffer with replacement, scanning left to right. */
    private static void replaceEach(StringBuilder buffer, String target, String replacement) {
        int pos = buffer.indexOf(target);
        while (pos > -1) {
            buffer.replace(pos, pos + target.length(), replacement);
            pos = buffer.indexOf(target, pos + replacement.length());
        }
    }

    /**
     * Rewrites constructs that Evernote rejects: {@code <b/>} becomes
     * {@code <b></b>}, {@code <br/>} becomes {@code <br></br>}, and every empty
     * {@code <span/>} located between an {@code <li>} and the next
     * {@code </li>} is removed.
     *
     * @param note the note text, may be null
     * @return the adjusted text, or null when note is null
     */
    public String fixEnXMLCrap(String note) {
        logger.log(logger.EXTREME, "Entering EnmlConverter.fixEnXMLCrap");
        if (note == null)
            return null;

        StringBuilder buffer = new StringBuilder(note);

        // change all <b/> to <b></b> because Evernote hates them if they happen in <span>
        logger.log(logger.EXTREME, "Converting <b/>");
        replaceEach(buffer, "<b/>", "<b></b>");

        // change all <br/> to <br></br> because Evernote hates them if they happen in <span>
        logger.log(logger.EXTREME, "converting <br/>");
        replaceEach(buffer, "<br/>", "<br></br>");

        // Get rid of empty spans in <li> elements.  Each list item is scanned
        // on its own so that a <span/> outside any item, or several of them
        // in one item, cannot stall the search for the remaining ones.
        final String emptySpan = "<span/>";
        int liPos = buffer.indexOf("<li>");
        while (liPos > -1) {
            int liEnd = buffer.indexOf("</li>", liPos);
            if (liEnd < 0)
                break;  // no closing tag from here on, so no item left to clean
            int spanPos = buffer.indexOf(emptySpan, liPos);
            while (spanPos > -1 && spanPos < liEnd) {
                buffer.delete(spanPos, spanPos + emptySpan.length());
                liEnd -= emptySpan.length();  // the item just got shorter
                spanPos = buffer.indexOf(emptySpan, spanPos);
            }
            liPos = buffer.indexOf("<li>", liPos + 1);
        }

        logger.log(logger.EXTREME, "Leaving EnmlConverter.fixEnXMLCrap");
        return buffer.toString();
    }

    /**
     * Removes every {@code </en-media>} closing tag and makes sure each
     * {@code <en-media ...>} tag ends in {@code />}.
     *
     * An {@code <en-media} that is never terminated by {@code >} is left as it
     * is rather than raising an exception.
     *
     * @param note the note text, may be null
     * @return the adjusted text, or null when note is null
     */
    public String fixEnMediaCrap(String note) {
        if (note == null)
            return null;

        StringBuilder buffer = new StringBuilder(note);

        // get rid of any </en-media> tags since they shouldn't exist.
        replaceEach(buffer, "</en-media>", "");

        // Make sure we have a proper /> ending the en-media tag
        int pos = buffer.indexOf("<en-media");
        while (pos > -1) {
            int close = buffer.indexOf(">", pos);
            if (close < 0)
                break;  // unterminated tag; leave the remainder alone
            if (buffer.charAt(close - 1) != '/')
                buffer.replace(close, close + 1, " />");
            pos = buffer.indexOf("<en-media", close);
        }

        return buffer.toString();
    }
}