/*
 * Copyright (c) 2008 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.dmozdl;

import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.SortedMap;
import java.util.TreeMap;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;

import com.hp.hpl.jena.iri.IRI;
import com.hp.hpl.jena.iri.IRIFactory;

/**
 * SAX {@link ContentHandler} that collects the targets of <code>link</code>
 * elements found inside <code>Topic</code> elements.
 *
 * <p>A <code>Topic</code> is collectable when it carries an <code>id</code>
 * attribute whose value does not start with <code>Top/Adult</code>. For every
 * <code>link</code> element seen while inside such a topic, the
 * <code>resource</code> attribute is turned into an ASCII IRI string and
 * stored in a sorted map keyed by the lowercase hexadecimal MD5 digest of the
 * UTF-8 bytes of that string.
 *
 * <p>All other SAX events are ignored.
 */
public class DmozHandler implements ContentHandler {

    /**
     * Namespace URI that <code>Topic</code> and <code>link</code> elements
     * must be in to be considered.
     *
     * NOTE(review): this has to match the namespace declared in the input
     * character for character (for example with or without a trailing
     * slash) -- confirm against the actual data.
     */
    private static final String DMOZ_NS = "http://dmoz.org/rdf";

    /** Namespace URI of the <code>id</code> and <code>resource</code> attributes. */
    private static final String RDF_NS = "http://www.w3.org/TR/RDF/";

    /** Collected links: hex MD5 digest of the ASCII IRI mapped to the ASCII IRI. */
    private final SortedMap<String, String> theMap = new TreeMap<String, String>();

    /** Digest used to derive the map keys; created once in the constructor. */
    private final MessageDigest md;

    /** Factory used to parse the <code>resource</code> attribute values. */
    private final IRIFactory fac = new IRIFactory();

    /** True while inside a <code>Topic</code> element whose links are collected. */
    private boolean inCollectableTopic;

    /**
     * Creates a handler with an empty result map.
     *
     * @throws NoSuchAlgorithmException
     *             if the platform does not provide an MD5 message digest
     */
    public DmozHandler() throws NoSuchAlgorithmException {
        md = MessageDigest.getInstance("MD5");
    }

    /**
     * Ignored; character data is not used.
     *
     * @see org.xml.sax.ContentHandler#characters(char[], int, int)
     */
    public void characters(char[] ch, int start, int length)
            throws SAXException {
    }

    /** Ignored. */
    public void endDocument() throws SAXException {
    }

    /**
     * Stops collecting links when a <code>Topic</code> element ends.
     *
     * @see org.xml.sax.ContentHandler#endElement(String, String, String)
     */
    public void endElement(String uri, String localName, String name)
            throws SAXException {
        // equals() instead of ==: reference comparison only matches when the
        // parser interns names, which is not guaranteed.
        if (DMOZ_NS.equals(uri) && "Topic".equals(localName)) {
            inCollectableTopic = false;
        }
    }

    /** Ignored. */
    public void endPrefixMapping(String prefix) throws SAXException {
    }

    /** Ignored. */
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
    }

    /** Ignored. */
    public void processingInstruction(String target, String data)
            throws SAXException {
    }

    /** Ignored; no location information is kept. */
    public void setDocumentLocator(Locator locator) {
    }

    /** Ignored. */
    public void skippedEntity(String name) throws SAXException {
    }

    /**
     * Resets the "inside a collectable topic" state. The result map is not
     * cleared.
     */
    public void startDocument() throws SAXException {
        inCollectableTopic = false;
    }

    /**
     * Formats a byte array as lowercase hexadecimal, two digits per byte.
     *
     * @param md5
     *            the bytes to format
     * @return the hexadecimal representation
     */
    private static String toHexString(byte[] md5) {
        StringBuilder sb = new StringBuilder(md5.length * 2);
        for (int i = 0; i < md5.length; i++) {
            // Mask to 0..255 so negative bytes do not sign-extend.
            int asInt = md5[i] & 0xFF;
            if (asInt < 0x10) {
                sb.append('0');
            }
            sb.append(Integer.toHexString(asInt));
        }
        return sb.toString();
    }

    /**
     * Starts collecting at a non-adult <code>Topic</code> element and records
     * the <code>resource</code> of each <code>link</code> element seen while
     * collecting.
     *
     * @see org.xml.sax.ContentHandler#startElement(String, String, String,
     *      Attributes)
     */
    public void startElement(String uri, String localName, String name,
            Attributes atts) throws SAXException {
        // equals() instead of ==: see endElement.
        if (!DMOZ_NS.equals(uri)) {
            return;
        }
        if ("Topic".equals(localName)) {
            String id = atts.getValue(RDF_NS, "id");
            // The flag is only ever raised here; it is lowered in endElement.
            if (id != null && !id.startsWith("Top/Adult")) {
                inCollectableTopic = true;
            }
        } else if (inCollectableTopic && "link".equals(localName)) {
            String resource = atts.getValue(RDF_NS, "resource");
            if (resource != null) {
                collect(resource);
            }
        }
    }

    /**
     * Converts one link target to its ASCII IRI form and stores it under the
     * hex MD5 digest of that form. Any failure is swallowed and the link is
     * skipped.
     *
     * @param resource
     *            the raw <code>resource</code> attribute value, not null
     */
    private void collect(String resource) {
        try {
            IRI iri = fac.create(resource);
            String u = iri.toASCIIString();
            byte[] md5 = md.digest(u.getBytes("utf-8"));
            theMap.put(toHexString(md5), u);
        } catch (Exception ignored) {
            // Best effort: a link that cannot be converted or hashed is
            // left out of the map instead of aborting the parse.
        }
    }

    /** Ignored. */
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
    }

    /**
     * Returns the collected links.
     *
     * @return the live map from hex MD5 digest to ASCII IRI string, sorted by
     *         digest
     */
    public SortedMap<String, String> getTheMap() {
        return theMap;
    }
}