package lia.tika; /** * Copyright Manning Publications Co. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific lan */ import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.SAXException; import org.xml.sax.Attributes; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import java.io.File; import java.io.InputStream; import java.io.FileInputStream; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; // From chapter 7 public class SAXXMLDocument extends DefaultHandler { private StringBuilder elementBuffer = new StringBuilder(); private Map<String,String> attributeMap = new HashMap<String,String>(); private Document doc; public Document getDocument(InputStream is) // #1 throws DocumentHandlerException { SAXParserFactory spf = SAXParserFactory.newInstance(); try { SAXParser parser = spf.newSAXParser(); parser.parse(is, this); } catch (Exception e) { throw new DocumentHandlerException( "Cannot parse XML document", e); } return doc; } public void startDocument() { // #2 doc = new Document(); } public void startElement(String uri, String localName, // #3 String qName, Attributes atts) // #3 throws SAXException { // #3 elementBuffer.setLength(0); attributeMap.clear(); int numAtts = atts.getLength(); if (numAtts > 0) { for (int i = 0; i < numAtts; i++) { attributeMap.put(atts.getQName(i), atts.getValue(i)); } } } public void characters(char[] text, int start, int length) { // #4 elementBuffer.append(text, start, length); } public void endElement(String uri, String localName, String qName) // #5 throws SAXException { if (qName.equals("address-book")) { return; } else if (qName.equals("contact")) { for (Entry<String,String> attribute : attributeMap.entrySet()) { String attName = attribute.getKey(); String attValue = attribute.getValue(); doc.add(new Field(attName, attValue, Field.Store.YES, Field.Index.NOT_ANALYZED)); } } else { doc.add(new Field(qName, elementBuffer.toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); } } public static void main(String args[]) throws Exception { SAXXMLDocument handler = new SAXXMLDocument(); Document doc = handler.getDocument( new FileInputStream(new File(args[0]))); System.out.println(doc); } } /* #1 Start parser #2 Called when parsing begins #3 Beginning of new XML element #4 Append element contents to elementBuffer #5 Called when closing XML elements are processed */