/*
* (C) Copyright IBM Corp. 2008
*
* LICENSE: Eclipse Public License v1.0
* http://www.eclipse.org/legal/epl-v10.html
*/
package com.ibm.gaiandb.searchapis;
/**
* @author gabent
*
*/
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.derby.iapi.types.DataValueDescriptor;
import org.apache.derby.iapi.types.SQLChar;
import org.apache.derby.iapi.types.SQLInteger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import com.ibm.gaiandb.Logger;
/**
 * Parses an XML document with a DOM parser and extracts its {@code <entry>} elements.
 *
 * <p>Typical use: call {@link #parseXmlFile(InputStream)} once, then either
 * {@link #parseDocument()} (collects {@code Entries} objects built from each entry's
 * {@code <id>} and {@code <updated>} children) or
 * {@link #parseAndStoreDocument(Vector)} (appends one result row per entry).
 *
 * <p>If parsing fails, the failure is logged and no document is retained; the
 * extraction methods then log that fact and produce no entries/rows.
 */
public class DomParser {

    // Use PROPRIETARY notice if class contains a main() method, otherwise use COPYRIGHT notice.
    public static final String COPYRIGHT_NOTICE = "(c) Copyright IBM Corp. 2008";

    private static final Logger logger = new Logger( "DomParser", 30 );

    /**
     * Parser features that pull in resources referenced from inside the XML itself.
     * They are switched off (best effort) because the parsed stream comes from outside
     * this class and such references are the basis of XXE attacks.
     */
    private static final String[] EXTERNAL_RESOURCE_FEATURES = {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
        "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };

    /** Entries accumulated by {@link #parseDocument()}; each call appends to this list. */
    List<Entries> myEntries;

    /** The most recently parsed document, or null if nothing was parsed successfully. */
    Document dom;

    /** Creates a parser with an empty entry list and no document. */
    public DomParser() {
        myEntries = new ArrayList<Entries>();
    }

    /**
     * Parses the given stream into a DOM document and keeps it for the
     * {@code parse*Document} methods.
     *
     * <p>Parser configuration, SAX and I/O failures are logged rather than thrown; in
     * that case no document is retained (a document from an earlier call is discarded
     * as well, so stale results are never reported). The stream is not closed here.
     *
     * @param is stream holding the XML to parse
     */
    public void parseXmlFile(InputStream is) {
        // Forget any earlier document up front: a failure below must not leave it in place.
        dom = null;

        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        disableExternalResources(dbf);

        try {
            DocumentBuilder db = dbf.newDocumentBuilder();
            dom = db.parse(is);
        } catch (ParserConfigurationException pce) {
            logParseFailure(pce);
        } catch (SAXException se) {
            logParseFailure(se);
        } catch (IOException ioe) {
            logParseFailure(ioe);
        }
    }

    /**
     * Builds an {@code Entries} object for every {@code <entry>} element of the parsed
     * document and appends it to {@link #myEntries}. Does nothing (apart from logging)
     * when no document is available.
     */
    public void parseDocument() {
        NodeList entryNodes = getEntryElements();
        if (entryNodes == null) {
            return;
        }
        for (int i = 0; i < entryNodes.getLength(); i++) {
            myEntries.add(getEntry((Element) entryNodes.item(i)));
        }
    }

    /**
     * Appends one row per {@code <entry>} element of the parsed document to
     * {@code rows}. Each row is {@code { SQLInteger(uri.hashCode()), SQLChar(uri) }},
     * where {@code uri} is the text of the entry's {@code <id>} element.
     *
     * <p>Entries without {@code <id>} text are skipped (and logged) because no ID can
     * be derived for them. When no document is available, no rows are added.
     *
     * @param rows vector receiving the result rows
     * @throws UnsupportedEncodingException never thrown by the current implementation;
     *         retained in the signature for compatibility with existing callers
     */
    public void parseAndStoreDocument(Vector<DataValueDescriptor[]> rows) throws UnsupportedEncodingException {
        NodeList entryNodes = getEntryElements();
        logger.logInfo("Building document result rows vector, with Doc ID hash values based on encoded URIs");

        int added = 0;
        if (entryNodes != null) {
            for (int i = 0; i < entryNodes.getLength(); i++) {
                Element el = (Element) entryNodes.item(i);
                String uri = getTextValue(el, "id");
                if (uri == null) {
                    logger.logInfo("Skipping entry at index " + i + ": no <id> text to derive a document ID from");
                    continue;
                }

                // DRV 09/12/10 - URIs are deliberately NOT decoded any more, so the hash is
                // always computed on the URI exactly as it appears in the document.
                int id = uri.hashCode();

                logger.logDetail("Adding row with docHashID: " + id + ", docURI: " + uri);
                rows.add( new DataValueDescriptor[] { new SQLInteger(id), new SQLChar(uri) } );
                added++;
            }
        }
        logger.logInfo("Number of rows added: " + added);
    }

    /**
     * Logs the number of collected entries followed by each entry's
     * {@code toString()} representation.
     */
    public void printData() {
        logger.logInfo("No documents '" + myEntries.size() + "'.");
        for (Entries entry : myEntries) {
            logger.logInfo(entry.toString());
        }
    }

    /**
     * Creates an {@code Entries} object from the {@code <id>} and {@code <updated>}
     * text of an entry element. Either value is null when the tag is missing or empty.
     *
     * @param entryEl the {@code <entry>} element to read
     * @return the entry built from the element's values
     */
    private Entries getEntry(Element entryEl) {
        String id = getTextValue(entryEl, "id");
        String updated = getTextValue(entryEl, "updated");
        return new Entries(id, updated);
    }

    /**
     * Returns the node value of the first child of the first {@code tagName} element
     * found beneath {@code ele}.
     *
     * @param ele element to search in
     * @param tagName name of the tag whose text is wanted
     * @return the text, or null if the tag is absent or has no child node (e.g. {@code <id/>})
     */
    private String getTextValue(Element ele, String tagName) {
        NodeList matches = ele.getElementsByTagName(tagName);
        if (matches == null || matches.getLength() == 0) {
            return null;
        }
        Node firstChild = matches.item(0).getFirstChild();
        // An empty element has no child at all; report "no value" instead of failing.
        return firstChild == null ? null : firstChild.getNodeValue();
    }

    /**
     * Returns the {@code <entry>} elements beneath the root of the parsed document,
     * or null (after logging) when there is no parsed document to read from.
     */
    private NodeList getEntryElements() {
        if (dom == null) {
            logger.logInfo("No parsed XML document available - parseXmlFile() was not called or failed");
            return null;
        }
        Element docEle = dom.getDocumentElement();
        return docEle == null ? null : docEle.getElementsByTagName("entry");
    }

    /**
     * Turns off loading of external entities/DTDs on the factory. Best effort: a parser
     * implementation that does not recognise a feature is left as it is, so parsing
     * still works there.
     */
    private static void disableExternalResources(DocumentBuilderFactory dbf) {
        for (String feature : EXTERNAL_RESOURCE_FEATURES) {
            try {
                dbf.setFeature(feature, false);
            } catch (ParserConfigurationException unsupported) {
                logger.logDetail("XML parser does not support feature '" + feature + "': " + unsupported);
            }
        }
    }

    /** Reports a failed parse through the class logger. */
    private static void logParseFailure(Exception cause) {
        logger.logInfo("Unable to parse XML input stream, no document available: " + cause);
    }
}