/**
*
*/
package org.ariadne_eu.utils.lucene.document;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.eun.lucene.core.indexer.document.DocumentHandler;
import org.eun.lucene.core.indexer.document.DocumentHandlerException;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
/**
 * SAX handler that turns an RDF/XML description of persons, publications and
 * organizations into a single Lucene {@link Document}.
 * <p>
 * While parsing, the handler maintains the dot-separated, lower-cased path of
 * the element currently open (for example {@code rdf:rdf.swrc:person.}).
 * Namespace prefixes are stripped from that path before it is compared with
 * the indexing rules or used as a field name. Every indexed value except the
 * {@code rdf:about} attribute is also appended to one catch-all
 * {@code contents} field that is added when the document ends.
 * <p>
 * Presumably the input follows the SWRC vocabulary (going by the class name);
 * the code itself only looks at the element and attribute names listed in the
 * individual callbacks.
 * <p>
 * Parsing state is held in instance fields without synchronization, so one
 * instance must not parse several documents concurrently. Sequential reuse is
 * supported: all state is reset in {@link #startDocument()}.
 *
 * @author gonzalo
 */
public class SWRCHandler extends DocumentHandler {

    /** Separator appended after every element name in the element path. */
    private static final String BRANCH_SEPARATOR = ".";

    /** Matches a namespace prefix together with its colon, e.g. {@code swrc:}. */
    private static final Pattern NAMESPACE_PREFIX = Pattern.compile("(\\w+):");

    /** File indexed by {@link #main(String[])} when no path argument is given. */
    private static final String DEFAULT_SAMPLE_FILE = "/work/tmp/research.fm/xmls/lirias/persons/gonzalo-parra.xml";

    /** Character data collected since the most recent start tag. */
    private final StringBuilder elementBuffer = new StringBuilder();

    /** Accumulates the value of the catch-all {@code contents} field. */
    private final StringBuilder contents = new StringBuilder();

    /** Path of the open elements, with prefixes, each followed by the separator. */
    private String branche = "";

    /** Person name under construction (given name, then family name). */
    private String name = "";

    /** Document being filled by the current parse. */
    private Document doc;

    /**
     * Parses the given XML stream and returns the Lucene document built from it.
     * The stream is not explicitly closed by this method.
     *
     * @param is the XML input to parse
     * @return the document populated by the SAX callbacks of this handler
     * @throws DocumentHandlerException if the input cannot be read, the parser
     *         cannot be configured, or the XML cannot be parsed
     */
    public Document getDocument(InputStream is) throws DocumentHandlerException {
        // NOTE(review): the parser is created with factory defaults, so DTDs and
        // external entities are not explicitly disabled. If this handler can be
        // fed untrusted XML, harden the factory (XXE) before parsing.
        SAXParserFactory spf = SAXParserFactory.newInstance();
        try {
            SAXParser parser = spf.newSAXParser();
            parser.parse(is, this);
        } catch (IOException e) {
            throw new DocumentHandlerException("Cannot parse XML document", e);
        } catch (ParserConfigurationException e) {
            throw new DocumentHandlerException("Cannot parse XML document", e);
        } catch (SAXException e) {
            throw new DocumentHandlerException("Cannot parse XML document", e);
        }
        return doc;
    }

    /**
     * SAX callback: starts a fresh document and clears every piece of parsing
     * state, so that an earlier (possibly aborted) parse cannot leak into this
     * one.
     */
    public void startDocument() {
        doc = new Document();
        contents.setLength(0);
        elementBuffer.setLength(0);
        branche = "";
        name = "";
    }

    /**
     * SAX callback: stores the accumulated text as the analyzed
     * {@code contents} field.
     */
    public void endDocument() {
        doc.add(new Field("contents", contents.toString(), Field.Store.YES, Field.Index.ANALYZED));
    }

    /**
     * SAX callback: extends the element path with the opened element and, when
     * the prefix-free path ends with {@code person}, {@code publication} or
     * {@code organization}, indexes its {@code rdf:about} / {@code rdf:id}
     * attributes as {@code <path>.about} / {@code <path>.id} (the id is also
     * appended to the contents text).
     * <p>
     * For such an element a {@code type} field is added as well, chosen by the
     * start of the path ({@code rdf.person}, {@code rdf.publication} or
     * {@code rdf.organization}). This happens for every matching element, so
     * nested matches add further {@code type} fields with the outer type.
     *
     * @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String,
     *      java.lang.String, java.lang.String, org.xml.sax.Attributes)
     */
    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
        // Fixed locale: with the default locale (e.g. Turkish) "I" would not
        // lower-case to "i" and the path rules below would silently stop matching.
        branche += qName.toLowerCase(Locale.ENGLISH);
        String path = stripNamespacePrefixes(branche);
        elementBuffer.setLength(0);

        if (path.endsWith("person") || path.endsWith("publication") || path.endsWith("organization")) {
            for (int i = 0; i < atts.getLength(); i++) {
                String attributeName = atts.getQName(i);
                if (attributeName.equalsIgnoreCase("rdf:about")) {
                    addField(path + ".about", atts.getValue(i), Field.Index.NOT_ANALYZED);
                } else if (attributeName.equalsIgnoreCase("rdf:id")) {
                    String id = atts.getValue(i);
                    addField(path + ".id", id, Field.Index.NOT_ANALYZED);
                    appendToContents(id);
                }
            }
            if (path.startsWith("rdf.person")) {
                addField("type", "person", Field.Index.NOT_ANALYZED);
            } else if (path.startsWith("rdf.publication")) {
                addField("type", "publication", Field.Index.NOT_ANALYZED);
            } else if (path.startsWith("rdf.organization")) {
                addField("type", "organization", Field.Index.NOT_ANALYZED);
            }
        }
        branche += BRANCH_SEPARATOR;
    }

    /** SAX callback: collects character data of the current element. */
    public void characters(char[] text, int start, int length) {
        elementBuffer.append(text, start, length);
    }

    /**
     * SAX callback: removes the closed element from the element path and, if
     * the element carried non-blank text, indexes that text according to the
     * prefix-free path of the element:
     * <ul>
     * <li>{@code ...person.givenname} / {@code ...person.family_name}: joined
     * into one name; on the family name the name is appended to the contents
     * and, for paths starting with {@code rdf.person}, stored as
     * {@code rdf.person.name};</li>
     * <li>{@code ...publication.title}, {@code .keywords}, {@code .spatial}:
     * analyzed fields (commas in keywords/spatial become spaces);</li>
     * <li>{@code ...publication.year}, {@code ...organization.fn}: not-analyzed
     * fields;</li>
     * <li>{@code locality} / {@code country-name} below
     * {@code organization.adr.description}: analyzed fields.</li>
     * </ul>
     * Apart from the person name, the field name is the prefix-free path
     * itself. The separators in these rules are matched literally.
     */
    public void endElement(String uri, String localName, String qName) throws SAXException {
        // Path of the element being closed, without the trailing separator.
        String path = stripNamespacePrefixes(branche.substring(0, branche.length() - 1));

        // Pop exactly the text that startElement appended for this element.
        String closedSegment = qName.toLowerCase(Locale.ENGLISH) + BRANCH_SEPARATOR;
        if (branche.endsWith(closedSegment)) {
            branche = branche.substring(0, branche.length() - closedSegment.length());
        }

        // Always consume the buffer so blank text cannot leak into later text.
        String text = elementBuffer.toString();
        elementBuffer.setLength(0);
        if (text.trim().length() == 0) {
            return;
        }

        // Person
        if (path.endsWith("person.givenname")) {
            name = name + text + " ";
        } else if (path.endsWith("person.family_name")) {
            name = name + text;
            appendToContents(name);
            if (path.startsWith("rdf.person")) {
                addField("rdf.person.name", name, Field.Index.ANALYZED);
            }
            name = "";
        }
        // Publication
        else if (path.endsWith("publication.title")) {
            addField(path, text, Field.Index.ANALYZED);
            appendToContents(text);
        } else if (path.endsWith("publication.year")) {
            addField(path, text, Field.Index.NOT_ANALYZED);
            appendToContents(text);
        } else if (path.endsWith("publication.keywords") || path.endsWith("publication.spatial")) {
            String separated = text.replace(',', ' ');
            addField(path, separated, Field.Index.ANALYZED);
            appendToContents(separated);
        }
        // Affiliation
        else if (path.endsWith("organization.fn")) {
            addField(path, text, Field.Index.NOT_ANALYZED);
            appendToContents(text);
        } else if (path.contains("organization.adr.description")) {
            if (path.endsWith("locality") || path.endsWith("country-name")) {
                addField(path, text, Field.Index.ANALYZED);
                appendToContents(text);
            }
        }
    }

    /** Removes every namespace prefix (such as {@code rdf:}) from an element path. */
    private static String stripNamespacePrefixes(String path) {
        if (path.indexOf(':') < 0) {
            return path;
        }
        return NAMESPACE_PREFIX.matcher(path).replaceAll("");
    }

    /** Adds a stored field with the given indexing mode to the current document. */
    private void addField(String fieldName, String value, Field.Index index) {
        doc.add(new Field(fieldName, value, Field.Store.YES, index));
    }

    /** Appends a value, preceded by a single space, to the contents text. */
    private void appendToContents(String value) {
        contents.append(' ').append(value);
    }

    /**
     * Manual smoke test: parses one file and prints every field of the
     * resulting document as {@code name :: value}.
     *
     * @param args optional; {@code args[0]} is the path of the XML file to
     *        parse, otherwise a built-in sample path is used
     * @throws Exception if the file cannot be opened or parsed
     */
    public static void main(String args[]) throws Exception {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_SAMPLE_FILE;
        SWRCHandler handler = new SWRCHandler();
        Document doc;
        InputStream in = new FileInputStream(new File(path));
        try {
            doc = handler.getDocument(in);
        } finally {
            in.close();
        }
        for (Object entry : doc.getFields()) {
            Field field = (Field) entry;
            System.out.println(field.name() + " :: " + field.stringValue());
        }
    }
}