package org.ariadne_eu.utils.lucene.document; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.HashMap; import java.util.Iterator; import java.util.List; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.apache.log4j.Logger; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.eun.lucene.core.indexer.document.DocumentHandler; import org.eun.lucene.core.indexer.document.DocumentHandlerException; import org.xml.sax.Attributes; import org.xml.sax.SAXException; public class LOMLiteHandler extends DocumentHandler { private static Logger log = Logger.getLogger(LOMLiteHandler.class); private static final String[] MIN_MAX = { "min", "max" }; /** A buffer for each XML element */ private StringBuffer elementBuffer = new StringBuffer(); private HashMap<String, String> attributeMap = new HashMap<String, String>(); private String branche = ""; private Document doc; private String contents; private final String BRANCH_SEPARATOR = "."; private final String ATT_SEPARATOR = "."; private final String EQUAL_SEPARATOR = "="; public Document getDocument(InputStream is) throws DocumentHandlerException { SAXParserFactory spf = SAXParserFactory.newInstance(); try { SAXParser parser = spf.newSAXParser(); parser.parse(is, this); } catch (IOException e) { throw new DocumentHandlerException("Cannot parse XML document", e); } catch (ParserConfigurationException e) { throw new DocumentHandlerException("Cannot parse XML document", e); } catch (SAXException e) { throw new DocumentHandlerException("Cannot parse XML document", e); } return doc; } public void startDocument() { doc = new Document(); contents = new String(); } public void endDocument() { doc.add(new Field("contents", contents, Field.Store.YES,Field.Index.ANALYZED)); doc.add(new Field("lom.solr", "all", Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); } /* * Save the attribute in a map to reuse it when the element ends (only used * for the last element of a branch) Add an attribute field Incremental * string creation to represent the current branch parsed * * @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String, * java.lang.String, java.lang.String, org.xml.sax.Attributes) */ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { branche += qName.toLowerCase(); elementBuffer.setLength(0); attributeMap.clear();// No need for a map :D branche += BRANCH_SEPARATOR; } public void characters(char[] text, int start, int length) { elementBuffer.append(text, start, length); } public void endElement(String uri, String localName, String qName) throws SAXException { String tmpBranche = branche.substring(0, branche.length() - 1); //remove the NS+colons on any element if (tmpBranche.contains(":")) { tmpBranche = tmpBranche.replaceAll("(\\w+):", ""); } String tmp2Branche = ""; if (branche.endsWith(qName.toLowerCase() + "" + BRANCH_SEPARATOR)) { branche = branche.substring(0, branche.length() - qName.length()- 1); if (!branche.equals("")) tmp2Branche = branche.substring(0, branche.length() - 1); } if (elementBuffer.toString().trim().equals("")) { return; } // Attributes for string element ... (ex. Save the field by language) if (qName.equalsIgnoreCase("string")) { Iterator iter = attributeMap.keySet().iterator(); while (iter.hasNext()) { String attName = ((String) iter.next()).toLowerCase(); String attValue = ((String) attributeMap.get(attName)).toLowerCase(); String fieldName = tmpBranche + "" + ATT_SEPARATOR + "" + attName + "" + EQUAL_SEPARATOR + "" + attValue; } } // Hardcoded for LOM XML specifications --> // Classification ... // 23/10/12 Change for supporting classification related facets @NaturalEurope if (tmpBranche.matches(".*classification.*")) { String format = elementBuffer.toString().toLowerCase().trim(); doc.add(new Field(tmpBranche.toLowerCase(), format, Field.Store.YES, Field.Index.NOT_ANALYZED));// XXX } /* ------- 23/10/12 Change for supporting classification facet @NaturalEurope if (tmpBranche.matches(".*classification\\.((purpose)|(taxonpath)).*")) { if (tmpBranche.endsWith("classification.taxonpath.taxon.entry.string")) { doc.add(new Field(tmpBranche, elementBuffer.toString().trim().toLowerCase(), Field.Store.YES,Field.Index.ANALYZED));// XXX } contents = contents.concat(" " + elementBuffer.toString().toLowerCase()); }*/ // Title else if (tmpBranche.matches(".*title.*")) { if (tmpBranche.endsWith("title.string")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString().trim(), Field.Store.YES,Field.Index.ANALYZED));// XXX } contents = contents.concat(" " + elementBuffer.toString().toLowerCase()); } // Catalog + entry else if (tmpBranche.matches(".*general.identifier\\.((catalog)|(entry))")) { if (tmpBranche.endsWith("identifier.catalog")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString().trim(), Field.Store.YES,Field.Index.NOT_ANALYZED));// XXX } else if (tmpBranche.endsWith("identifier.entry")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString().trim().toLowerCase(), Field.Store.YES,Field.Index.NOT_ANALYZED)); } contents = contents.concat(" " + elementBuffer.toString().toLowerCase()); } // Metametadata Catalog + entry else if (tmpBranche.matches(".*metametadata.identifier\\.((catalog)|(entry))")) { if (tmpBranche.endsWith("identifier.catalog")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString().trim(), Field.Store.YES,Field.Index.NOT_ANALYZED));// XXX } else if (tmpBranche.endsWith("identifier.entry")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString().trim().toLowerCase(), Field.Store.YES,Field.Index.NOT_ANALYZED)); } contents = contents.concat(" " + elementBuffer.toString().toLowerCase()); } // Metametadata role.source + role.value else if (tmpBranche.matches(".*metametadata.contribute.role\\.((source)|(value))")) { if (tmpBranche.endsWith("role.source")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString().trim().toLowerCase(), Field.Store.YES,Field.Index.NOT_ANALYZED));// XXX } else if (tmpBranche.endsWith("role.value")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString().trim().toLowerCase(), Field.Store.YES,Field.Index.NOT_ANALYZED)); } contents = contents.concat(" " + elementBuffer.toString().toLowerCase()); } // rights else if (tmpBranche.matches(".*rights.*")) { String format = elementBuffer.toString().toLowerCase().trim(); doc.add(new Field(tmpBranche.toLowerCase(), format, Field.Store.YES, Field.Index.NOT_ANALYZED));// XXX } // technical.format else if (tmpBranche.matches(".*technical.format.*")) { String format = elementBuffer.toString().toLowerCase().trim(); doc.add(new Field(tmpBranche.toLowerCase(), format, Field.Store.YES, Field.Index.NOT_ANALYZED));// XXX } // technical.location else if (tmpBranche.matches(".*technical.location.*")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));// XXX } // technical.duration else if (tmpBranche.matches(".*technical.duration")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));// XXX } // general.description.string ---> general.* else if (tmpBranche.matches(".*general.*")) { String format = elementBuffer.toString().trim(); doc.add(new Field(tmpBranche.toLowerCase(), format, Field.Store.YES, Field.Index.ANALYZED));// XXX contents = contents.concat(" " + elementBuffer.toString().toLowerCase()); } /* // general.keyword.string else if (tmpBranche.matches(".*general.keyword.string")) { String format = elementBuffer.toString().toLowerCase().trim(); doc.add(new Field(tmpBranche.toLowerCase(), format, Field.Store.YES, Field.Index.ANALYZED));// XXX contents = contents.concat(" " + elementBuffer.toString().toLowerCase()); }*/ // learningresourcetype.value else if (tmpBranche.matches(".*learningresourcetype.value.*")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString().toLowerCase(), Field.Store.YES,Field.Index.NOT_ANALYZED)); contents = contents.concat(" " + elementBuffer.toString().toLowerCase()); } // interactivitytype.value else if (tmpBranche.matches(".*interactivitytype.value.*")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString().toLowerCase(), Field.Store.YES,Field.Index.NOT_ANALYZED)); contents = contents.concat(" " + elementBuffer.toString().toLowerCase()); } // interactivitylevel.value else if (tmpBranche.matches(".*interactivitylevel.value.*")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString().toLowerCase(), Field.Store.YES,Field.Index.NOT_ANALYZED)); contents = contents.concat(" " + elementBuffer.toString().toLowerCase()); } // intendedenduserrole.value else if (tmpBranche.matches(".*intendedenduserrole.value.*")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString().toLowerCase(), Field.Store.YES,Field.Index.NOT_ANALYZED)); contents = contents.concat(" " + elementBuffer.toString().toLowerCase()); } // typicalagerange.string else if (tmpBranche.matches(".*typicalagerange.string.*")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString().toLowerCase(), Field.Store.YES,Field.Index.NOT_ANALYZED)); contents = contents.concat(" " + elementBuffer.toString().toLowerCase()); } // context.value else if (tmpBranche.matches(".*context.value.*")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString().toLowerCase(), Field.Store.YES,Field.Index.NOT_ANALYZED)); contents = contents.concat(" " + elementBuffer.toString().toLowerCase()); } // general.language else if (tmpBranche.matches(".*general.language")) { doc.add(new Field(tmpBranche.toLowerCase(), elementBuffer.toString().trim().toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED));// XXX contents = contents.concat(" " + elementBuffer.toString().toLowerCase()); } elementBuffer.setLength(0); } public static void main(String args[]) throws Exception { LOMLiteHandler handler = new LOMLiteHandler(); Document doc = handler.getDocument(new FileInputStream(new File("/Sandbox/temp/AriadneWS/mdstoreARIADNE/ARIADNE/BLKLKP325.xml"))); List fields = doc.getFields(); for (Iterator iterator = fields.iterator(); iterator.hasNext();) { Field field = (Field) iterator.next(); System.out.println(field.name() + " :: " + field.stringValue()); } } }