/** * */ package org.ariadne_eu.utils.lucene.reindex; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.io.StringReader; import java.io.StringWriter; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.util.ArrayList; import java.util.Arrays; import java.util.Enumeration; import java.util.List; import java.util.Vector; import java.util.zip.ZipEntry; import java.util.zip.ZipException; import java.util.zip.ZipFile; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import org.apache.commons.io.FileUtils; import org.apache.commons.io.LineIterator; import org.apache.log4j.Logger; import org.apache.xml.serialize.OutputFormat; import org.apache.xml.serialize.XMLSerializer; import org.apache.xpath.XPathAPI; import org.ariadne.config.PropertiesManager; import org.ariadne.util.Stopwatch; import org.ariadne_eu.metadata.insert.InsertMetadataFactory; import org.ariadne_eu.metadata.insert.InsertMetadataImpl; import org.ariadne_eu.metadata.insert.InsertMetadataLuceneImpl; import org.ariadne_eu.utils.config.RepositoryConstants; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.xml.sax.InputSource; /** * @author gonzalo * */ public class ReIndexZipImpl extends ReIndexImpl { private static Logger log = Logger.getLogger(ReIndexZipImpl.class); private String dirString; private static Vector xpathQueries; private DocumentBuilder builder; public ReIndexZipImpl() { initialize(); } void initialize() { super.initialize(); try { dirString = PropertiesManager.getInstance().getProperty(RepositoryConstants.getInstance().MD_SPIFS_DIR ); if (dirString == null) log.error("initialize failed: no " + RepositoryConstants.getInstance().MD_SPIFS_DIR + " found"); File dir = new File(dirString); if (!dir.isDirectory()) log.error("initialize failed: " + RepositoryConstants.getInstance().MD_SPIFS_DIR + " invalid directory"); dirString = dir.getParent(); xpathQueries = new Vector(); if (PropertiesManager.getInstance().getProperty(RepositoryConstants.getInstance().SR_XPATH_QRY_ID + ".1") == null) xpathQueries.add("general/identifier/entry/text()"); else { int i = 1; while(PropertiesManager.getInstance().getProperty(RepositoryConstants.getInstance().SR_XPATH_QRY_ID + "." + i) != null) { xpathQueries.add(PropertiesManager.getInstance().getProperty(RepositoryConstants.getInstance().SR_XPATH_QRY_ID + "." + i)); i++; } } //TODO: check for valid lucene index builder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); } catch (Throwable t) { log.error("initialize: ", t); } } public void reIndexMetadata() { File mdFile; File dir = new File(dirString); File[] files = dir.listFiles(); System.out.println("Starting reindexation"); InsertMetadataImpl[] insertImpls = InsertMetadataFactory.getInsertImpl(); InsertMetadataLuceneImpl luceneImpl = null; for (int i = 0; i < insertImpls.length; i++) { InsertMetadataImpl insertImpl = insertImpls[i]; if (insertImpl instanceof InsertMetadataLuceneImpl) luceneImpl = (InsertMetadataLuceneImpl) insertImpl; } if (luceneImpl == null) return; luceneImpl.createLuceneIndex(); String implementation = PropertiesManager.getInstance().getProperty(RepositoryConstants.getInstance().MD_INSERT_IMPLEMENTATION); if (implementation != null) { for (int i = 0; i < files.length; i++) { mdFile = files[i]; System.out.println(mdFile.getAbsolutePath()); if (!mdFile.isDirectory() && mdFile.getName().endsWith(".zip")) { indexFile(mdFile, luceneImpl, new String[0]); } } } } private void indexFile (File mdFile, InsertMetadataLuceneImpl luceneImpl, String[] cName) { try { ZipFile zip = new ZipFile(mdFile); for (Enumeration e = zip.entries(); e.hasMoreElements();) { ZipEntry entry = (ZipEntry) e.nextElement(); String[] collection = entry.getName().split("/"); collection = Arrays.asList(collection).subList(0, collection.length -1).toArray(new String[1]); InputStream is = zip.getInputStream(entry); String xml = readInputStream(is,"UTF-8"); try { Document doc = getDoc(xml); String identifier = getIdentifier(doc); StringWriter out = new StringWriter(); XMLSerializer serializer = new XMLSerializer(out, new OutputFormat(doc)); serializer.serialize((Element) doc.getFirstChild()); String lom = out.toString(); if (identifier != null) luceneImpl.insertMetadata(identifier, lom, collection); } catch (Exception ex) { log.error("indexFile: fileName=" + mdFile.getName(), ex); } } } catch (ZipException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } } private static String getIdentifier (Document doc) { String identifier = null; for (int j = 0; j < xpathQueries.size() && identifier == null; j++) { String xpathQuery = (String) xpathQueries.elementAt(j); try { identifier = XPathAPI.selectSingleNode(doc.getFirstChild(),xpathQuery).getNodeValue(); } catch (Exception e) { log.debug("getIdentifier", e); } } return identifier; } private Document getDoc (String xml) { Document doc = null; StringReader stringReader = new StringReader(xml); InputSource input = new InputSource(stringReader); try { doc = builder.parse(input); } catch (Exception e) { log.error("getDoc:",e); } return doc; } public static String readInputStream(InputStream is, String encoding){ try { if (is != null) { Writer writer = new StringWriter(); char[] buffer = new char[1024]; try { Reader reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); int n; while ((n = reader.read(buffer)) != -1) { writer.write(buffer, 0, n); } } finally { is.close(); } return writer.toString(); } else { return ""; } } catch (Exception e) { log.error("readInputStream:",e); } return ""; } }