package gr.ntua.ivml.athena.harvesting; import gr.ntua.ivml.athena.harvesting.xml.schema.HeaderType; import gr.ntua.ivml.athena.harvesting.xml.schema.ListIdentifiersType; import gr.ntua.ivml.athena.harvesting.xml.schema.ListMetadataFormatsType; import gr.ntua.ivml.athena.harvesting.xml.schema.MetadataType; import gr.ntua.ivml.athena.harvesting.xml.schema.OAIPMHtype; import gr.ntua.ivml.athena.harvesting.xml.schema.ObjectFactory; import gr.ntua.ivml.athena.harvesting.xml.schema.RecordType; import gr.ntua.ivml.athena.persistent.ReportI; import gr.ntua.ivml.athena.util.Config; import java.io.BufferedOutputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Iterator; import java.util.Random; import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBElement; import javax.xml.bind.JAXBException; import javax.xml.bind.Marshaller; import javax.xml.bind.Unmarshaller; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.TransformerException; import org.apache.log4j.Logger; import org.apache.xerces.dom.ElementNSImpl; import org.apache.xml.serialize.DOMSerializerImpl; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import ORG.oclc.oai.harvester2.verb.GetRecord; import ORG.oclc.oai.harvester2.verb.ListIdentifiers; import ORG.oclc.oai.harvester2.verb.ListMetadataFormats; import ORG.oclc.oai.harvester2.verb.ListRecords; import de.schlichtherle.io.DefaultArchiveDetector; import de.schlichtherle.io.File; import de.schlichtherle.io.FileOutputStream; public class SingleHarvester /*implements ServletContextAware*/ { private ReportI reporter; private String baseURL = null; private String startDate = null; private String endDate = null; private String ns = null; private String set = null; private JAXBContext jc; private String resumptionToken = null; private ArrayList<String> identifiers = null; private String fileName = null; private ObjectFactory fact; public final static Logger log = Logger.getLogger( SingleHarvester.class ); public SingleHarvester(String baseURL, String startDate, String endDate, String ns, String set){ this.baseURL = baseURL; this.startDate = startDate; this.endDate = endDate; this.ns = ns; this.set = set; System.out.println(Config.getRealPath(Config.get("oaitmp"))); Random generator = new Random(); this.fileName = Config.getRealPath(Config.get("oaitmp"))+ "oai"+generator.nextInt()+".zip"; try { jc = JAXBContext.newInstance( "gr.ntua.ivml.athena.harvesting.xml.schema" ); fact = new ObjectFactory(); } catch (JAXBException e) { e.printStackTrace(); } /* Random generator = new Random(); this.fileName ="F:\\athena\\" + "oai"+generator.nextInt()+".zip";*/ } public String getFileName(){return this.fileName;} public void harvest() throws Exception{ ListRecords records = null; int counter = 0; long lastReport = System.currentTimeMillis(); records = new ListRecords(this.baseURL, this.startDate, this.endDate, this.set, this.ns); File file = new File(this.fileName, new DefaultArchiveDetector( "zip" )); while(records != null){ NodeList errors = records.getErrors(); if (errors != null && errors.getLength() > 0) { log.info( "OAI error"); String oaiError = ""; int length = errors.getLength(); for (int i=0; i<length; ++i) { Node item = errors.item(i); oaiError += item; } try{ if(oaiError!=null && oaiError.length()>0 && oaiError.indexOf("null")>-1){ oaiError=records.toString(); if(oaiError.indexOf("error")>-1){ oaiError=oaiError.substring(oaiError.indexOf("<error"),oaiError.length()); oaiError=oaiError.substring(oaiError.indexOf(">")+1,oaiError.indexOf("</error>")); } } } catch (Exception e){ log.error( e ); } if(oaiError!=null && oaiError.length()>0){ Exception e = new Exception(oaiError); throw e;} //break; } if((( System.currentTimeMillis() - lastReport) > 30000l ) && ( reporter != null )) { reporter.report("Entry " + counter + " written." ); log.debug( "Written " + fileName + "/response" + counter + ".xml" ); lastReport = System.currentTimeMillis(); } //OutputStream out = new FileOutputStream(fileName+"/response"+ counter++ +".xml"); OutputStream out = null; OutputStream bout= null; OutputStreamWriter out1 = null; String tmp = ""; OAIPMHtype res = this.getRecordsXML(records.toString()); Iterator<RecordType> it = res.getListRecords().getRecord().iterator(); while(it.hasNext()){ RecordType typ = it.next(); MetadataType met = typ.getMetadata(); ElementNSImpl la =(ElementNSImpl) met.getAny(); out = new FileOutputStream(fileName+"/"+counter++ +".xml"); DOMSerializerImpl ser = new DOMSerializerImpl(); tmp = ser.writeToString(la); bout= new BufferedOutputStream(out); out1 = new OutputStreamWriter(bout, "UTF8"); out1.write(tmp); out1.flush(); //out1.write(tmp.getBytes()); out.close(); bout.close(); out1.close(); } //Marshaller m; //ByteArrayOutputStream stream = new ByteArrayOutputStream(); //m = jc.createMarshaller(); //m.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, // Boolean.TRUE); //m.marshal(fact.createOAIPMH(res), stream); //out.write(stream.toByteArray()); //out.close(); resumptionToken = records.getResumptionToken(); if (resumptionToken == null || resumptionToken.length() == 0) { records = null; } else { records = new ListRecords(baseURL, resumptionToken); } } File.umount(); } @SuppressWarnings("unchecked") private ListMetadataFormatsType getMatadataFormats(String baseURL){ ListMetadataFormats formats = null; ListMetadataFormatsType result = null; try { formats = new ListMetadataFormats(baseURL); Unmarshaller u = jc.createUnmarshaller(); String xmlRec = formats.toString(); if(xmlRec.startsWith("<?")){ int offset = xmlRec.indexOf("?>"); xmlRec = xmlRec.substring(offset+2); } InputStream is = new ByteArrayInputStream(xmlRec.getBytes("UTF-8")); JAXBElement<OAIPMHtype> oai = (JAXBElement<OAIPMHtype>)u.unmarshal(is); OAIPMHtype response = oai.getValue(); result = response.getListMetadataFormats(); } catch (IOException e) { e.printStackTrace(); } catch (ParserConfigurationException e) { e.printStackTrace(); } catch (SAXException e) { e.printStackTrace(); } catch (TransformerException e) { e.printStackTrace(); } catch (JAXBException e) { e.printStackTrace(); } return result; } @SuppressWarnings({ "unchecked", "unused" }) private ListMetadataFormatsType getMetadataFormats(String identifier, String baseURL){ ListMetadataFormats formats = null; ListMetadataFormatsType result = null; try { formats = new ListMetadataFormats(baseURL, identifier); Unmarshaller u = jc.createUnmarshaller(); String xmlRec = formats.toString(); if(xmlRec.startsWith("<?")){ int offset = xmlRec.indexOf("?>"); xmlRec = xmlRec.substring(offset+2); } InputStream is = new ByteArrayInputStream(xmlRec.getBytes("UTF-8")); JAXBElement<OAIPMHtype> oai = (JAXBElement<OAIPMHtype>)u.unmarshal(is); OAIPMHtype response = oai.getValue(); result = response.getListMetadataFormats(); } catch (IOException e) { e.printStackTrace(); } catch (ParserConfigurationException e) { e.printStackTrace(); } catch (SAXException e) { e.printStackTrace(); } catch (TransformerException e) { e.printStackTrace(); } catch (JAXBException e) { e.printStackTrace(); } return result; } @SuppressWarnings({ "unchecked" }) private OAIPMHtype getRecordsXML(String response){ OAIPMHtype result = null; Unmarshaller u; try { u = jc.createUnmarshaller(); String xmlRec = response.toString(); if(xmlRec.startsWith("<?")){ int offset = xmlRec.indexOf("?>"); xmlRec = xmlRec.substring(offset+2); } InputStream is = new ByteArrayInputStream(xmlRec.getBytes("UTF-8")); JAXBElement<OAIPMHtype> oai = (JAXBElement<OAIPMHtype>)u.unmarshal(is); result = oai.getValue(); } catch (JAXBException e) { e.printStackTrace(); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } return result; } @SuppressWarnings({ "unchecked", "unused" }) private OAIPMHtype getRecordData(String baseURL, String identifier, String format){ OAIPMHtype result = null; try { GetRecord record = new GetRecord(baseURL, identifier, format); Unmarshaller u = jc.createUnmarshaller(); String xmlRec = record.toString(); if(xmlRec.startsWith("<?")){ int offset = xmlRec.indexOf("?>"); xmlRec = xmlRec.substring(offset+2); } InputStream is = new ByteArrayInputStream(xmlRec.getBytes("UTF-8")); JAXBElement<OAIPMHtype> oai = (JAXBElement<OAIPMHtype>)u.unmarshal(is); result = oai.getValue(); } catch (IOException e) { e.printStackTrace(); } catch (ParserConfigurationException e) { e.printStackTrace(); } catch (SAXException e) { e.printStackTrace(); } catch (TransformerException e) { e.printStackTrace(); } catch (JAXBException e) { e.printStackTrace(); } return result; } @SuppressWarnings({ "unchecked", "unused" }) private ArrayList<String> extractIdentifiers(ListIdentifiers idents){ ArrayList<String> results = new ArrayList<String>(); try { Unmarshaller u = jc.createUnmarshaller(); String xmlRec = idents.toString(); if(xmlRec.startsWith("<?")){ int offset = xmlRec.indexOf("?>"); xmlRec = xmlRec.substring(offset+2); } InputStream is = new ByteArrayInputStream(xmlRec.getBytes("UTF-8")); JAXBElement<OAIPMHtype> oai = (JAXBElement<OAIPMHtype>)u.unmarshal(is); OAIPMHtype response = oai.getValue(); ListIdentifiersType identifiers = response.getListIdentifiers(); Iterator<HeaderType> itr = identifiers.getHeader().iterator(); HeaderType recType = null; int count = 0; while(itr.hasNext()){ recType = itr.next(); results.add(recType.getIdentifier()); } } catch (JAXBException e) { e.printStackTrace(); }catch (UnsupportedEncodingException e) { e.printStackTrace(); } return results; } public ReportI getReporter() { return reporter; } public void setReporter(ReportI reporter) { this.reporter = reporter; } /* @Override public void setServletContext(ServletContext sc) { System.out.println("Mphka edw mesa reee!!!!!"); Random generator = new Random(); //this.fileName=sc.getRealPath(Config.get("oaitmp"))+ "oai"+generator.nextInt()+".zip"; }*/ }