package org.rdfhdt.hdt.rdf.parsers; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.InputStream; import java.util.zip.GZIPInputStream; import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.rdf.RDFParserCallback; import org.rdfhdt.hdt.rdf.RDFParserFactory; import org.rdfhdt.hdt.util.io.ExternalDecompressStream; import org.rdfhdt.hdt.util.io.NonCloseInputStream; /** * Parses a tar file (optionally .tgz or .tar.gz or .tar.bz2) directly, processing each file that contains rdf separately. * * It uses RDFNotation.guess() to guess the format of each specific file. If not recognised, each file of the tar is ignored. * * * @author * */ public class RDFParserTar implements RDFParserCallback { /* (non-Javadoc) * @see hdt.rdf.RDFParserCallback#doParse(java.lang.String, java.lang.String, hdt.enums.RDFNotation, hdt.rdf.RDFParserCallback.Callback) */ @Override public void doParse(String fileName, String baseUri, RDFNotation notation, RDFCallback callback) throws ParserException { try { InputStream input; if(fileName.equals("-")) { input = System.in; } else if(fileName.endsWith(".gz") || fileName.endsWith("tgz")) { // input = new BackgroundDecompressorStream(new GZIPInputStream(new FileInputStream(fileName))); // In theory the BufferedInputStream is not neccessary, but Tar crashes when not using it. input = new BufferedInputStream(new GZIPInputStream(new FileInputStream(fileName))); } else if(fileName.endsWith("bz2") || fileName.endsWith("bz")) { input = new ExternalDecompressStream(new File(fileName), ExternalDecompressStream.PBZIP2); } else { input = new BufferedInputStream(new FileInputStream(fileName)); } this.doParse(input, baseUri, notation, callback); input.close(); } catch (Exception e) { e.printStackTrace(); throw new ParserException(); } } @Override public void doParse(InputStream input, String baseUri, RDFNotation notation, RDFCallback callback) throws ParserException { try { final TarArchiveInputStream debInputStream = (TarArchiveInputStream) new ArchiveStreamFactory().createArchiveInputStream("tar", input); TarArchiveEntry entry = null; // Make sure that the parser does not close the Tar Stream so we can read the rest of the files. NonCloseInputStream nonCloseIn = new NonCloseInputStream(debInputStream); while((entry = (TarArchiveEntry)debInputStream.getNextEntry()) != null) { if(entry.isFile() && !entry.getName().contains("DS_Store")) { try { RDFNotation guessnot = RDFNotation.guess(entry.getName()); System.out.println("Parse from tar: "+entry.getName()+" as "+guessnot); RDFParserCallback parser = RDFParserFactory.getParserCallback(guessnot); parser.doParse(nonCloseIn, baseUri, guessnot, callback); }catch (IllegalArgumentException e1) { e1.printStackTrace(); }catch (ParserException e1) { e1.printStackTrace(); } } } } catch (FileNotFoundException e) { e.printStackTrace(); throw new ParserException(); } catch (Exception e) { e.printStackTrace(); throw new ParserException(); } } }