/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.dataimport.Medline;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.zip.GZIPInputStream;
import org.erasmusmc.utilities.ReadTextFile;
import org.erasmusmc.utilities.StringUtilities;
import org.erasmusmc.utilities.WriteTextFile;
/**
 * Builds a tab-separated lookup file that maps each PMID to the name of the
 * file it was found in.
 *
 * <p>Every {@code .gz} file below {@link #dir} is decompressed next to the
 * archive, scanned line by line for {@code <PMID>...</PMID>} elements and
 * deleted again. For each PMID found, one line of the form
 * {@code pmid<TAB>filename} is written to {@link #outputFile}, where
 * {@code filename} is the name of the decompressed file (the archive name
 * without its {@code .gz} suffix).
 */
public class GeneratePMID2XMLFile {

  /** Root folder that is searched recursively for {@code .gz} files. */
  public static String dir = "/home/data/Medline/2010/";

  /** Path of the lookup file that is written. */
  public static String outputFile = "/home/temp/pmid2xml.txt";

  /** Number of additional attempts made when a temporary file cannot be deleted. */
  public static int maxRetries = 3;

  /** Pause between two deletion attempts, in milliseconds. */
  private static final long DELETE_RETRY_DELAY_MS = 1000;

  /** Size of the buffer used when copying streams, in bytes. */
  private static final int COPY_BUFFER_SIZE = 8192;

  /**
   * Command-line entry point.
   *
   * @param args passed on to the constructor, which currently ignores them
   * @throws Exception if listing, decompressing or writing fails
   */
  public static void main(String[] args) throws Exception {
    new GeneratePMID2XMLFile(args);
  }

  /**
   * Runs the complete conversion using the static {@link #dir} and
   * {@link #outputFile} settings.
   *
   * <p>Files are handled in the sorted order returned by
   * {@link #getFileListing(File)}. To resume an interrupted run, skip entries
   * in the loop below until a known file name has been seen.
   *
   * @param args currently ignored; the folder and output file are taken from
   *     the static fields
   * @throws Exception if the folder cannot be listed, an archive cannot be
   *     decompressed, or the thread is interrupted while waiting to retry a
   *     deletion
   */
  public GeneratePMID2XMLFile(String[] args) throws Exception {
    List<File> fileList = getFileListing(new File(dir));
    WriteTextFile out = new WriteTextFile(outputFile);
    try {
      for (File candidate : fileList) {
        String fn = candidate.getAbsolutePath();
        // The listing also contains directories; only regular archives qualify.
        if (!candidate.isFile() || !fn.endsWith(".gz")) {
          continue;
        }
        File unpacked = new File(fn.substring(0, fn.length() - 3));
        try {
          decompress(fn, unpacked);
          process(out, unpacked.getAbsolutePath(), unpacked.getName());
        } finally {
          // Remove the temporary copy even when decompression or scanning failed.
          if (unpacked.exists()) {
            deleteWithRetries(unpacked);
          }
        }
      }
    } finally {
      // Close the output even on failure so lines written so far are not lost.
      out.close();
    }
  }

  /**
   * Decompresses a gzip archive into the given target file, closing both
   * streams whether or not the copy succeeds.
   *
   * @param archivePath path of the {@code .gz} file to read
   * @param target file that receives the decompressed content
   * @throws IOException if the archive cannot be read or the target cannot be written
   */
  private static void decompress(String archivePath, File target) throws IOException {
    InputStream input = new FileInputStream(archivePath);
    try {
      // If the gzip header is invalid the constructor throws and the raw file
      // stream is still closed by the finally block below.
      input = new GZIPInputStream(input);
      OutputStream output = new FileOutputStream(target);
      try {
        copyStream(input, output);
      } finally {
        output.close();
      }
    } finally {
      input.close();
    }
  }

  /**
   * Deletes a file, retrying up to {@link #maxRetries} times with a pause
   * between attempts, and reports on standard error when it finally gives up.
   *
   * @param file the file to delete
   * @throws InterruptedException if the thread is interrupted while pausing
   */
  private static void deleteWithRetries(File file) throws InterruptedException {
    boolean deleted = file.delete();
    for (int retries = 0; !deleted && retries < maxRetries; retries++) {
      System.err.println("Unable to delete " + file.getAbsolutePath() + ", retrying....");
      Thread.sleep(DELETE_RETRY_DELAY_MS);
      deleted = file.delete();
    }
    if (!deleted) {
      System.err.println("Giving up on deleting " + file.getAbsolutePath());
    }
  }

  /**
   * Scans one text file and writes a {@code pmid<TAB>filename} line for every
   * input line that yields a non-empty PMID.
   *
   * @param out destination of the lookup lines
   * @param fullpath path of the file to scan
   * @param filename name recorded next to each PMID
   */
  private static void process(WriteTextFile out, String fullpath, String filename) {
    System.out.println("Processing " + fullpath);
    // NOTE(review): ReadTextFile is never closed explicitly here; confirm that
    // it releases its file handle once iteration finishes.
    ReadTextFile in = new ReadTextFile(fullpath);
    for (String line : in) {
      // Presumably findBetween returns "" when the markers are absent - verify.
      String pmid = StringUtilities.findBetween(line, "<PMID>", "</PMID>");
      if (!pmid.equals("")) {
        out.writeln(pmid + "\t" + filename);
      }
    }
  }

  /**
   * Recursively lists everything below a directory, sub-directories included,
   * sorted in the natural order of {@link File}.
   *
   * @param aStartingDir directory to start from
   * @return sorted list of all files and directories found below it
   * @throws FileNotFoundException if a directory does not exist or its
   *     content cannot be listed
   */
  static private List<File> getFileListing(File aStartingDir) throws FileNotFoundException {
    List<File> result = new ArrayList<File>();
    collectFiles(aStartingDir, result);
    // A single sort of the complete list replaces a sort per recursion level.
    Collections.sort(result);
    return result;
  }

  /**
   * Adds every entry below the given directory to the sink, depth first.
   *
   * @param directory directory to list
   * @param sink list that receives the entries
   * @throws FileNotFoundException if the directory does not exist or cannot be listed
   */
  private static void collectFiles(File directory, List<File> sink) throws FileNotFoundException {
    validateDirectory(directory);
    File[] entries = directory.listFiles();
    if (entries == null) {
      // listFiles() returns null when an I/O error occurs.
      throw new FileNotFoundException("Unable to list directory: " + directory);
    }
    for (File entry : entries) {
      sink.add(entry); // always add, even if directory
      // Only descend into real directories; broken links and special files
      // are neither regular files nor directories.
      if (entry.isDirectory()) {
        collectFiles(entry, sink);
      }
    }
  }

  /**
   * Checks that the argument is an existing, readable directory.
   *
   * @param aDirectory directory to check
   * @throws FileNotFoundException if it does not exist
   * @throws IllegalArgumentException if it is null, not a directory or not readable
   */
  static private void validateDirectory(File aDirectory) throws FileNotFoundException {
    if (aDirectory == null) {
      throw new IllegalArgumentException("Directory should not be null.");
    }
    if (!aDirectory.exists()) {
      throw new FileNotFoundException("Directory does not exist: " + aDirectory);
    }
    if (!aDirectory.isDirectory()) {
      throw new IllegalArgumentException("Is not a directory: " + aDirectory);
    }
    if (!aDirectory.canRead()) {
      throw new IllegalArgumentException("Directory cannot be read: " + aDirectory);
    }
  }

  /**
   * Copies all remaining bytes from the source to the destination and flushes
   * the destination once at the end. Neither stream is closed.
   *
   * @param source stream to read from
   * @param dest stream to write to
   * @throws IOException if reading or writing fails; the error is propagated
   *     so that a partially copied file is never mistaken for a complete one
   */
  private static final void copyStream(InputStream source, OutputStream dest) throws IOException {
    byte[] buffer = new byte[COPY_BUFFER_SIZE];
    int bytes;
    while ((bytes = source.read(buffer)) != -1) {
      if (bytes == 0) {
        // A conforming stream never returns 0 for a non-empty buffer; fall
        // back to a blocking single-byte read instead of spinning.
        int single = source.read();
        if (single < 0) {
          break;
        }
        dest.write(single);
        continue;
      }
      dest.write(buffer, 0, bytes);
    }
    dest.flush();
  }
}