DatLoader.java example

Explorer
scisoft-core-master
/*
 * Copyright (c) 2012 Diamond Light Source Ltd.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 */

package uk.ac.diamond.scisoft.analysis.io;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;

import org.eclipse.dawnsci.analysis.api.io.ScanFileHolderException;
import org.eclipse.january.IMonitor;
import org.eclipse.january.dataset.Dataset;
import org.eclipse.january.dataset.DatasetFactory;
import org.eclipse.january.dataset.ILazyDataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class loads dat data files where:
 * 
 * 0. The file is ascii/utf-8
 * 1. The header is a section at the start and starting with a #
 * 2. The footer is a section at the end and starting with a #
 * 3. The data lines consist only of numbers separated by whitespace, 2 columns or more.
 * 4. The names of the data sets are defined in the last line of the header. They are separated
 *    by white space but if commas are used as well these will be stripped from the names.
 *    A single space without a comma is not included as data set names may contain spaces.
 *    
 *    
 *    Example: (see also unit tests)
 *    
 *    
# DIAMOND LIGHT SOURCE
# Instrument: I20-XAS Date: Mon, 19 Jul 2010 13:54:16 BST
# Ring energy: 56.66749 GeV
# Initial ring current: 72.23874 mA
# Filling mode: 50.05388
# Wiggler gap selected: 0.00000 mm
#  
# Primary slits:  vertical gap= 0.00000 mm; horizontal gap= 0.00000 mm; vertical offset= 0.00000 mm; horizontal offset= 0.00000 mm
# Secondary slits:  vertical gap= 0.00000 mm; horizontal gap= 0.00000 mm; vertical offset= 0.00000 mm; horizontal offset= 0.00000 mm
# Experimental slits:  vertical gap= 0.00000 mm; horizontal gap= 0.00000 mm; vertical offset= 0.00000 mm; horizontal offset= 0.00000 mm
#  
# Optic Hutch Mirrors Coating: 77.14589850662986
# Monochromator crystal cut: 84.35454401226806
# Incident angle for Harmonic Rejection Mirrors:     90.46 mrad
# Harmonic rejection mirrors coating: 57.76820936133954
#
# Ascii output file name: 'FeKedge_1_15.dat'
# Nexus output file: '/scratch/users/data/2010/cm1903-4/Experiment_1/nexus/FeKedge_1_15.nxs'
# The XML files, ScanParameters, SampleParameters, DetectorParameters, OutputParameters
# are stored in the nexus file.
#
# Sample name: Please set a sample name...
# Please add a description...
#
# Detector: Ge (XSPRESS)
#
# Dark current intensity (Hz): I0    150   It    230   Iref    135
# Dark current has been automatically removed from counts in main scan (I0,It,Iref)
#
# Energy          	I0              	It              	Iref            	ln(I0/It)       	ln(It/Iref)     	FF              	FF/I0           	Integration Time	Element 0       	Element 1       	Element 2       	Element 3       	Element 4       	Element 5       	Element 6       	Element 7       	Element 8       	Element 9       	Element 10      	Element 11      	Element 12      	Element 13      	Element 14      	Element 15      	Element 16      	Element 17      	Element 18      	Element 19      	Element 20      	Element 21      	Element 22      	Element 23      	Element 24      	Element 25      	Element 26      	Element 27      	Element 28      	Element 29      	Element 30      	Element 31      	Element 32      	Element 33      	Element 34      	Element 35      	Element 36      	Element 37      	Element 38      	Element 39      	Element 40      	Element 41      	Element 42      	Element 43      	Element 44      	Element 45      	Element 46      	Element 47      	Element 48      	Element 49      	Element 50      	Element 51      	Element 52      	Element 53      	Element 54      	Element 55      	Element 56      	Element 57      	Element 58      	Element 59      	Element 60      	Element 61      	Element 62      	Element 63      	
6912.0000       	134878.0        	2040284.0       	295077.5        	-2.716          	1.934           	10270610.99     	10555624.6977   	0.5000          	-12522.13       	-39259.72       	240563.80       	-18371.26       	-23133.33       	-26645.71       	-8850.84        	-32861.71       	564452.59       	478611.82       	226456.06       	168938.08       	102221.88       	-1823.24        	-4091.32        	-47298.75       	726493.32       	-52899.04       	-32855.20       	593012.73       	199522.96       	202352.74       	185485.28       	38464.16        	-65122.61       	-10684.71       	-1507.41        	423263.14       	-4528.40        	350455.46       	-21235.13       	-75253.78       	-4209.37        	165226.15       	35165.22        	-78922.48       	-10288.62       	-77462.30       	-6722.40        	354852.04       	418246.19       	-30032.34       	-75643.93       	-3506.16        	92636.17        	-41200.23       	301540.78       	460221.12       	989244.43       	102729.58       	23667.98        	327453.07       	-42080.14       	102140.45       	-22449.90       	407986.95       	601299.66       	573668.62       	157472.43       	551740.78       	258796.50       	117267.94       	87473.48        	512949.58       	
6917.0000       	111091.0        	1680648.0       	295086.5        	-2.717          	1.740           	8069498.24      	9446808.2869    	0.5000          	421871.76       	-17924.10       	-23503.70       	-18711.85       	-21437.16       	97863.36        	-7528.52        	90575.20        	562247.53       	9261.75         	-10753.65       	-6481.09        	511078.54       	-2886.82        	188671.06       	-76532.81       	144969.57       	591234.90       	-77537.48       	-13101.69       	78814.69        	529177.36       	573390.24       	992058.81       	169218.60       	-24463.02       	-76107.58       	144951.34       	492050.24       	-6852.50        	-61568.14       	-19371.64       	268265.42       	-43395.58       	-50644.15       	442379.96       	-25656.06       	-20971.66       	124481.27       	30026.80        	-9798.52        	-44064.34       	62571.57        	629673.44       	440609.56       	274190.73       	-19712.58       	247120.50       	-10970.01       	-18210.44       	-6132.86        	-52240.73       	-19340.80       	169917.83       	-87544.39       	-14745.72       	369390.43       	-2243.35        	78356.91        	-618.21         	-73468.71       	113662.67       	-32029.68       	217965.70       	
6922.0000       	108474.0        	1641726.0       	295080.5        	-2.717          	1.716           	9099845.78      	10278670.8288   	0.5000          	388648.16       	672346.44       	523813.82       	179272.73       	143237.35       	-23828.01       	-26426.86       	-9288.31        	130818.56       	10848.06        	80126.47        	199052.08       	-54048.36       	562720.47       	428897.71       	157519.51       	-49771.42       	-38585.22       	198793.59       	-41702.60       	-55170.40       	328924.55       	-38532.78       	638121.31       	82337.46        	-19266.79       	374849.82       	36781.83        	40839.37        	459302.23       	13776.24        	-60335.77       	-52474.61       	-3892.66        	308830.00       	170752.88       	-12940.66       	533910.29       	-32203.15       	105197.46       	-4579.55        	767756.14       	470172.54       	192161.92       	54539.36        	270370.56       	-25394.73       	-13256.13       	-28438.05       	264131.67       	-5296.58        	154772.18       	-24307.41       	-12717.96       	-3569.43        	164294.45       	216975.65       	389819.72       	-43662.48       	-61559.76       	-5179.87        	94455.12        	68302.55        	-31194.92       	
6927.0000       	108183.0        	1637764.0       	295081.5        	-2.717          	1.714           	6913071.88      	10655300.4342   	0.5000          	252997.35       	146202.54       	-20143.06       	166291.42       	478239.89       	-15245.91       	241066.30       	892037.59       	30055.41        	26569.29        	-18963.93       	-52547.75       	-7778.36        	665512.66       	321108.96       	-7330.62        	-4992.67        	-72195.55       	-33564.24       	36970.05        	-19453.16       	145710.39       	7210.57         	271347.83       	134907.53       	559452.65       	-22718.89       	44485.74        	311239.99       	648890.80       	-43491.14       	365036.73       	103054.76       	85826.55        	-10587.68       	171858.66       	219394.94       	113446.85       	76539.88        	212918.00       	-30767.21       	100902.77       	-57468.26       	-29552.13       	114026.72       	356607.93       	16867.18        	-21828.87       	-28344.87       	-39659.98       	-78626.70       	-7579.99        	-18246.21       	94921.76        	36276.76        	-11083.56       	-12283.48       	81419.01        	-60623.67       	116170.67       	-27800.29       	21024.00        	-39.34          	-600.72         	
6932.0000       	106587.0        	1613746.0       	295083.5        	-2.717          	1.699           	10105332.26     	9346514.1243    	0.5000          	70999.97        	671639.66       	100746.97       	-50466.13       	180646.73       	371747.26       	27040.41        	-38820.46       	210770.91       	109409.64       	426746.80       	-25636.44       	709006.61       	-43193.62       	158345.44       	236952.14       	-26740.35       	-13963.00       	351078.03       	214539.53       	217651.74       	480419.04       	-1349.22        	162630.78       	9663.93         	247932.24       	-53587.21       	-22906.08       	-478.60         	536088.63       	-488.26         	372896.99       	15890.97        	114261.48       	3446.13         	-15841.29       	411066.81       	-74108.10       	-19437.23       	732304.44       	-8511.52        	-20428.18       	112549.69       	64122.86        	23501.66        	-8777.49        	426582.51       	356181.24       	-16.91          	-20362.83       	-60131.00       	548368.66       	-14238.60       	380402.00       	-3020.60        	-2341.35        	88064.37        	924556.90       	-56480.55       	294788.70       	-35404.98       	158376.77       	43115.64        	157527.98       	

It is also legal to have no header section at all and just columns of white space separated numbers.
In this case the columns will be labelled Column_1...Column_N.
 */
public class DatLoader extends AbstractFileLoader {
	
	protected static final Logger logger = LoggerFactory.getLogger(DatLoader.class);
	
	/**
	 * Regular expression for one numeric token: an optionally signed decimal number with an
	 * optional exponent, or the literal text "0."
	 */
	private static final String  FLOAT = "([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|(0\\.)";
	/**
	 * Matches a complete data line, i.e. two or more FLOAT tokens separated by the delimiter.
	 * Built per instance because the delimiter comes from the overridable {@link #getDelimiter()}.
	 */
	transient protected final Pattern DATA;

	/** Header entries of the form "# key: value", filled in by parseHeaders. */
	protected Map<String,String>        metadataMap;
	/** Footer lines of the file. */
	protected List<String>              footer;
	/** Column name to column values; insertion order is the column order in the file. */
	protected Map<String, List<Double>> vals;
	/** Index of the single column requested by name, or -1 when no single column is selected. */
	protected int                       columnIndex;

	/**
	 * Creates a loader with no file set. The collections are created here (and not only in
	 * the file name constructor) so that a loader built with this constructor and then given
	 * a file can parse headers without hitting a null map.
	 */
	public DatLoader() {
		// NOTE: calls the overridable getDelimiter(), so overrides must not rely on subclass state.
		DATA  = Pattern.compile("^(("+FLOAT+")"+getDelimiter()+")+("+FLOAT+")$");
		metadataMap = new HashMap<String,String>(7);
		footer   = new ArrayList<String>(7);
		
		// Important must use LinkedHashMap as order assumes is insertion order.
		vals   = new LinkedHashMap<String, List<Double>>();
	}
	
	/**
	 * Creates a loader for the given file.
	 * 
	 * @param fileName path handed to setFile
	 */
	public DatLoader(final String fileName) {
		this();
		setFile(fileName);
	}

	/**
	 * Forgets everything gathered by a previous read: the metadata object is dropped and
	 * the header map, footer list and column values are emptied. Collections that have
	 * not been created are skipped.
	 */
	@Override
	protected void clearMetadata() {
		if (vals != null) {
			vals.clear();
		}
		if (footer != null) {
			footer.clear();
		}
		if (metadataMap != null) {
			metadataMap.clear();
		}
		metadata = null;
	}

	/**
	 * Loads the file without a progress monitor; delegates to {@link #loadFile(IMonitor)}
	 * with a null monitor.
	 * 
	 * @return the holder with the loaded data
	 * @throws ScanFileHolderException if the file cannot be loaded
	 */
	@Override
	public DataHolder loadFile() throws ScanFileHolderException {
		final IMonitor noMonitor = null;
		return loadFile(noMonitor);
	}
	
	/**
	 * Loads all columns of the dat file currently set on this loader.
	 * 
	 * @param mon monitor handed on to the reading code; the no-argument overload passes null
	 * @return The package which contains the data that has been loaded
	 * @throws ScanFileHolderException
	 */
	@Override
	public DataHolder loadFile(final IMonitor mon) throws ScanFileHolderException {
		// A null column name asks the private overload for every column.
		final String everyColumn = null;
		return loadFile(everyColumn, mon);
	}

	/**
	 * Reads the file set on this loader into a new {@link DataHolder}.
	 * <p>
	 * The header is parsed first (see parseHeaders), then every following line that matches
	 * the numeric {@link #DATA} pattern is treated as one row. Lines that do not match are
	 * skipped; those starting with '#' that come after the first data row are kept in
	 * {@link #footer}. When {@code loadLazily} is set the rows are only counted, not parsed,
	 * and each data set is created through createLazyDataset with the row count as its shape.
	 * 
	 * @param name when not null and found in the header (columnIndex &gt; -1) only that column
	 *             is read; when null every column is read
	 * @param mon  monitor passed to monitorIncrement for every line; a false return aborts the load
	 * @return holder with one data set per entry of {@link #vals}, plus metadata if loadMetadata is set
	 * @throws ScanFileHolderException wrapping any failure while reading or parsing, or if the
	 *         stream cannot be closed after an otherwise successful read
	 */
	private DataHolder loadFile(final String name, final IMonitor mon) throws ScanFileHolderException {
		
		// first instantiate the return object.
		final DataHolder result = new DataHolder();
		
		// then try to read the file given
		BufferedReader in = null;
		// Tells the finally block whether reading succeeded, so that a failure to close the
		// stream never replaces the exception that describes the real problem.
		boolean loaded = false;
		try {
			in = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "UTF-8"));
			
			String line	= parseHeaders(in, name, mon);
			final int columns = vals.size();
			if (columns == 0) throw new ScanFileHolderException("Cannot read header for data set names!");

			// Compile the delimiter once rather than for every row.
			final Pattern delimiter = Pattern.compile(getDelimiter());

			// Read data
			footer.clear();
			int count = 0;

			while (line != null) {
				if (!monitorIncrement(mon)) {
					throw new ScanFileHolderException("Loader cancelled during reading!");
				}
				
				line = line.trim();
				if (DATA.matcher(line).matches()) {

					if (!loadLazily) {
						final String[] values = delimiter.split(line);
						if (columnIndex>-1 && name!=null) {
							// Single named column requested and located in the header.
							final String value = values[columnIndex];
							vals.get(name).add(Utils.parseDouble(value.trim()));
						} else {
							if (values.length != columns) {
								throw new ScanFileHolderException("Data and header must be the same size!");
							}
							// vals iterates in column order, so walk it in step with the row.
							final Iterator<String> it = vals.keySet().iterator();
							for (String value : values) {
								vals.get(it.next()).add(Utils.parseDouble(value.trim())); 
							}
						}
					}
					count++;
					
				} else if (count > 0 && line.startsWith("#")) {
					// A numeric line can never start with '#', so footer lines have to be
					// recognised here, outside the data match. Which lines count as data
					// is unchanged.
					footer.add(line);
				}
				
				line = in.readLine();
			}

			for (final String n : vals.keySet()) {
				ILazyDataset data;
				if (loadLazily) {
					data = createLazyDataset(n, -1, new int[] { count }, new DatLoader(fileName));
				} else {
					data = DatasetFactory.createFromList(vals.get(n));
					data.setName(n);
				}
				result.addDataset(n, data);
			}

			if (loadMetadata) {
				createMetadata();
				result.setMetadata(metadata);
			}
			loaded = true;
			return result;
			
		} catch (Exception e) {
			throw new ScanFileHolderException("DatLoader.loadFile exception loading  " + fileName, e);
			
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException e) {
					if (loaded) {
						throw new ScanFileHolderException("Cannot close stream from file  " + fileName, e);
					}
					// The load already failed; report the close problem without hiding that failure.
					logger.warn("Cannot close stream from file  " + fileName, e);
				}
			}
		}
	}

	/**
	 * Regular expression that separates the values on a data line. It is used both to
	 * build the data line pattern and to split each line. May be overridden to support
	 * different file formats.
	 * 
	 * @return the delimiter, by default one or more whitespace characters
	 */
	protected String getDelimiter() {
		final String whitespaceRun = "\\s+";
		return whitespaceRun;
	}

	/**
	 * Points this loader at {@code path} and returns the data set called {@code name}.
	 * <p>
	 * TODO Instead of **loading everything each time**, we should get the column
	 * number from the name, and write the algorithm to extract
	 * 
	 * @param path file to read; passed to setFile
	 * @param name name of the data set to return
	 * @param mon  monitor handed on to the load
	 * @return whatever the holder returns for {@code name}
	 * @throws Exception if loading fails
	 */
	public Dataset loadSet(final String path, final String name, final IMonitor mon) throws Exception {
		setFile(path);
		return loadFile(name, mon).getDataset(name);
	}

	/**
	 * Loads the whole file and returns only the requested data sets.
	 * There are no efficiency gains in using this method, it reads everything in and garbage
	 * collects what is not needed.
	 * <p>
	 * TODO Instead of **loading everything each time**, we should get the column
	 * number from the name, and write the algorithm to extract
	 * 
	 * @param path  file to read; passed to setFile
	 * @param names names of the data sets to keep
	 * @param mon   monitor handed on to the load
	 * @return the holder's lazy map reduced to the keys contained in {@code names}
	 * @throws Exception if loading fails
	 */
	public Map<String,ILazyDataset> loadSets(String path, List<String> names, IMonitor mon) throws Exception {
		setFile(path);
		final Map<String,ILazyDataset> wanted = loadFile(mon).toLazyMap();
		// Drop every entry that was not asked for.
		wanted.keySet().retainAll(names);
		return wanted;
	}	


	/**
	 * Reads only the header of the file and builds the metadata from it, using the number
	 * of remaining lines as the approximate size of every data set.
	 * <p>
	 * The file is decoded as UTF-8, the same encoding the data loading code uses, so header
	 * text is read the same way by both paths.
	 * 
	 * @param mon monitor handed on to parseHeaders
	 * @throws IOException if the file cannot be read, or wrapping any other failure raised
	 *         while parsing the header
	 */
	@Override
	public void loadMetadata(final IMonitor mon) throws IOException {

		final BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "UTF-8"));
		// Starts at one: parseHeaders has already consumed the first line after the header.
		int count = 1;
		try {
			try {
				parseHeaders(br, null, mon);
			} catch (IOException e) {
				// Already the declared type, no need to wrap it again.
				throw e;
			} catch (Exception e) {
				throw new IOException(e);
			}
			// We assume the rest of the lines not starting with # are all
			// data lines in getting the meta data. We do not parse these lines.
			String line=null;
			while ((line = br.readLine()) != null) {	
				line = line.trim();
				if (line.startsWith("#")) break;
				count++;
			}
			
		} finally {
			br.close();
		}
		createMetadata(count);
	}
	
	/**
	 * Builds the metadata using the number of values actually loaded for each data set.
	 */
	private void createMetadata() {
		// -1 means there is no size estimate to fall back on.
		final int noEstimate = -1;
		createMetadata(noEstimate);
	}
	
	/**
	 * Creates the metadata object for the current file from the header map and registers
	 * a size for every known data set.
	 * 
	 * @param approxSize size to report for a data set that has no loaded values; only used
	 *                   when it is greater than -1
	 */
	private void createMetadata(int approxSize) {
		metadata = new ExtendedMetadata(new File(fileName));
		metadata.setMetadata(metadataMap);
		for (Entry<String, List<Double>> column : vals.entrySet()) {
			final int loadedSize = column.getValue().size();
			final boolean useEstimate = approxSize > -1 && loadedSize < 1;
			metadata.addDataInfo(column.getKey(), useEstimate ? approxSize : loadedSize);
		}
	}

	/** Matches a "#S &lt;number&gt; ..." header line. */
	protected static final Pattern SCAN_LINE = Pattern.compile("#S \\d+ .*");
	/** Matches a "#D &lt;weekday&gt; &lt;month&gt; &lt;day&gt; ..." header line. */
	protected static final Pattern DATE_LINE = Pattern.compile("#D (Sun|Mon|Tue|Wed|Thu|Fri|Sat) [a-zA-Z]+ \\d+ .*");
	/**
	 * This method parses the headers, i.e. the leading lines that start with '#'
	 * (blank lines in between are skipped), and then sets up the column names.
	 * <p>
	 * It throws an exception if the first line starts with '&amp;', which is taken to mean
	 * an SRS file. It also throws if a "#S" line is directly followed by a "#D" line,
	 * because that is taken to be a multi-scan spec file.
	 * 
	 * Example of multi-scan spec file characteristic lines:
	   #S 1  ascan  pvo -0.662 1.338  20 0.1
       #D Sat Apr 02 10:19:13 2011
	 * <p>
	 * Header lines containing ':' are stored in the metadata map as key/value pairs.
	 * Column names come from the last header line, or are generated as Column_1...Column_N
	 * when there is no header at all.
	 * 
	 * @param in reader positioned at the start of the file
	 * @param name name of the single data set wanted, or null for all of them
	 * @param mon monitor; a false return from monitorIncrement aborts the parse
	 * @return the first line after the header; never null
	 * @throws Exception if the file is empty, has no line after the header, looks like an
	 *         SRS or multi-scan spec file, the load is cancelled, or reading fails
	 */
	protected String parseHeaders(final BufferedReader in, final String name, IMonitor mon) throws Exception {
		
		String line = in.readLine();
		if (line == null)
			throw new ScanFileHolderException("No lines found");
		if (line.trim().startsWith("&")) throw new ScanFileHolderException("Cannot load SRS files with DatLoader!");
		metadataMap.clear();
		vals.clear();
		
		final List<String> header = new ArrayList<String>(31);
		
		boolean wasScanLine = false;
		// The null check stops at the end of the file instead of dereferencing a missing line.
		for (; line != null && (line.startsWith("#") || "".equals(line.trim())); line = in.readLine()) {
			
			final String trimmed = line.trim();
			if ("".equals(trimmed)) continue;

			if (!monitorIncrement(mon)) {
				throw new ScanFileHolderException("Loader cancelled during reading!");
			}
			
			if (wasScanLine && DATE_LINE.matcher(trimmed).matches()) {
				throw new ScanFileHolderException("This file is a multi-scan spec file - use SpecLoader instead!");
			}
			wasScanLine = SCAN_LINE.matcher(trimmed).matches();
			
			header.add(line);
			
			// An earlier substring based split on '=' or ':' caused problems with some of
			// B18's files, so the line is split on ':' and the value is put back together.
			if (line.contains(":")) {
				String[] parts = line.split(":");
				if (parts.length > 1) {
					String key = parts[0].replace("#", "");
					String value = parts[1];
					for (int p = 2; p < parts.length; p++) {
						value = value+":"+parts[p];
					}
					metadataMap.put(key.trim(),value.trim());
				}
			}
		}

		if (line == null) {
			// Only header or blank lines were found, so there is nothing to name columns from.
			throw new ScanFileHolderException("No data lines found after the header");
		}

		if (header.isEmpty()) {
			createDefaultHeaders(line);
			return line;
		}

		createHeaders(header, line, name);
		
		return line;
	}
	
	/**
	 * Sets up {@link #vals} (and, for a single named column, {@link #columnIndex}) from the
	 * last header line.
	 * <p>
	 * When {@code name} is given, only that entry is added to the map and columnIndex is set
	 * to the position of the name among the header names, or -1 if it is not there. The header
	 * line is split with the same expression as in createValues, so a name produced there is
	 * found at the same position here. When {@code name} is null all names are added through
	 * createValues. In both cases, if the first data line has more values than there are
	 * names, the extra columns are called "Unknown" followed by their index.
	 * 
	 * @param header header lines; the last one holds the column names
	 * @param line first data line, used to count the columns; null is treated as no columns
	 * @param name name of the single data set wanted, or null for all of them
	 */
	protected void createHeaders(final List<String> header, final String line, final String name) {
		
		final String lastHeaderLine = header.get(header.size()-1);
		final String[] values = line == null ? new String[0] : line.trim().split(getDelimiter());
		
		if (name!=null) {
			this.columnIndex = -1;
			vals.put(name, new ArrayList<Double>(89));
			
			// Two or more spaces, a comma and zero or more spaces, or a tab
			List<String> headers = new ArrayList<String>(Arrays.asList(lastHeaderLine.substring(1).trim().split("\\s{2,}|\\,\\s*|\\t")));
			headers = removeQuotations(headers);
			
			for (int j = headers.size(); j < values.length; j++) {
				headers.add("Unknown"+j);
			}
			// First position of the requested name, or -1 when the header does not contain it.
			columnIndex = headers.indexOf(name);
			
		} else {
			createValues(vals, lastHeaderLine);

			// Check first line and headers are the same, sometimes the value names are not
			// provided in parsable syntax
			for (int j = vals.size(); j < values.length; j++) {
				vals.put("Unknown"+j, new ArrayList<Double>(89));
			}
		}
	}

	/**
	 * Applies {@link #removeQuotations(String)} to every name in the list.
	 * 
	 * @param header names to clean
	 * @return a new list with the cleaned names, or {@code header} itself if it is null or empty
	 */
	private List<String> removeQuotations(List<String> header) {
		
		final boolean nothingToClean = header == null || header.isEmpty();
		if (nothingToClean) return header;
		
		final List<String> cleaned = new ArrayList<String>(header.size());
		final Iterator<String> names = header.iterator();
		while (names.hasNext()) {
			cleaned.add(removeQuotations(names.next()));
		}
		return cleaned;
	}
	
	/**
	 * Trims the name and removes one leading and one trailing double quote, if present.
	 * 
	 * @param name column name as found in the header
	 * @return the name without surrounding whitespace and quotes
	 */
	protected String removeQuotations(String name) {
		name = name.trim();
		if (name.startsWith("\"")) name = name.substring(1);
		// Drop only the quote itself; the character before it belongs to the name.
		if (name.endsWith("\""))   name = name.substring(0, name.length()-1);
		return name;
	}

	/**
	 * Names the columns Column_1...Column_N for a file without a header, where N is the
	 * number of delimiter-separated values on the given line. Also resets
	 * {@link #columnIndex} to -1.
	 * 
	 * @param line first data line of the file
	 */
	protected void createDefaultHeaders(String line) {
		final int columns = line.trim().split(getDelimiter()).length;
		this.columnIndex = -1;
		for (int number = 1; number <= columns; number++) {
			vals.put("Column_"+number, new ArrayList<Double>(89));
		}
	}

	/**
	 * Adds an empty value list to the map for every column name on the header line.
	 * The first character of the line (the '#') is dropped. Names are separated by two or
	 * more whitespace characters, by a comma followed by zero or more whitespace characters,
	 * or by a tab; surrounding quotes are removed from each name.
	 * 
	 * @param v map to fill, keyed by column name
	 * @param header header line holding the column names
	 */
	protected void createValues(Map<String, List<Double>> v, String header) {
		
		final String names = header.substring(1).trim();
		
		for (final String raw : names.split("\\s{2,}|\\,\\s*|\\t")) {
			v.put(removeQuotations(raw), new ArrayList<Double>(89));
		}
	}
}