/**
*Copyright 2016 Nabarun Mondal
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.noga.njexl.lang.extension.dataaccess;
import com.noga.njexl.lang.Interpreter;
import com.noga.njexl.lang.JexlArithmetic;
import com.noga.njexl.lang.JexlException;
import com.noga.njexl.lang.Script;
import com.noga.njexl.lang.extension.datastructures.ListSet;
import com.noga.njexl.lang.extension.SetOperations;
import com.noga.njexl.lang.extension.datastructures.Tuple;
import com.noga.njexl.lang.extension.TypeUtility;
import com.noga.njexl.lang.extension.datastructures.XList;
import com.noga.njexl.lang.extension.iterators.RangeIterator;
import com.noga.njexl.lang.extension.oop.ScriptClassBehaviour.Arithmetic;
import java.io.BufferedReader;
import java.io.File;
import java.nio.file.Files;
import java.util.*;
import java.util.regex.Pattern;
/**
* A generic Data Matrix class to manipulate on data
* Created by noga on 03/04/15.
*/
public class DataMatrix {
/**
* A Generic diff structure for any sort of matrices
*/
public static class MatrixDiff{
/**
* Is this diff qualifies as a diff or not
* @return true if it is, false if it is not
*/
public boolean diff(){
if ( lr != null && !lr.isEmpty() ){
return true;
}
if ( rl!= null && !rl.isEmpty() ){
return true;
}
if ( id!= null && !id.isEmpty() ){
return true;
}
return false ;
}
/**
* where the key list : left - right did not match
*/
public List lr ;
/**
* The the key list : right - left did not match
*/
public List rl ;
/**
* The key intersection portion if any,
* but where the equality broke down
*/
public List id ;
@Override
public String toString(){
return String.format("%s : < %s %s %s>", diff(), lr, rl, id );
}
}
/**
* A standard interface to load data matrices from data sources
*/
public interface DataLoader{
/**
* loads a data matrix
* @param location the location from where it should load it
* @param args corresponding args
* @return a data matrix
* @throws Exception if fails
*/
DataMatrix matrix(String location,Object...args) throws Exception;
}
/**
* A standard implementation of data loader,
* used to load text like files
*/
public static class TextDataLoader implements DataLoader{
@Override
public DataMatrix matrix(String location, Object... args) throws Exception {
String sep="\t";
boolean header = true ;
if ( args.length > 0 ){
sep = args[0].toString();
if ( args.length > 1 ){
header = TypeUtility.castBoolean(args[1], false);
}
}
BufferedReader reader = (BufferedReader)TypeUtility.fopen(new File(location).getPath() );
ListSet cols = null;
String line;
if ( header ){
line = reader.readLine();
String[] words = line.split(sep);
cols = new ListSet(Arrays.asList(words));
if ( words.length != cols.size() ){
String message = "Some column names are not unique !! Repeated columns :\n %s \n, Use with header-less mode" ;
Object diff = SetOperations.list_d(words,cols );
throw new Exception(String.format(message, diff )) ;
}
}
ArrayList rows = new ArrayList();
int colSize = (cols != null)? cols.size() : 0 ;
while ( true ){
line = reader.readLine();
if ( line == null ){ break; }
String[] words = line.split( sep,-1);
if ( header && words.length != colSize ){
String message = "Invalid no of columns in data row! Expected :%d, Actual %d" ;
throw new Exception(String.format(message, cols.size(), words.length )) ;
}
List row = Arrays.asList(words);
rows.add(row);
}
reader.close();
System.gc();
System.runFinalization();
if ( header ){
return new DataMatrix(rows,cols);
}
return new DataMatrix(rows);
}
}
/**
* Various registered data loaders
*/
public static final HashMap<Pattern,DataLoader> dataLoaders = new HashMap<>();
static {
final TextDataLoader textDataLoader = new TextDataLoader() ;
dataLoaders.put(Pattern.compile(".+\\.tsv$",Pattern.CASE_INSENSITIVE), textDataLoader);
dataLoaders.put(Pattern.compile(".+\\.csv$",Pattern.CASE_INSENSITIVE), textDataLoader);
dataLoaders.put(Pattern.compile(".+\\.txt$",Pattern.CASE_INSENSITIVE), textDataLoader);
}
/**
* The columns of the data matrix
*/
public final ListSet<String> columns;
/**
* The actual data rows, they are not including the column
*/
public final List<List> rows;
/**
* Names mapping for tuple creation
*/
public final Map<String,Integer> names;
/**
* For comparison, one needs to generate the row key.
* If confused, see the @link{http://en.wikipedia.org/wiki/Candidate_key}
* They generated and gets stored here
*/
public Map<String,List<Integer>> keys;
/**
* The Factory of data matrix
* @param location from where to be loaded
* @param args the corresponding args
* @return a data matrix
* @throws Exception in case fails
*/
public static DataMatrix loc2matrix(String location, Object... args) throws Exception{
StringBuffer buffer = new StringBuffer();
for ( Pattern p : dataLoaders.keySet() ){
if ( p.matcher(location).matches()){
DataLoader dataLoader =dataLoaders.get(p);
return dataLoader.matrix(location,args);
}
buffer.append(p).append(";");
}
System.err.printf("No pattern matched for [%s] for Data Load!\n Registered Patterns are : %s\n",
location, buffer);
return null;
}
/**
* Creates a data matrix
* @param rows the rows of data
* @param cols the column headers
*/
public DataMatrix(List<List> rows,ListSet<String> cols){
this.rows = rows;
this.columns = cols;
this.names = new HashMap<>();
for ( int i = 0 ; i < this.columns.size(); i++ ){
this.names.put( this.columns.get(i),i);
}
}
/**
* This would be created column header free
* @param rows only rows of data
*/
public DataMatrix(List<List> rows){
this.rows = rows;
this.columns = new ListSet<>();
this.names = new HashMap<>();
for ( int i = 0 ; i < rows.get(0).size(); i++ ){
String si = String.valueOf(i) ;
this.columns.add(si);
this.names.put(si,i);
}
}
/**
* A row, as a tuple structure.
* See @link{http://en.wikipedia.org/wiki/Tuple}
* @param r the row index
* @return the tuple corresponding to the row
*/
public Tuple tuple(int r){
if ( r >= rows.size() ){
return null;
}
Tuple t = new Tuple(names, rows.get(r));
return t;
}
// short for the tuple
public Tuple t(int r){
return tuple(r);
}
/**
* A column
* @param c the column index
* @return the whole column row by row
*/
public List c(int c){
return c(c,null);
}
/**
* Selects only specific rows,
* @param c the column index
* @param agg these rows will be selected
* @return list of selected row values for the column
*/
public List c(int c, Object agg ){
if ( agg != null ){
agg = TypeUtility.from(agg);
for ( int i = 0 ; i < ((List)agg).size() ; i++ ){
Object o = ((List)agg).get(i);
((List)agg).set(i, TypeUtility.castInteger( o ));
}
}
XList l = new XList();
for ( int r = 0; r < rows.size() ; r++ ){
if ( agg == null ||
agg!= null && ((List)agg).contains( r )){
Object value = rows.get(r).get(c);
l.add(value);
}
}
return l;
}
private static class SelectSetup{
private Interpreter.AnonymousParam anon;
private ListSet<Integer> colIndexes;
}
private SelectSetup setup(Object... args) throws Exception{
SelectSetup selectSetup = new SelectSetup();
selectSetup.colIndexes = new ListSet<>();
// no point going further here... always will be empty
if ( rows.isEmpty() ){ return selectSetup ; }
if ( args.length > 0 ){
if ( args[0] instanceof Interpreter.AnonymousParam){
selectSetup.anon = (Interpreter.AnonymousParam)args[0];
args = TypeUtility.shiftArrayLeft(args,1);
}
if ( args.length == 1 && JexlArithmetic.isListOrSetOrArray( args[0] ) ){
args = TypeUtility.array(args[0]);
}
}
if ( args.length == 0 ){
//select all
for ( int i = 0 ; i < columns.size();i++ ){
selectSetup.colIndexes.add(i);
}
}else {
// select specific
for (int i = 0; i < args.length; i++) {
int pos = -1;
if (args[i] instanceof Integer) {
pos = (int) args[i];
} else if ( args[i] instanceof RangeIterator){
Iterator<Long> itr = (RangeIterator)args[i];
while(itr.hasNext()){
selectSetup.colIndexes.add(itr.next().intValue());
}
continue;
}
else {
pos = columns.indexOf(args[i]);
}
if (pos < 0) {
throw new Exception("No such header : " + args[i]);
}
selectSetup.colIndexes.add(pos);
}
}
return selectSetup;
}
/**
* The revered select function
* @param args parameters
* @return selected rows
* @throws Exception in error
*/
public List select(Object...args) throws Exception {
if ( args.length == 0 ){
return rows;
}
SelectSetup setup = setup(args);
// now do the stuff
List rs = _select_op_(setup.anon, setup.colIndexes);
return rs;
}
/**
* The sub-matrix function
* @param args parameters
* @return a data matrix
* @throws Exception in error
*/
public DataMatrix matrix(Object...args) throws Exception {
if ( args.length == 0 ){
return this; // risky? May be. I don't know
}
SelectSetup setup = setup(args);
ListSet nColumns = new ListSet();
for ( int c : setup.colIndexes ){
nColumns.add( columns.get(c) );
}
List rs = _select_op_(setup.anon,setup.colIndexes);
return new DataMatrix(rs,nColumns);
}
private List _select_op_(Interpreter.AnonymousParam anon, ListSet<Integer> colIndexes ) throws Exception{
// now do the stuff
XList rs = new XList();
HashMap<Integer,Tuple> selectedRows = new HashMap<>();
for ( int i = 0 ; i < rows.size();i++ ){
boolean broken = false ;
if ( anon != null ){
//process this ...
anon.setIterationContextWithPartial(this, tuple(i),i,rs);
Object ret = anon.execute();
if ( ret instanceof JexlException.Break ){
if ( ((JexlException.Break) ret).hasValue ) {
broken = true;
}else{
break;
}
}
if ( ret instanceof JexlException.Continue ){
// if the continue statement has any value, try to see what needs to be done
ret = ((JexlException.Continue)ret).hasValue ;
}
if ( !TypeUtility.castBoolean(ret,false)){
continue;
}
// get back the values if over written ?
selectedRows.put(i,(Tuple)anon.getVar(Script._ITEM_));
}
ArrayList cs = new ArrayList();
List<String> dataRow = rows.get(i);
for ( int c : colIndexes ){
Object val = dataRow.get(c) ;
if ( anon == null ){
cs.add(val);
}else {
Object var = selectedRows.get(i).get(c);
// avoid stupidity, add Tuple value always
cs.add(var);
}
}
rs.add(cs);
if ( broken ){ break; }
}
return rs;
}
/**
* This is how you set key to a data matrix
* @param args parameters
* @return a keyed matrix
* @throws Exception in error
*/
public DataMatrix keys(Object...args) throws Exception{
keys = new HashMap<>();
SelectSetup setup = setup(args);
// now do the stuff
for ( int i = 0 ; i < rows.size();i++ ){
String key ;
if ( setup.anon != null ){
//process this ...
setup.anon.setIterationContext(this, tuple(i),i);
Object ret = setup.anon.execute();
key = ret.toString();
}
else{
String sep = "Ø";
StringBuffer buf = new StringBuffer();
for ( int j = 0 ; j < columns.size(); j++ ){
if ( setup.colIndexes.contains(j) ){
buf.append( rows.get(i).get(j) ).append(sep) ;
}
}
key = buf.toString();
}
if ( !keys.containsKey(key)){
keys.put(key, new ArrayList<>());
}
keys.get(key).add(i);
}
return this;
}
/**
* This is how you aggregate rows, to merge them into effective single row
* @param args parameters
* @return an aggregated matrix based on keys
* @throws Exception in error
*/
public DataMatrix aggregate(Object...args) throws Exception {
if ( keys == null ){
keys();
}
SelectSetup setup = setup(args);
HashSet<Integer> colIndices = setup.colIndexes;
Interpreter.AnonymousParam anon = setup.anon;
ListSet aColumns = new ListSet();
for ( int c : colIndices ) {
// add this column
aColumns.add(columns.get(c));
}
HashMap<String,List<Integer>> aKey = new HashMap<>();
ArrayList aRows = new ArrayList();
// aggregate rows
int rowNum = 0 ;
for ( String key : keys.keySet() ){
XList rowData = new XList();
List<Integer> agg = keys.get(key);
for ( int c : colIndices ){
List data = c(c,agg);
Object value ;
if ( anon != null ){
anon.setIterationContext(this,data,c);
Object ret = anon.execute();
value = ret ;
}else{
Object[] a = TypeUtility.sqlmath(data);
value = a[2] ;
}
//create a row with aggregated rows for the column
rowData.add(value);
}
ArrayList r = new ArrayList();
r.add(rowNum);
aRows.add(rowData);
aKey.put(key, r);
rowNum++;
}
DataMatrix dm = new DataMatrix(aRows,aColumns);
dm.keys = aKey ;
return dm;
}
/**
* The API to do a matrix key diff
* @param d1 1st data matrix
* @param d2 2nd data matrix
* @return two sets in array, symmetric delta of d1 and d2 [ (d1-d2) , (d2-d1) ]
* @throws Exception in error
*/
public static Set[] key_diff( DataMatrix d1, DataMatrix d2 ) throws Exception{
if ( d1.keys == null ){
d1.keys();
}
if ( d2.keys == null ){
d2.keys();
}
Set[] retVal = new Set[2];
retVal[0] = SetOperations.set_d(d1.keys.keySet(), d2.keys.keySet());
retVal[1] = SetOperations.set_d(d2.keys.keySet(), d1.keys.keySet() );
return retVal ;
}
@Override
public String toString() {
return String.format( "DataMatrix<Cols:%d , Rows:%d>", columns.size(), rows.size() );
}
/**
* Matrix diff, generates a MatrixDiff structure
* @param args parameters
* @return a matrix diff
* @throws Exception in error
*/
public static MatrixDiff diff2(Object... args) throws Exception {
Interpreter.AnonymousParam anon = null;
if ( args.length == 0 ){
return null;
}
if ( args.length > 0 ){
if ( args[0] instanceof Interpreter.AnonymousParam){
anon = (Interpreter.AnonymousParam)args[0];
args = TypeUtility.shiftArrayLeft(args,1);
}
}
DataMatrix left = (DataMatrix)args[0];
DataMatrix right = (DataMatrix)args[1];
if ( left.keys == null ){
left.keys();
}
if ( right.keys == null ){
right.keys();
}
MatrixDiff matrixDiff = new MatrixDiff();
// Keys which are not in left but in right
Set[] diffKey = key_diff(left,right);
ArrayList d1 = new ArrayList();
for ( Object i : diffKey[0] ){
d1.add( left.keys.get(i) );
}
ArrayList d2 = new ArrayList();
for ( Object i : diffKey[1] ){
d2.add( right.keys.get(i) );
}
matrixDiff.lr = d1;
matrixDiff.rl = d2 ;
List diff = new ArrayList();
//now the rest
Set intersection = SetOperations.set_i( left.keys.keySet(), right.keys.keySet() );
for ( Object i : intersection ){
List<Integer> l = left.keys.get(i);
List<Integer> r = right.keys.get(i);
if ( l.size() != r.size() && l.size() != 1 ){
throw new Exception("After Keying, multiple rows with same key! did you forget aggregate?");
}
int lIndex = l.get(0) ;
int rIndex = r.get(0) ;
Tuple L = left.tuple(lIndex);
Tuple R = right.tuple(rIndex);
if ( anon != null ){
Object context = new Object[]{ left, right };
Object cur = new Object[]{ L, R };
Object index = new Object[] { lIndex , rIndex };
anon.setIterationContext(context,cur,index);
Object ret = anon.execute();
if ( !TypeUtility.castBoolean(ret,false)){
// log it
diff.add( new Object[] { left.rows.get(lIndex) , right.rows.get(rIndex) } );
}
}else{
Set colIntersect = SetOperations.set_i(L.names.keySet(), R.names.keySet() );
for ( Object c : colIntersect ){
Object valLeft = L.get(c.toString());
Object valRight = R.get(c.toString());
if ( !Objects.equals(valLeft , valRight )){
diff.add( new Object[] { left.rows.get(lIndex) , right.rows.get(rIndex) } );
}
}
}
}
matrixDiff.id = diff ;
return matrixDiff;
}
/**
* Matrix diff, generates a MatrixDiff structure
* @param args (anonymous param , matrix2), or ( matrix2 )
* @return a matrix diff
* @throws Exception in error
*/
public MatrixDiff diff(Object... args) throws Exception {
if ( args.length == 0 ) return null;
if (args.length == 1 && args[0] instanceof DataMatrix ){
return diff2(this, args[0]);
}
return diff2(args[0],this,args[1]);
}
}