/*
* Apache License
* Version 2.0, January 2004
* http://www.apache.org/licenses/
*
* Copyright 2013 Aurelian Tutuianu
* Copyright 2014 Aurelian Tutuianu
* Copyright 2015 Aurelian Tutuianu
* Copyright 2016 Aurelian Tutuianu
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package rapaio.experiment;
import rapaio.core.tools.DVector;
import rapaio.data.*;
import rapaio.data.stream.VSpot;
import rapaio.io.Csv;
import rapaio.sys.WS;
import java.io.IOException;
import java.util.*;
public class FrameAnalysis {
private final int chunkSize = 50;
private final int bins = 256;
private final int maxVars = Integer.MAX_VALUE;
private final int targetIndex = 1933;
public Frame buildCsvFrame(Csv csv, String file) throws IOException {
String[] varNames = csv.withEndRow(1000).read(file).varNames();
csv.withEndRow(Integer.MAX_VALUE);
int start = 0;
Nominal name = Nominal.empty().withName("name");
Nominal type = Nominal.empty().withName("type");
Index count = Index.empty().withName("count");
Index missing = Index.empty().withName("missing");
List<Var> h1 = new ArrayList<>();
List<Var> h2 = new ArrayList<>();
for (int i = 0; i < bins; i++) {
h1.add(Numeric.empty().withName("h1_" + i));
h2.add(Numeric.empty().withName("h2_" + i));
}
Var target = csv.withSkipCols(n -> n != targetIndex).read(file).var(0);
while (start < varNames.length && start < maxVars) {
int pos = start;
csv.withSkipCols(n -> !((n >= pos && n < pos + chunkSize && n < varNames.length && n < maxVars)));
Frame vs = csv.read(file);
vs.varStream().forEach(var -> {
WS.println(var.name());
name.addLabel(var.name());
type.addLabel(var.type().name());
int countValue;
switch (var.type()) {
case NOMINAL:
case ORDINAL:
countValue = var.levels().length;
break;
case INDEX:
case BINARY:
countValue = (int) var.stream().mapToInt().distinct().count();
break;
case STAMP:
countValue = (int) var.stream().mapToLong(VSpot::stamp).distinct().count();
break;
case NUMERIC:
countValue = (int) var.stream().mapToDouble().distinct().count();
break;
default:
countValue = (int) var.stream().mapToString().distinct().count();
}
count.addIndex(countValue);
missing.addIndex((int) var.stream().incomplete().count());
double[] h1v = new double[bins];
double[] h2v = new double[bins];
double[][] h = new double[][]{h1v, h2v};
switch (var.type()) {
case BINARY:
var.stream().complete().forEach(s -> h[target.index(s.row()) - 1][s.index()]++);
break;
case INDEX:
case NUMERIC:
case ORDINAL:
double min = var.stream().complete().mapToDouble().min().getAsDouble();
double max = var.stream().complete().mapToDouble().max().getAsDouble();
double step = (max - min) / bins;
var.stream().complete().forEach(s -> {
int bin = (int) Math.floor((s.value() - min) / step);
if (bin == bins)
bin--;
h[target.index(s.row()) - 1][bin]++;
});
break;
case STAMP:
long min2 = var.stream().complete().mapToLong(VSpot::stamp).min().getAsLong();
long max2 = var.stream().complete().mapToLong(VSpot::stamp).max().getAsLong();
long step2 = (max2 - min2) / bins;
var.stream().complete().forEach(s -> {
int bin = (int) Math.floor((s.value() - min2) / step2);
if (bin == bins)
bin--;
h[target.index(s.row()) - 1][bin]++;
});
break;
case NOMINAL:
DVector dv1 = DVector.fromCount(false, var.stream().complete().filter(s -> target.index(s.row()) == 1).toMappedVar());
double[] v1 = dv1.streamValues().skip(1).sorted().toArray();
for (int i = 0; i < v1.length; i++) {
h[0][i < bins ? i : bins - 1] += v1[i];
}
DVector dv2 = DVector.fromCount(false, var.stream().complete().filter(s -> target.index(s.row()) == 2).toMappedVar());
double[] v2 = dv2.streamValues().skip(1).sorted().toArray();
for (int i = 0; i < v1.length; i++) {
h[1][i < bins ? i : bins - 1] += v2[i];
}
break;
default:
HashMap<String, Integer> counts = new HashMap<>();
var.stream().filter(s -> target.index(s.row()) == 1).mapToString().forEach(txt -> {
if (!counts.containsKey(txt))
counts.put(txt, 0);
counts.put(txt, counts.get(txt) + 1);
});
TreeMap<Integer, List<String>> reverse = new TreeMap<>();
counts.entrySet().forEach(e -> {
if (!reverse.containsKey(e.getValue()))
reverse.put(e.getValue(), new ArrayList<>());
reverse.get(e.getValue()).add(e.getKey());
});
int p = 0;
for (Map.Entry<Integer, List<String>> e : reverse.entrySet()) {
for (String key : e.getValue()) {
h[0][p < bins ? p++ : bins - 1] += e.getKey();
}
}
HashMap<String, Integer> counts2 = new HashMap<>();
var.stream().filter(s -> target.index(s.row()) == 1).mapToString().forEach(txt -> {
if (!counts2.containsKey(txt))
counts2.put(txt, 0);
counts2.put(txt, counts2.get(txt) + 1);
});
TreeMap<Integer, List<String>> reverse2 = new TreeMap<>();
counts2.entrySet().forEach(e -> {
if (!reverse2.containsKey(e.getValue()))
reverse2.put(e.getValue(), new ArrayList<>());
reverse2.get(e.getValue()).add(e.getKey());
});
int p2 = 0;
for (Map.Entry<Integer, List<String>> e : reverse2.entrySet()) {
for (String key : e.getValue()) {
h[1][p2 < bins ? p2++ : bins - 1] += e.getKey();
}
}
}
for (int i = 0; i < bins; i++) {
h1.get(i).addValue(h1v[i]);
h2.get(i).addValue(h2v[i]);
}
});
// next chunk
start += chunkSize;
}
List<Var> vars = new ArrayList<>();
vars.add(name);
vars.add(type);
vars.add(count);
vars.add(missing);
vars.addAll(h1);
vars.addAll(h2);
return SolidFrame.byVars(vars);
}
}