/*
* Copyright (c) 2008 Mozilla Foundation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
package nu.validator.batchresearch;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
/**
 * Command-line tool that tallies how often each message occurs in a
 * gzip-compressed, tab-separated log and writes the relative frequencies to a
 * gzip-compressed report.
 *
 * <p>Each input line is expected to hold exactly four tab-separated fields:
 * URL, mode, phase and message. Lines whose mode is {@code "Q"} are skipped.
 * Consecutive lines with the same URL are treated as one document; the
 * messages of a document are tallied only when the first (non-{@code "Q"})
 * message seen for that URL is {@code "NO PARSE ERRORS"}.
 */
public class Analyze {

    /**
     * Runs the analysis.
     *
     * <p>The input is read {@code runs} times. In pass {@code i} only the
     * messages whose hash code falls into bucket {@code i} (modulo
     * {@code runs}) are tallied, so every message is handled by exactly one
     * pass (presumably to bound the size of the in-memory map; confirm with
     * the original author).
     *
     * @param args {@code args[0]} number of passes over the input,
     *             {@code args[1]} path of the gzip-compressed input file,
     *             {@code args[2]} path of the gzip-compressed output file
     * @throws Exception if the arguments are missing or malformed, or if the
     *                   input or output file cannot be opened
     */
    public static void main(String[] args) throws Exception {
        // Parse the pass count first so a malformed argument does not leave
        // an open output stream behind.
        int runs = Integer.parseInt(args[0]);
        PrintWriter output = new PrintWriter(new OutputStreamWriter(
                new GZIPOutputStream(new FileOutputStream(args[2])),
                "ISO-8859-1"));
        try {
            for (int i = 0; i < runs; i++) {
                Map<String, IntWrap> theDict = new HashMap<String, IntWrap>();
                // Per-pass counters: the number of documents is the
                // denominator of this pass's ratios and must not accumulate
                // across passes.
                int count = 0;
                long length = 0;
                BufferedReader input = new BufferedReader(new InputStreamReader(
                        new GZIPInputStream(new FileInputStream(args[1])),
                        "ISO-8859-1"));
                try {
                    String prevUrl = null;
                    String line = null;
                    boolean collect = false;
                    while ((line = input.readLine()) != null) {
                        length += line.length();
                        String[] s = line.split("\t");
                        if (s.length != 4) {
                            // Malformed record: report it together with the
                            // number of characters read so far in this pass
                            // and stop reading.
                            System.out.printf("Bad line: %s\n", line);
                            System.out.printf("Length: %d\n", length);
                            break;
                        }
                        String url = s[0];
                        String mode = s[1];
                        String message = s[3];
                        if ("Q".equals(mode)) {
                            continue;
                        }
                        if (!url.equals(prevUrl)) {
                            // First relevant line of a new document.
                            count++;
                            prevUrl = url;
                            collect = message.equals("NO PARSE ERRORS");
                            if (collect) {
                                System.out.println(url);
                            }
                        }
                        if (!collect) {
                            continue;
                        }
                        // Java's % keeps the sign of the dividend, so the
                        // remainder is normalised into [0, runs); otherwise
                        // messages with a negative hash code would never
                        // match any pass index.
                        int bucket = message.hashCode() % runs;
                        if (bucket < 0) {
                            bucket += runs;
                        }
                        if (bucket == i) {
                            IntWrap wrap = theDict.get(message);
                            if (wrap == null) {
                                // NOTE(review): relies on IntWrap's initial
                                // value representing the first occurrence;
                                // confirm against IntWrap.
                                theDict.put(message, new IntWrap());
                            } else {
                                wrap.increment();
                            }
                        }
                    }
                } catch (IOException e) {
                    // Best effort: report the failure and still emit whatever
                    // was tallied before the error.
                    e.printStackTrace();
                } finally {
                    try {
                        input.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                for (Map.Entry<String, IntWrap> entry : theDict.entrySet()) {
                    double ratio = ((double) entry.getValue().getValue())
                            / ((double) count);
                    // Only report messages above the 0.01% threshold.
                    if (ratio > 0.0001) {
                        output.printf("%6.4f\t%s\n", ratio, entry.getKey());
                    }
                }
            }
        } finally {
            // Close once, after all passes; closing inside the pass loop
            // would silently discard the results of every later pass.
            output.close();
        }
    }
}