/*******************************************************************************
* Copyright 2014 Miami-Dade County
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.sharegov.cirm.utils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Finds all matches of a regular expression in a text file and reports how often each
 * distinct match occurs.
 * <p>
 * When run from the command line, the result is printed to standard output as a
 * tab-separated {@code Match<TAB>Count} table, one row per distinct match, ordered by the
 * natural (lexicographic) order of the matched strings.
 * <p>
 * Usage: {@code UniqueRegexpFileSearch [file [regexp]]}. Arguments that are omitted fall
 * back to {@link #FILE} and {@link #REGEXP}.
 *
 * @author Thomas Hilpold
 */
public class UniqueRegexpFileSearch
{
    /** Default file to search when no file argument is given on the command line. */
    public static String FILE = "C:/work/cirmservices/src/ontology/csr.owl";

    /**
     * Default regular expression used when no pattern argument is given on the command line.
     * Another pattern used previously was {@code "\"([0-9]+)\""} (digit sequences in double quotes).
     */
    public static String REGEXP = "(CM[0-9]{3,8})";

    /**
     * Command line entry point: searches a file and prints the match/count table to standard output.
     *
     * @param argv optional arguments: {@code argv[0]} is the file to search (default {@link #FILE}),
     *             {@code argv[1]} is the regular expression (default {@link #REGEXP})
     * @throws IllegalStateException if the selected file cannot be read
     * @throws java.util.regex.PatternSyntaxException if the regular expression is invalid
     */
    public static void main(String[] argv)
    {
        File f = new File(argv.length > 0 ? argv[0] : FILE);
        if (!f.canRead())
        {
            throw new IllegalStateException("Cannot read: " + f.getAbsolutePath());
        }
        Pattern pattern = Pattern.compile(argv.length > 1 ? argv[1] : REGEXP);
        UniqueRegexpFileSearch us = new UniqueRegexpFileSearch();
        // Report the pattern that is actually applied, which may come from argv[1] rather than REGEXP.
        System.out.println("UniqueRegexpFileSearch for all unique matches for " + pattern.pattern()
                + " in file " + f.getAbsolutePath());
        SortedMap<String, Integer> results = us.find(pattern, f);
        System.out.println("Match\tCount");
        for (Map.Entry<String, Integer> result : results.entrySet())
        {
            System.out.println(result.getKey() + "\t" + result.getValue());
        }
    }

    /**
     * Finds all matches for the regexp pattern in the file and counts them.
     * <p>
     * The file is read line by line using the JVM's default charset, so a single match can
     * never span more than one line. The whole matched text ({@link Matcher#group()}) is used
     * as the key, regardless of any capturing groups in the pattern.
     *
     * @param pattern the compiled regular expression to search for
     * @param file the file to search
     * @return a sorted map from match (sorted by natural string order) to the number of
     *         occurrences of that match; empty if nothing matched
     * @throws IllegalArgumentException if the file cannot be read ({@link File#canRead()} is false)
     * @throws RuntimeException wrapping the {@link IOException} if reading or closing the file fails
     */
    public SortedMap<String, Integer> find(Pattern pattern, File file)
    {
        if (!file.canRead())
        {
            throw new IllegalArgumentException("Cannot read: " + file);
        }
        SortedMap<String, Integer> results = new TreeMap<String, Integer>();
        try
        {
            BufferedReader reader = new BufferedReader(new FileReader(file));
            try
            {
                String line;
                while ((line = reader.readLine()) != null)
                {
                    Matcher matcher = pattern.matcher(line);
                    while (matcher.find())
                    {
                        String match = matcher.group();
                        Integer count = results.get(match);
                        results.put(match, count == null ? 1 : count + 1);
                    }
                }
            }
            finally
            {
                // Always release the file handle, also when readLine() fails midway.
                reader.close();
            }
            return results;
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
    }
}