package hu.u_szeged.nlp.pos; import java.io.BufferedReader; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import edu.stanford.nlp.io.IOUtils; public class Util { /** * adott szo csak irasjeleket tartalmaz-e */ public static boolean isPunctation(String spelling) { for (int i = 0; i < spelling.length(); ++i) { if (Character.isLetterOrDigit(spelling.charAt(i))) { return false; } } return true; } /** * 16 15-18 minden szam < 32 */ public static boolean isDate(String spelling) { for (String s : spelling.split("-")) { if (Integer.parseInt(s) > 31) { return false; } } return true; } static Map<String, Set<MorAna>> readCorpus(String file) { BufferedReader reader = null; String line = null; Set<MorAna> morAnas = null; String[] splitted = null; Map<String, Set<MorAna>> corpus = null; corpus = new TreeMap<String, Set<MorAna>>(); try { reader = new BufferedReader(new InputStreamReader(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(file), "UTF-8")); while ((line = reader.readLine()) != null) { morAnas = new TreeSet<MorAna>(); splitted = line.split("\t"); for (int i = 1; i < splitted.length - 1; i++) { morAnas.add(new MorAna(splitted[i], splitted[i + 1])); i++; } corpus.put(splitted[0], morAnas); } reader.close(); } catch (IOException e) { e.printStackTrace(); } return corpus; } static Map<String, Integer> readFrequencies(String file) { BufferedReader reader = null; String line = null; String[] splitted = null; Map<String, Integer> frequencies = null; frequencies = new TreeMap<String, Integer>(); try { reader = new BufferedReader(new InputStreamReader(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(file), "UTF-8")); while ((line = reader.readLine()) != null) { splitted = line.split("\t"); frequencies.put(splitted[0], Integer.parseInt(splitted[1])); } } catch (Exception e) { e.printStackTrace(); } return frequencies; } public static Set<String> loadPunctations() { Set<String> punctations = null; punctations = new HashSet<String>(); String[] puncts = { "!", ",", "-", ".", ":", ";", "?", "–" }; for (String punct : puncts) { punctations.add(punct); } return punctations; } public static Set<String> loadMorPhonDir() { Set<String> morPhonDir = null; morPhonDir = new HashSet<String>(); String[] morPhons = new String[] { "talány", "némber", "sün", "fal", "holló", "felhő", "kalap", "hely", "köd" }; for (String morPhon : morPhons) { morPhonDir.add(morPhon); } return morPhonDir; } public static Map<String, String> readCorrDic(String file) { BufferedReader reader = null; String line = null; String[] splitted = null; Map<String, String> dictionary = null; dictionary = new TreeMap<String, String>(); try { reader = new BufferedReader(new InputStreamReader(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(file), "UTF-8")); while ((line = reader.readLine()) != null) { splitted = line.split("\t"); dictionary.put(splitted[0], splitted[1]); } } catch (UnsupportedEncodingException e) { e.printStackTrace(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return dictionary; } }