package edu.stanford.nlp.ie.util; import edu.stanford.nlp.util.logging.Redwood; import java.util.*; import java.io.*; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.util.Generics; /** * A script that goes through a data file and looks for instances * where place, place should have the , tagged as well. * * @author jrfinkel */ public class FixLocation { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(FixLocation.class); public static final BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); static String inputFilename = null; static String outputFilename = null; private FixLocation() {} // static class public static void main(String[] args) throws Exception { if (args.length < 1) { log.info("Input filename?"); inputFilename = in.readLine(); } else { inputFilename = args[0]; } if (args.length < 2) { log.info("Output filename?"); outputFilename = in.readLine(); } else { outputFilename = args[1]; } String[][] cols = readFile(inputFilename); fix(cols); print(cols); } public static String[][] readFile(String filename) throws Exception { String file = IOUtils.slurpFile(filename); String[] lines = file.split("\n"); String[][] cols = new String[lines.length][]; for (int i = 0; i < lines.length; i++) { cols[i] = lines[i].split("\\s+"); } return cols; } public static void fix(String[][] cols) throws Exception { for (int i = 1; i < cols.length-1; i++) { if (cols[i-1].length < 2) { continue; } if (cols[i].length < 2) { continue; } if (cols[i+1].length < 2) { continue; } String prevLabel = cols[i-1][1]; String curWord = cols[i][0]; String nextLabel = cols[i+1][1]; if (prevLabel.equals("LOCATION") && nextLabel.equals("LOCATION") && curWord.equals(",")) { query(cols, i); } } } public static BufferedReader answers; static { try { answers = new BufferedReader(new FileReader("answers")); } catch (Exception e) {} } private static Map<String,String> cache = Generics.newHashMap(); public static void query(String[][] cols, int pos) throws Exception { String pre = ""; if (cols[pos-1][0].matches("[-A-Z]*")) { cols[pos][1] = "LOCATION"; return; } for (int i = pos-1; i >= 0 && cols[i].length >= 2; i--) { if (cols[i][1].equals("LOCATION")) { if (pre.equals("")) { pre = cols[i][0]; } else { pre = cols[i][0] + " " + pre; } } else { break; } } String post = ""; for (int i = pos+1; i < cols.length && cols[i].length >= 2; i++) { if (cols[i][1].equals("LOCATION")) { if (post.equals("")) { post = cols[i][0]; } else { post = post + " " + cols[i][0]; } } else { break; } } String ans = (answers == null) ? "": answers.readLine(); String loc = pre+","+post+" ?"; log.info(loc); if (ans.equals(loc)) { String response = answers.readLine(); log.info(response); if (ans.equalsIgnoreCase("Y")) { cols[pos][1] = "LOCATION"; } } else { ans = cache.get(loc); if (ans == null) { if (in.readLine().equalsIgnoreCase("Y")) { cache.put(loc, "Y"); cols[pos][1] = "LOCATION"; } else { cache.put(loc, "N"); } } else if (ans.equalsIgnoreCase("Y")) { cols[pos][1] = "LOCATION"; log.info("Y"); } } } public static void print(String[][] cols) throws Exception { BufferedWriter out = new BufferedWriter(new FileWriter(outputFilename)); for (String[] col : cols) { if (col.length >= 2) { out.write(col[0] + "\t" + col[1] + "\n"); } else { out.write("\n"); } } out.flush(); out.close(); } }