package edu.cmu.geolocator.resource.gazindexing; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.nio.CharBuffer; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Scanner; import java.util.Map.Entry; import edu.cmu.geolocator.common.CollectionSorting; import edu.cmu.geolocator.io.GetReader; /** * This function is to generate the most ambiguous K gazetteer entries, according to * allCOuntries.txt other language field, and alternativenames field. * * Method: simply count the number of same word accurance. 1. lowercase 2. count. 3. output K top * words. * * @author indri * */ public class GazStringStat { public static void main(String argv[]) throws IOException { HashMap<String, Long> locations = new HashMap<String, Long>(30000000); Scanner s = new Scanner(new File("GeoNames/allCountries.txt")); String asciiStr = "", temp; s.useDelimiter("\t"); int count = 0; int total = 0; while (s.hasNext()) { total++; if (total % 18000000 == 0) System.out.println(total / 18); temp = s.next(); if (count == 1) { // System.out.println(asciiStr); asciiStr = temp.toLowerCase(); } if (count == 14) { // This is for storing the balanced or population prefered. if (true) if (locations.containsKey(asciiStr)) locations.put(asciiStr, locations.get(asciiStr) + Long.parseLong(temp)); else locations.put(asciiStr, Long.parseLong(temp)); //This is for storing the ambiguity prefered locations. if(false) if (locations.containsKey(asciiStr)) locations.put(asciiStr, locations.get(asciiStr)+1); else locations.put(asciiStr, 1L); } count = count % 18; count++; } ArrayList<Entry<String, Long>> as = new ArrayList<Entry<String, Long>>(locations.entrySet()); ArrayList<Entry<String, Long>> sortedLocations = CollectionSorting.rankIntArray(as); int i = 0; while (i++ < 10000) System.out.println(sortedLocations.get(i)); } }